From b702791c2cdd3af5677fd8a1999714e418724fd3 Mon Sep 17 00:00:00 2001 From: catlog22 Date: Tue, 16 Dec 2025 21:38:27 +0800 Subject: [PATCH] Remove LLM enhancement features and related components as per user request. This includes the deletion of source code files, CLI commands, front-end components, tests, scripts, and documentation associated with LLM functionality. Simplified dependencies and reduced complexity while retaining core vector search capabilities. Validation of changes confirmed successful removal and functionality. --- .claude/agents/cli-explore-agent.md | 2 +- .claude/rules/active_memory.md | 13 - .../dashboard-js/components/cli-status.js | 655 +------- ccw/src/templates/dashboard-js/i18n.js | 50 - .../dashboard-js/views/cli-manager.js | 6 +- codex-lens/docs/CLI_INTEGRATION_SUMMARY.md | 316 ---- .../docs/DOCSTRING_LLM_HYBRID_DESIGN.md | 972 ------------ codex-lens/docs/IMPLEMENTATION_SUMMARY.md | 64 +- codex-lens/docs/LLM_ENHANCED_SEARCH_GUIDE.md | 463 ------ .../docs/LLM_ENHANCEMENT_TEST_RESULTS.md | 232 --- codex-lens/docs/LLM_REMOVAL_SUMMARY.md | 342 +++++ .../docs/MISLEADING_COMMENTS_TEST_RESULTS.md | 301 ---- codex-lens/scripts/compare_search_methods.py | 465 ------ codex-lens/scripts/inspect_llm_summaries.py | 88 -- codex-lens/scripts/show_llm_analysis.py | 112 -- .../scripts/test_misleading_comments.py | 491 ------ codex-lens/src/codexlens/cli/commands.py | 178 --- codex-lens/src/codexlens/semantic/__init__.py | 30 - .../src/codexlens/semantic/llm_enhancer.py | 899 ----------- codex-lens/tests/test_llm_enhanced_search.py | 545 ------- codex-lens/tests/test_llm_enhancer.py | 1344 ----------------- 21 files changed, 375 insertions(+), 7193 deletions(-) delete mode 100644 .claude/rules/active_memory.md delete mode 100644 codex-lens/docs/CLI_INTEGRATION_SUMMARY.md delete mode 100644 codex-lens/docs/DOCSTRING_LLM_HYBRID_DESIGN.md delete mode 100644 codex-lens/docs/LLM_ENHANCED_SEARCH_GUIDE.md delete mode 100644 
codex-lens/docs/LLM_ENHANCEMENT_TEST_RESULTS.md create mode 100644 codex-lens/docs/LLM_REMOVAL_SUMMARY.md delete mode 100644 codex-lens/docs/MISLEADING_COMMENTS_TEST_RESULTS.md delete mode 100644 codex-lens/scripts/compare_search_methods.py delete mode 100644 codex-lens/scripts/inspect_llm_summaries.py delete mode 100644 codex-lens/scripts/show_llm_analysis.py delete mode 100644 codex-lens/scripts/test_misleading_comments.py delete mode 100644 codex-lens/src/codexlens/semantic/llm_enhancer.py delete mode 100644 codex-lens/tests/test_llm_enhanced_search.py delete mode 100644 codex-lens/tests/test_llm_enhancer.py diff --git a/.claude/agents/cli-explore-agent.md b/.claude/agents/cli-explore-agent.md index 7ba68cb1..592f1399 100644 --- a/.claude/agents/cli-explore-agent.md +++ b/.claude/agents/cli-explore-agent.md @@ -85,7 +85,7 @@ MODE: analysis CONTEXT: @**/* EXPECTED: {from prompt} RULES: {from prompt, if template specified} | analysis=READ-ONLY -" --tool gemini --cd {dir} +" --tool gemini --cd {dir} ``` **Fallback Chain**: Gemini → Qwen → Codex → Bash-only diff --git a/.claude/rules/active_memory.md b/.claude/rules/active_memory.md deleted file mode 100644 index a3effdea..00000000 --- a/.claude/rules/active_memory.md +++ /dev/null @@ -1,13 +0,0 @@ -# Active Memory - -> Auto-generated understanding of frequently accessed files using GEMINI. 
-> Last updated: 2025-12-14T08:59:41.526Z -> Files analyzed: 10 -> CLI Tool: gemini - ---- - -[object Object] - ---- - diff --git a/ccw/src/templates/dashboard-js/components/cli-status.js b/ccw/src/templates/dashboard-js/components/cli-status.js index fcf68bb9..a2382d28 100644 --- a/ccw/src/templates/dashboard-js/components/cli-status.js +++ b/ccw/src/templates/dashboard-js/components/cli-status.js @@ -18,15 +18,6 @@ let nativeResumeEnabled = localStorage.getItem('ccw-native-resume') !== 'false'; // Recursive Query settings (for hierarchical storage aggregation) let recursiveQueryEnabled = localStorage.getItem('ccw-recursive-query') !== 'false'; // default true -// LLM Enhancement settings for Semantic Search -let llmEnhancementSettings = { - enabled: localStorage.getItem('ccw-llm-enhancement-enabled') === 'true', - tool: localStorage.getItem('ccw-llm-enhancement-tool') || 'gemini', - fallbackTool: localStorage.getItem('ccw-llm-enhancement-fallback') || 'qwen', - batchSize: parseInt(localStorage.getItem('ccw-llm-enhancement-batch-size') || '5', 10), - timeoutMs: parseInt(localStorage.getItem('ccw-llm-enhancement-timeout') || '300000', 10) -}; - // ========== Initialization ========== function initCliStatus() { // Load all statuses in one call using aggregated endpoint @@ -242,17 +233,12 @@ function renderCliStatus() { `; // Semantic Search card (only show if CodexLens is installed) - const llmStatusBadge = llmEnhancementSettings.enabled - ? `LLM` - : ''; const semanticHtml = codexLensStatus.ready ? ` -
+
Semantic Search AI - ${llmStatusBadge}
${semanticStatus.available ? 'AI-powered code understanding' : 'Natural language code search'} @@ -265,27 +251,17 @@ function renderCliStatus() {
${!semanticStatus.available ? ` - -
-
- - ~130MB -
- +
+ + ~130MB
` : ` -
-
- - bge-small-en-v1.5 -
- +
+ + bge-small-en-v1.5
`}
@@ -991,618 +967,3 @@ async function startSemanticInstall() { } } -// ========== Semantic Search Settings Modal ========== -function openSemanticSettingsModal() { - const availableTools = Object.entries(cliToolStatus) - .filter(function(entry) { return entry[1].available; }) - .map(function(entry) { return entry[0]; }); - - const modal = document.createElement('div'); - modal.id = 'semanticSettingsModal'; - modal.className = 'fixed inset-0 bg-black/50 flex items-center justify-center z-50'; - modal.onclick = function(e) { if (e.target === modal) closeSemanticSettingsModal(); }; - - const toolOptions = availableTools.map(function(tool) { - return ''; - }).join(''); - - const fallbackOptions = '' + availableTools.map(function(tool) { - return ''; - }).join(''); - - const disabled = !llmEnhancementSettings.enabled ? 'disabled' : ''; - const opacityClass = !llmEnhancementSettings.enabled ? 'opacity-50' : ''; - - modal.innerHTML = - '
' + - '
' + - '
' + - '
' + - '' + - '
' + - '
' + - '

' + t('semantic.settings') + '

' + - '

' + t('semantic.configDesc') + '

' + - '
' + - '
' + - '
' + - '
' + - '
' + - '

' + - '' + t('semantic.llmEnhancement') + '

' + - '

' + t('semantic.llmDesc') + '

' + - '
' + - '' + - '
' + - '
' + - '
' + - '
' + - '' + - '' + - '
' + - '
' + - '' + - '' + - '
' + - '
' + - '
' + - '
' + - '' + - '' + - '
' + - '
' + - '' + - '' + - '
' + - '
' + - '
' + - '
' + - '
' + - '' + - '
' + - '

' + t('semantic.enhanceInfo') + '

' + - '

' + t('semantic.enhanceCommand') + ' codex-lens enhance ' + t('semantic.enhanceAfterEnable') + '

' + - '
' + - '
' + - '
' + - '
' + - '' + - '' + - '
' + - '
' + - '
' + - '

' + - '' + t('semantic.testSearch') + '

' + - '
' + - '
' + - '' + - '
' + - '
' + - '' + - '
' + - '' + - '
' + - '
' + - '
' + - '
' + - '
' + - '' + - '
' + - '
'; - - document.body.appendChild(modal); - - // Add semantic search button handler - setTimeout(function() { - var runSemanticSearchBtn = document.getElementById('runSemanticSearchBtn'); - if (runSemanticSearchBtn) { - runSemanticSearchBtn.onclick = async function() { - var query = document.getElementById('semanticSearchInput').value.trim(); - var resultsDiv = document.getElementById('semanticSearchResults'); - var resultCount = document.getElementById('semanticResultCount'); - var resultContent = document.getElementById('semanticResultContent'); - - if (!query) { - showRefreshToast(t('codexlens.enterQuery'), 'warning'); - return; - } - - runSemanticSearchBtn.disabled = true; - runSemanticSearchBtn.innerHTML = '' + t('codexlens.searching') + ''; - resultsDiv.classList.add('hidden'); - - try { - var params = new URLSearchParams({ - query: query, - mode: 'semantic', - limit: '10' - }); - - var response = await fetch('/api/codexlens/search?' + params.toString()); - var result = await response.json(); - - console.log('[Semantic Search Test] Result:', result); - - if (result.success) { - var results = result.results || []; - resultCount.textContent = results.length + ' ' + t('codexlens.resultsCount'); - resultContent.textContent = JSON.stringify(results, null, 2); - resultsDiv.classList.remove('hidden'); - showRefreshToast(t('codexlens.searchCompleted') + ': ' + results.length + ' ' + t('codexlens.resultsCount'), 'success'); - } else { - resultContent.textContent = t('common.error') + ': ' + (result.error || t('common.unknownError')); - resultsDiv.classList.remove('hidden'); - showRefreshToast(t('codexlens.searchFailed') + ': ' + result.error, 'error'); - } - - runSemanticSearchBtn.disabled = false; - runSemanticSearchBtn.innerHTML = ' ' + t('semantic.runSearch'); - if (window.lucide) lucide.createIcons(); - } catch (err) { - console.error('[Semantic Search Test] Error:', err); - resultContent.textContent = t('common.exception') + ': ' + err.message; - 
resultsDiv.classList.remove('hidden'); - showRefreshToast(t('common.error') + ': ' + err.message, 'error'); - runSemanticSearchBtn.disabled = false; - runSemanticSearchBtn.innerHTML = ' ' + t('semantic.runSearch'); - if (window.lucide) lucide.createIcons(); - } - }; - } - }, 100); - - var handleEscape = function(e) { - if (e.key === 'Escape') { - closeSemanticSettingsModal(); - document.removeEventListener('keydown', handleEscape); - } - }; - document.addEventListener('keydown', handleEscape); - - if (window.lucide) { - lucide.createIcons(); - } -} - -function closeSemanticSettingsModal() { - var modal = document.getElementById('semanticSettingsModal'); - if (modal) modal.remove(); -} - -function toggleLlmEnhancement(enabled) { - llmEnhancementSettings.enabled = enabled; - localStorage.setItem('ccw-llm-enhancement-enabled', enabled.toString()); - - var settingsSection = document.getElementById('llmSettingsSection'); - if (settingsSection) { - settingsSection.classList.toggle('opacity-50', !enabled); - settingsSection.querySelectorAll('select').forEach(function(el) { el.disabled = !enabled; }); - } - - renderCliStatus(); - showRefreshToast(t('semantic.llmEnhancement') + ' ' + (enabled ? 
t('semantic.enabled') : t('semantic.disabled')), 'success'); -} - -function updateLlmTool(tool) { - llmEnhancementSettings.tool = tool; - localStorage.setItem('ccw-llm-enhancement-tool', tool); - showRefreshToast(t('semantic.toolSetTo') + ' ' + tool, 'success'); -} - -function updateLlmFallback(tool) { - llmEnhancementSettings.fallbackTool = tool; - localStorage.setItem('ccw-llm-enhancement-fallback', tool); - showRefreshToast(t('semantic.fallbackSetTo') + ' ' + (tool || t('semantic.none')), 'success'); -} - -function updateLlmBatchSize(size) { - llmEnhancementSettings.batchSize = parseInt(size, 10); - localStorage.setItem('ccw-llm-enhancement-batch-size', size); - showRefreshToast(t('semantic.batchSetTo') + ' ' + size + ' ' + t('semantic.files'), 'success'); -} - -function updateLlmTimeout(ms) { - llmEnhancementSettings.timeoutMs = parseInt(ms, 10); - localStorage.setItem('ccw-llm-enhancement-timeout', ms); - var mins = parseInt(ms, 10) / 60000; - showRefreshToast(t('semantic.timeoutSetTo') + ' ' + mins + ' ' + (mins > 1 ? 
t('semantic.minutes') : t('semantic.minute')), 'success'); -} - -async function runEnhanceCommand() { - if (!llmEnhancementSettings.enabled) { - showRefreshToast(t('semantic.enableFirst'), 'warning'); - return; - } - - showRefreshToast('Starting LLM enhancement...', 'info'); - closeSemanticSettingsModal(); - - try { - var response = await fetch('/api/codexlens/enhance', { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ - path: projectPath, - tool: llmEnhancementSettings.tool, - batchSize: llmEnhancementSettings.batchSize, - timeoutMs: llmEnhancementSettings.timeoutMs - }) - }); - - var result = await response.json(); - if (result.success) { - var enhanced = result.result?.enhanced || 0; - showRefreshToast('Enhanced ' + enhanced + ' files with LLM', 'success'); - } else { - showRefreshToast('Enhance failed: ' + result.error, 'error'); - } - } catch (err) { - showRefreshToast('Enhance error: ' + err.message, 'error'); - } -} - -function viewEnhanceStatus() { - openSemanticMetadataViewer(); -} - -// ========== Semantic Metadata Viewer ========== -var semanticMetadataCache = { - entries: [], - total: 0, - offset: 0, - limit: 50, - loading: false -}; - -async function openSemanticMetadataViewer() { - closeSemanticSettingsModal(); - - var modal = document.createElement('div'); - modal.id = 'semanticMetadataModal'; - modal.className = 'generic-modal-overlay'; - modal.onclick = function(e) { if (e.target === modal) closeSemanticMetadataViewer(); }; - - modal.innerHTML = - '
' + - '
' + - '
' + - '' + - '

Semantic Metadata Browser

' + - 'Loading...' + - '
' + - '' + - '
' + - '
' + - '
' + - '
' + - '' + - '' + - '
' + - '
' + - '-' + - '
' + - '
' + - '
' + - '
' + - '
' + - 'Loading metadata...' + - '
' + - '
' + - '' + - '
' + - '
'; - - document.body.appendChild(modal); - - requestAnimationFrame(function() { - modal.classList.add('active'); - }); - - var handleEscape = function(e) { - if (e.key === 'Escape') { - closeSemanticMetadataViewer(); - document.removeEventListener('keydown', handleEscape); - } - }; - document.addEventListener('keydown', handleEscape); - - if (window.lucide) { - lucide.createIcons(); - } - - await loadSemanticMetadata(); -} - -function closeSemanticMetadataViewer() { - var modal = document.getElementById('semanticMetadataModal'); - if (modal) { - modal.classList.remove('active'); - setTimeout(function() { modal.remove(); }, 200); - } -} - -async function loadSemanticMetadata(offset, toolFilter) { - offset = typeof offset === 'number' ? offset : semanticMetadataCache.offset; - toolFilter = toolFilter !== undefined ? toolFilter : (document.getElementById('semanticToolFilter')?.value || ''); - - semanticMetadataCache.loading = true; - - var container = document.getElementById('semanticMetadataTableContainer'); - if (container) { - container.innerHTML = - '
' + - '
' + - 'Loading metadata...' + - '
'; - } - - try { - var url = '/api/codexlens/semantic/metadata?offset=' + offset + '&limit=' + semanticMetadataCache.limit; - if (toolFilter) { - url += '&tool=' + encodeURIComponent(toolFilter); - } - - var response = await fetch(url); - var data = await response.json(); - - if (data.success && data.result) { - semanticMetadataCache.entries = data.result.entries || []; - semanticMetadataCache.total = data.result.total || 0; - semanticMetadataCache.offset = offset; - - renderSemanticMetadataTable(); - updateSemanticPagination(); - } else { - container.innerHTML = - '
' + - '' + - '

Error loading metadata: ' + (data.error || 'Unknown error') + '

' + - '
'; - if (window.lucide) lucide.createIcons(); - } - } catch (err) { - container.innerHTML = - '
' + - '' + - '

Error: ' + err.message + '

' + - '
'; - if (window.lucide) lucide.createIcons(); - } - - semanticMetadataCache.loading = false; -} - -function escapeHtmlSemantic(text) { - if (!text) return ''; - var div = document.createElement('div'); - div.textContent = text; - return div.innerHTML; -} - -function renderSemanticMetadataTable() { - var container = document.getElementById('semanticMetadataTableContainer'); - if (!container) return; - - var entries = semanticMetadataCache.entries; - - if (!entries.length) { - container.innerHTML = - '
' + - '' + - '

No semantic metadata found

' + - '

Run \'codex-lens enhance\' to generate metadata for indexed files.

' + - '' + - '
'; - if (window.lucide) lucide.createIcons(); - return; - } - - var rows = entries.map(function(entry, idx) { - var keywordsHtml = (entry.keywords || []).slice(0, 4).map(function(k) { - return '' + escapeHtmlSemantic(k) + ''; - }).join(''); - if ((entry.keywords || []).length > 4) { - keywordsHtml += '+' + (entry.keywords.length - 4) + ''; - } - - var date = entry.generated_at ? new Date(entry.generated_at * 1000).toLocaleDateString() : '-'; - - return ( - '' + - '' + - '
' + - '' + - '' + escapeHtmlSemantic(entry.file_name || '-') + '' + - '
' + - '
' + - escapeHtmlSemantic(entry.full_path || '-') + - '
' + - '' + - '' + escapeHtmlSemantic(entry.language || '-') + '' + - '' + escapeHtmlSemantic((entry.purpose || '-').substring(0, 50)) + - ((entry.purpose || '').length > 50 ? '...' : '') + '' + - '' + (keywordsHtml || '-') + '' + - '' + - '' + - escapeHtmlSemantic(entry.llm_tool || '-') + - '' + - '' + - '' + date + '' + - '' + - '' + - '' + - '
' + - '
' + - '

Summary

' + - '

' + escapeHtmlSemantic(entry.summary || 'No summary available') + '

' + - '
' + - '
' + - '

All Keywords

' + - '
' + - (entry.keywords || []).map(function(k) { - return '' + escapeHtmlSemantic(k) + ''; - }).join('') + - '
' + - '
' + - '
' + - ' ' + (entry.line_count || 0) + ' lines' + - ' ' + escapeHtmlSemantic(entry.llm_tool || 'Unknown') + '' + - ' ' + date + '' + - '
' + - '
' + - '' + - '' - ); - }).join(''); - - container.innerHTML = - '' + - '' + - '' + - '' + - '' + - '' + - '' + - '' + - '' + - '' + - '' + - '' + rows + '' + - '
FileLanguagePurposeKeywordsToolDate
'; - - if (window.lucide) lucide.createIcons(); -} - -function toggleSemanticDetail(idx) { - var detailRow = document.getElementById('semanticDetail' + idx); - if (detailRow) { - detailRow.classList.toggle('hidden'); - if (window.lucide) lucide.createIcons(); - } -} - -function updateSemanticPagination() { - var total = semanticMetadataCache.total; - var offset = semanticMetadataCache.offset; - var limit = semanticMetadataCache.limit; - var entries = semanticMetadataCache.entries; - - var countBadge = document.getElementById('semanticMetadataCount'); - if (countBadge) { - countBadge.textContent = total + ' entries'; - } - - var paginationInfo = document.getElementById('semanticPaginationInfo'); - if (paginationInfo) { - if (total > 0) { - paginationInfo.textContent = (offset + 1) + '-' + (offset + entries.length) + ' of ' + total; - } else { - paginationInfo.textContent = 'No entries'; - } - } - - var pageSelect = document.getElementById('semanticPageSelect'); - if (pageSelect) { - var totalPages = Math.ceil(total / limit) || 1; - var currentPage = Math.floor(offset / limit); - - pageSelect.innerHTML = ''; - for (var i = 0; i < totalPages; i++) { - var opt = document.createElement('option'); - opt.value = i; - opt.textContent = i + 1; - if (i === currentPage) opt.selected = true; - pageSelect.appendChild(opt); - } - } - - var prevBtn = document.getElementById('semanticPrevBtn'); - var nextBtn = document.getElementById('semanticNextBtn'); - if (prevBtn) prevBtn.disabled = offset === 0; - if (nextBtn) nextBtn.disabled = offset + limit >= total; -} - -function semanticPrevPage() { - if (semanticMetadataCache.offset > 0) { - loadSemanticMetadata(Math.max(0, semanticMetadataCache.offset - semanticMetadataCache.limit)); - } -} - -function semanticNextPage() { - if (semanticMetadataCache.offset + semanticMetadataCache.limit < semanticMetadataCache.total) { - loadSemanticMetadata(semanticMetadataCache.offset + semanticMetadataCache.limit); - } -} - -function 
semanticGoToPage(pageIndex) { - var offset = parseInt(pageIndex, 10) * semanticMetadataCache.limit; - loadSemanticMetadata(offset); -} - -function filterSemanticByTool(tool) { - loadSemanticMetadata(0, tool); -} - -function refreshSemanticMetadata() { - loadSemanticMetadata(semanticMetadataCache.offset); -} - -function getLlmEnhancementSettings() { - return Object.assign({}, llmEnhancementSettings); -} diff --git a/ccw/src/templates/dashboard-js/i18n.js b/ccw/src/templates/dashboard-js/i18n.js index d11ad4a8..f3fbb28e 100644 --- a/ccw/src/templates/dashboard-js/i18n.js +++ b/ccw/src/templates/dashboard-js/i18n.js @@ -277,35 +277,10 @@ const i18n = { // Semantic Search Configuration 'semantic.settings': 'Semantic Search Settings', - 'semantic.configDesc': 'Configure LLM enhancement for semantic indexing', - 'semantic.llmEnhancement': 'LLM Enhancement', - 'semantic.llmDesc': 'Use LLM to generate code summaries for better semantic search', - 'semantic.primaryTool': 'Primary LLM Tool', - 'semantic.fallbackTool': 'Fallback Tool', - 'semantic.batchSize': 'Batch Size', - 'semantic.timeout': 'Timeout', - 'semantic.file': 'file', - 'semantic.files': 'files', - 'semantic.enhanceInfo': 'LLM enhancement generates code summaries and keywords for each file, improving semantic search accuracy.', - 'semantic.enhanceCommand': 'Run', - 'semantic.enhanceAfterEnable': 'after enabling to process existing files.', - 'semantic.runEnhanceNow': 'Run Enhance Now', - 'semantic.viewStatus': 'View Status', 'semantic.testSearch': 'Test Semantic Search', 'semantic.searchPlaceholder': 'Enter semantic query (e.g., authentication logic, error handling)', 'semantic.runSearch': 'Run Semantic Search', 'semantic.close': 'Close', - 'semantic.enabled': 'enabled', - 'semantic.disabled': 'disabled', - 'semantic.toolSetTo': 'Primary LLM tool set to', - 'semantic.fallbackSetTo': 'Fallback tool set to', - 'semantic.none': 'none', - 'semantic.llmEnhancement': 'LLM Enhancement', - 'semantic.batchSetTo': 'Batch 
size set to', - 'semantic.timeoutSetTo': 'Timeout set to', - 'semantic.minute': 'minute', - 'semantic.minutes': 'minutes', - 'semantic.enableFirst': 'Please enable LLM Enhancement first', 'cli.settings': 'CLI Execution Settings', 'cli.promptFormat': 'Prompt Format', @@ -1407,35 +1382,10 @@ const i18n = { // Semantic Search 配置 'semantic.settings': '语义搜索设置', - 'semantic.configDesc': '配置语义索引的 LLM 增强功能', - 'semantic.llmEnhancement': 'LLM 增强', - 'semantic.llmDesc': '使用 LLM 生成代码摘要以改进语义搜索', - 'semantic.primaryTool': '主 LLM 工具', - 'semantic.fallbackTool': '备用工具', - 'semantic.batchSize': '批处理大小', - 'semantic.timeout': '超时时间', - 'semantic.file': '个文件', - 'semantic.files': '个文件', - 'semantic.enhanceInfo': 'LLM 增强为每个文件生成代码摘要和关键词,提高语义搜索准确度。', - 'semantic.enhanceCommand': '运行', - 'semantic.enhanceAfterEnable': '启用后处理现有文件。', - 'semantic.runEnhanceNow': '立即运行增强', - 'semantic.viewStatus': '查看状态', 'semantic.testSearch': '测试语义搜索', 'semantic.searchPlaceholder': '输入语义查询(例如:身份验证逻辑、错误处理)', 'semantic.runSearch': '运行语义搜索', 'semantic.close': '关闭', - 'semantic.enabled': '已启用', - 'semantic.disabled': '已禁用', - 'semantic.toolSetTo': '主 LLM 工具已设置为', - 'semantic.fallbackSetTo': '备用工具已设置为', - 'semantic.none': '无', - 'semantic.llmEnhancement': 'LLM 增强', - 'semantic.batchSetTo': '批量大小已设置为', - 'semantic.timeoutSetTo': '超时已设置为', - 'semantic.minute': '分钟', - 'semantic.minutes': '分钟', - 'semantic.enableFirst': '请先启用 LLM 增强', 'cli.settings': 'CLI 调用设置', 'cli.promptFormat': '提示词格式', diff --git a/ccw/src/templates/dashboard-js/views/cli-manager.js b/ccw/src/templates/dashboard-js/views/cli-manager.js index 6905b042..4180a21c 100644 --- a/ccw/src/templates/dashboard-js/views/cli-manager.js +++ b/ccw/src/templates/dashboard-js/views/cli-manager.js @@ -397,13 +397,11 @@ function renderToolsSection() { // Semantic Search item (only show if CodexLens is installed) var semanticHtml = ''; if (codexLensStatus.ready) { - semanticHtml = '
' + + semanticHtml = '
' + '
' + '' + '
' + - '
Semantic Search AI' + - (llmEnhancementSettings.enabled ? 'LLM' : '') + - '
' + + '
Semantic Search AI
' + '
' + (semanticStatus.available ? 'AI-powered code understanding' : 'Natural language code search') + '
' + '
' + '
' + diff --git a/codex-lens/docs/CLI_INTEGRATION_SUMMARY.md b/codex-lens/docs/CLI_INTEGRATION_SUMMARY.md deleted file mode 100644 index 793d922f..00000000 --- a/codex-lens/docs/CLI_INTEGRATION_SUMMARY.md +++ /dev/null @@ -1,316 +0,0 @@ -# CLI Integration Summary - Embedding Management - -**Date**: 2025-12-16 -**Version**: v0.5.1 -**Status**: ✅ Complete - ---- - -## Overview - -Completed integration of embedding management commands into the CodexLens CLI, making vector search functionality more accessible and user-friendly. Users no longer need to run standalone scripts - all embedding operations are now available through simple CLI commands. - -## What Changed - -### 1. New CLI Commands - -#### `codexlens embeddings-generate` - -**Purpose**: Generate semantic embeddings for code search - -**Features**: -- Accepts project directory or direct `_index.db` path -- Auto-finds index for project paths using registry -- Supports 4 model profiles (fast, code, multilingual, balanced) -- Force regeneration with `--force` flag -- Configurable chunk size -- Verbose mode with progress updates -- JSON output mode for scripting - -**Examples**: -```bash -# Generate embeddings for a project -codexlens embeddings-generate ~/projects/my-app - -# Use specific model -codexlens embeddings-generate ~/projects/my-app --model fast - -# Force regeneration -codexlens embeddings-generate ~/projects/my-app --force - -# Verbose output -codexlens embeddings-generate ~/projects/my-app -v -``` - -**Output**: -``` -Generating embeddings -Index: ~/.codexlens/indexes/my-app/_index.db -Model: code - -✓ Embeddings generated successfully! 
- Model: jinaai/jina-embeddings-v2-base-code - Chunks created: 1,234 - Files processed: 89 - Time: 45.2s - -Use vector search with: - codexlens search 'your query' --mode pure-vector -``` - -#### `codexlens embeddings-status` - -**Purpose**: Check embedding status for indexes - -**Features**: -- Check all indexes (no arguments) -- Check specific project or index -- Summary table view -- File coverage statistics -- Missing files detection -- JSON output mode - -**Examples**: -```bash -# Check all indexes -codexlens embeddings-status - -# Check specific project -codexlens embeddings-status ~/projects/my-app - -# Check specific index -codexlens embeddings-status ~/.codexlens/indexes/my-app/_index.db -``` - -**Output (all indexes)**: -``` -Embedding Status Summary -Index root: ~/.codexlens/indexes - -Total indexes: 5 -Indexes with embeddings: 3/5 -Total chunks: 4,567 - -Project Files Chunks Coverage Status -my-app 89 1,234 100.0% ✓ -other-app 145 2,456 95.5% ✓ -test-proj 23 877 100.0% ✓ -no-emb 67 0 0.0% — -legacy 45 0 0.0% — -``` - -**Output (specific project)**: -``` -Embedding Status -Index: ~/.codexlens/indexes/my-app/_index.db - -✓ Embeddings available - Total chunks: 1,234 - Total files: 89 - Files with embeddings: 89/89 - Coverage: 100.0% -``` - -### 2. Improved Error Messages - -Enhanced error messages throughout the search pipeline to guide users to the new CLI commands: - -**Before**: -``` -DEBUG: No semantic_chunks table found -DEBUG: Vector store is empty -``` - -**After**: -``` -INFO: No embeddings found in index. Generate embeddings with: codexlens embeddings-generate ~/projects/my-app -WARNING: Pure vector search returned no results. This usually means embeddings haven't been generated. Run: codexlens embeddings-generate ~/projects/my-app -``` - -**Locations Updated**: -- `src/codexlens/search/hybrid_search.py` - Added helpful info messages -- `src/codexlens/cli/commands.py` - Improved error hints in CLI output - -### 3. 
Backend Infrastructure - -Created `src/codexlens/cli/embedding_manager.py` with reusable functions: - -**Functions**: -- `check_index_embeddings(index_path)` - Check embedding status -- `generate_embeddings(index_path, ...)` - Generate embeddings -- `find_all_indexes(scan_dir)` - Find all indexes in directory -- `get_embedding_stats_summary(index_root)` - Aggregate stats for all indexes - -**Architecture**: -- Follows same pattern as `model_manager.py` for consistency -- Returns standardized result dictionaries `{"success": bool, "result": dict}` -- Supports progress callbacks for UI updates -- Handles all error cases gracefully - -### 4. Documentation Updates - -Updated user-facing documentation to reference new CLI commands: - -**Files Updated**: -1. `docs/PURE_VECTOR_SEARCH_GUIDE.md` - - Changed all references from `python scripts/generate_embeddings.py` to `codexlens embeddings-generate` - - Updated troubleshooting section - - Added new `embeddings-status` examples - -2. `docs/IMPLEMENTATION_SUMMARY.md` - - Marked P1 priorities as complete - - Added CLI integration to checklist - - Updated feature list - -3. 
`src/codexlens/cli/commands.py` - - Updated search command help text to reference new commands - -## Files Created - -| File | Purpose | Lines | -|------|---------|-------| -| `src/codexlens/cli/embedding_manager.py` | Backend logic for embedding operations | ~290 | -| `docs/CLI_INTEGRATION_SUMMARY.md` | This document | ~400 | - -## Files Modified - -| File | Changes | -|------|---------| -| `src/codexlens/cli/commands.py` | Added 2 new commands (~270 lines) | -| `src/codexlens/search/hybrid_search.py` | Improved error messages (~20 lines) | -| `docs/PURE_VECTOR_SEARCH_GUIDE.md` | Updated CLI references (~10 changes) | -| `docs/IMPLEMENTATION_SUMMARY.md` | Marked P1 complete (~10 lines) | - -## Testing Workflow - -### Manual Testing Checklist - -- [ ] `codexlens embeddings-status` with no indexes -- [ ] `codexlens embeddings-status` with multiple indexes -- [ ] `codexlens embeddings-status ~/projects/my-app` (project path) -- [ ] `codexlens embeddings-status ~/.codexlens/indexes/my-app/_index.db` (direct path) -- [ ] `codexlens embeddings-generate ~/projects/my-app` (first time) -- [ ] `codexlens embeddings-generate ~/projects/my-app` (already exists, should error) -- [ ] `codexlens embeddings-generate ~/projects/my-app --force` (regenerate) -- [ ] `codexlens embeddings-generate ~/projects/my-app --model fast` -- [ ] `codexlens embeddings-generate ~/projects/my-app -v` (verbose output) -- [ ] `codexlens search "query" --mode pure-vector` (with embeddings) -- [ ] `codexlens search "query" --mode pure-vector` (without embeddings, check error message) -- [ ] `codexlens embeddings-status --json` (JSON output) -- [ ] `codexlens embeddings-generate ~/projects/my-app --json` (JSON output) - -### Expected Test Results - -**Without embeddings**: -```bash -$ codexlens embeddings-status ~/projects/my-app -Embedding Status -Index: ~/.codexlens/indexes/my-app/_index.db - -— No embeddings found - Total files indexed: 89 - -Generate embeddings with: - codexlens 
embeddings-generate ~/projects/my-app -``` - -**After generating embeddings**: -```bash -$ codexlens embeddings-generate ~/projects/my-app -Generating embeddings -Index: ~/.codexlens/indexes/my-app/_index.db -Model: code - -✓ Embeddings generated successfully! - Model: jinaai/jina-embeddings-v2-base-code - Chunks created: 1,234 - Files processed: 89 - Time: 45.2s -``` - -**Status after generation**: -```bash -$ codexlens embeddings-status ~/projects/my-app -Embedding Status -Index: ~/.codexlens/indexes/my-app/_index.db - -✓ Embeddings available - Total chunks: 1,234 - Total files: 89 - Files with embeddings: 89/89 - Coverage: 100.0% -``` - -**Pure vector search**: -```bash -$ codexlens search "how to authenticate users" --mode pure-vector -Found 5 results in 12.3ms: - -auth/authentication.py:42 [0.876] - def authenticate_user(username: str, password: str) -> bool: - '''Verify user credentials against database.''' - return check_password(username, password) -... -``` - -## User Experience Improvements - -| Before | After | -|--------|-------| -| Run separate Python script | Single CLI command | -| Manual path resolution | Auto-finds project index | -| No status check | `embeddings-status` command | -| Generic error messages | Helpful hints with commands | -| Script-level documentation | Integrated `--help` text | - -## Backward Compatibility - -- ✅ Standalone script `scripts/generate_embeddings.py` still works -- ✅ All existing search modes unchanged -- ✅ Pure vector implementation backward compatible -- ✅ No breaking changes to APIs - -## Next Steps (Optional) - -Future enhancements users might want: - -1. **Batch operations**: - ```bash - codexlens embeddings-generate --all # Generate for all indexes - ``` - -2. **Incremental updates**: - ```bash - codexlens embeddings-update ~/projects/my-app # Only changed files - ``` - -3. **Embedding cleanup**: - ```bash - codexlens embeddings-delete ~/projects/my-app # Remove embeddings - ``` - -4. 
**Model management integration**: - ```bash - codexlens embeddings-generate ~/projects/my-app --download-model - ``` - ---- - -## Summary - -✅ **Completed**: Full CLI integration for embedding management -✅ **User Experience**: Simplified from multi-step script to single command -✅ **Error Handling**: Helpful messages guide users to correct commands -✅ **Documentation**: All references updated to new CLI commands -✅ **Testing**: Manual testing checklist prepared - -**Impact**: Users can now manage embeddings with intuitive CLI commands instead of running scripts, making vector search more accessible and easier to use. - -**Command Summary**: -```bash -codexlens embeddings-status [path] # Check status -codexlens embeddings-generate [--model] [--force] # Generate -codexlens search "query" --mode pure-vector # Use vector search -``` - -The integration is **complete and ready for testing**. diff --git a/codex-lens/docs/DOCSTRING_LLM_HYBRID_DESIGN.md b/codex-lens/docs/DOCSTRING_LLM_HYBRID_DESIGN.md deleted file mode 100644 index 8b7f17be..00000000 --- a/codex-lens/docs/DOCSTRING_LLM_HYBRID_DESIGN.md +++ /dev/null @@ -1,972 +0,0 @@ -# Docstring与LLM混合策略设计方案 - -## 1. 背景与目标 - -### 1.1 当前问题 - -现有 `llm_enhancer.py` 的实现存在以下问题: - -1. **忽略已有文档**:对所有代码无差别调用LLM,即使已有高质量的docstring -2. **成本浪费**:重复生成已有信息,增加API调用费用和时间 -3. **信息质量不一致**:LLM生成的内容可能不如作者编写的docstring准确 -4. **缺少作者意图**:丢失了docstring中的设计决策、使用示例等关键信息 - -### 1.2 设计目标 - -实现**智能混合策略**,结合docstring和LLM的优势: - -1. **优先使用docstring**:作为最权威的信息源 -2. **LLM作为补充**:填补docstring缺失或质量不足的部分 -3. **智能质量评估**:自动判断docstring质量,决定是否需要LLM增强 -4. **成本优化**:减少不必要的LLM调用,降低API费用 -5. **信息融合**:将docstring和LLM生成的内容有机结合 - -## 2. 
技术架构 - -### 2.1 整体流程 - -``` -Code Symbol - ↓ -[Docstring Extractor] ← 提取docstring - ↓ -[Quality Evaluator] ← 评估docstring质量 - ↓ - ├─ High Quality → Use Docstring Directly - │ + LLM Generate Keywords Only - │ - ├─ Medium Quality → LLM Refine & Enhance - │ (docstring作为base) - │ - └─ Low/No Docstring → LLM Full Generation - (现有流程) - ↓ -[Metadata Merger] ← 合并docstring和LLM内容 - ↓ -Final SemanticMetadata -``` - -### 2.2 核心组件 - -```python -from dataclasses import dataclass -from enum import Enum -from typing import Optional - -class DocstringQuality(Enum): - """Docstring质量等级""" - MISSING = "missing" # 无docstring - LOW = "low" # 质量低:<10字符或纯占位符 - MEDIUM = "medium" # 质量中:有基本描述但不完整 - HIGH = "high" # 质量高:详细且结构化 - -@dataclass -class DocstringMetadata: - """从docstring提取的元数据""" - raw_text: str - quality: DocstringQuality - summary: Optional[str] = None # 提取的摘要 - parameters: Optional[dict] = None # 参数说明 - returns: Optional[str] = None # 返回值说明 - examples: Optional[str] = None # 使用示例 - notes: Optional[str] = None # 注意事项 -``` - -## 3. 
详细实现步骤 - -### 3.1 Docstring提取与解析 - -```python -import re -from typing import Optional - -class DocstringExtractor: - """Docstring提取器""" - - # Docstring风格正则 - GOOGLE_STYLE_PATTERN = re.compile( - r'Args:|Returns:|Raises:|Examples:|Note:', - re.MULTILINE - ) - - NUMPY_STYLE_PATTERN = re.compile( - r'Parameters\n-+|Returns\n-+|Examples\n-+', - re.MULTILINE - ) - - def extract_from_code(self, content: str, symbol: Symbol) -> Optional[str]: - """从代码中提取docstring""" - - lines = content.splitlines() - start_line = symbol.range[0] - 1 # 0-indexed - - # 查找函数定义后的第一个字符串字面量 - # 通常在函数定义的下一行或几行内 - for i in range(start_line + 1, min(start_line + 10, len(lines))): - line = lines[i].strip() - - # Python triple-quoted string - if line.startswith('"""') or line.startswith("'''"): - return self._extract_multiline_docstring(lines, i) - - return None - - def _extract_multiline_docstring( - self, - lines: List[str], - start_idx: int - ) -> str: - """提取多行docstring""" - - quote_char = '"""' if lines[start_idx].strip().startswith('"""') else "'''" - docstring_lines = [] - - # 检查是否单行docstring - first_line = lines[start_idx].strip() - if first_line.count(quote_char) == 2: - # 单行: """This is a docstring.""" - return first_line.strip(quote_char).strip() - - # 多行docstring - in_docstring = True - for i in range(start_idx, len(lines)): - line = lines[i] - - if i == start_idx: - # 第一行:移除开始的引号 - docstring_lines.append(line.strip().lstrip(quote_char)) - elif quote_char in line: - # 结束行:移除结束的引号 - docstring_lines.append(line.strip().rstrip(quote_char)) - break - else: - docstring_lines.append(line.strip()) - - return '\n'.join(docstring_lines).strip() - - def parse_docstring(self, raw_docstring: str) -> DocstringMetadata: - """解析docstring,提取结构化信息""" - - if not raw_docstring: - return DocstringMetadata( - raw_text="", - quality=DocstringQuality.MISSING - ) - - # 评估质量 - quality = self._evaluate_quality(raw_docstring) - - # 提取各个部分 - metadata = DocstringMetadata( - raw_text=raw_docstring, - quality=quality, 
- ) - - # 提取摘要(第一行或第一段) - metadata.summary = self._extract_summary(raw_docstring) - - # 如果是Google或NumPy风格,提取结构化内容 - if self.GOOGLE_STYLE_PATTERN.search(raw_docstring): - self._parse_google_style(raw_docstring, metadata) - elif self.NUMPY_STYLE_PATTERN.search(raw_docstring): - self._parse_numpy_style(raw_docstring, metadata) - - return metadata - - def _evaluate_quality(self, docstring: str) -> DocstringQuality: - """评估docstring质量""" - - if not docstring or len(docstring.strip()) == 0: - return DocstringQuality.MISSING - - # 检查是否是占位符 - placeholders = ['todo', 'fixme', 'tbd', 'placeholder', '...'] - if any(p in docstring.lower() for p in placeholders): - return DocstringQuality.LOW - - # 长度检查 - if len(docstring.strip()) < 10: - return DocstringQuality.LOW - - # 检查是否有结构化内容 - has_structure = ( - self.GOOGLE_STYLE_PATTERN.search(docstring) or - self.NUMPY_STYLE_PATTERN.search(docstring) - ) - - # 检查是否有足够的描述性文本 - word_count = len(docstring.split()) - - if has_structure and word_count >= 20: - return DocstringQuality.HIGH - elif word_count >= 10: - return DocstringQuality.MEDIUM - else: - return DocstringQuality.LOW - - def _extract_summary(self, docstring: str) -> str: - """提取摘要(第一行或第一段)""" - - lines = docstring.split('\n') - # 第一行非空行作为摘要 - for line in lines: - if line.strip(): - return line.strip() - - return "" - - def _parse_google_style(self, docstring: str, metadata: DocstringMetadata): - """解析Google风格docstring""" - - # 提取Args - args_match = re.search(r'Args:(.*?)(?=Returns:|Raises:|Examples:|Note:|\Z)', docstring, re.DOTALL) - if args_match: - metadata.parameters = self._parse_args_section(args_match.group(1)) - - # 提取Returns - returns_match = re.search(r'Returns:(.*?)(?=Raises:|Examples:|Note:|\Z)', docstring, re.DOTALL) - if returns_match: - metadata.returns = returns_match.group(1).strip() - - # 提取Examples - examples_match = re.search(r'Examples:(.*?)(?=Note:|\Z)', docstring, re.DOTALL) - if examples_match: - metadata.examples = examples_match.group(1).strip() - 
- def _parse_args_section(self, args_text: str) -> dict: - """解析参数列表""" - - params = {} - # 匹配 "param_name (type): description" 或 "param_name: description" - pattern = re.compile(r'(\w+)\s*(?:\(([^)]+)\))?\s*:\s*(.+)') - - for line in args_text.split('\n'): - match = pattern.search(line.strip()) - if match: - param_name, param_type, description = match.groups() - params[param_name] = { - 'type': param_type, - 'description': description.strip() - } - - return params -``` - -### 3.2 智能混合策略引擎 - -```python -class HybridEnhancer: - """Docstring与LLM混合增强器""" - - def __init__( - self, - llm_enhancer: LLMEnhancer, - docstring_extractor: DocstringExtractor - ): - self.llm_enhancer = llm_enhancer - self.docstring_extractor = docstring_extractor - - def enhance_with_strategy( - self, - file_data: FileData, - symbols: List[Symbol] - ) -> Dict[str, SemanticMetadata]: - """根据docstring质量选择增强策略""" - - results = {} - - for symbol in symbols: - # 1. 提取并解析docstring - raw_docstring = self.docstring_extractor.extract_from_code( - file_data.content, symbol - ) - doc_metadata = self.docstring_extractor.parse_docstring(raw_docstring or "") - - # 2. 
根据质量选择策略 - semantic_metadata = self._apply_strategy( - file_data, symbol, doc_metadata - ) - - results[symbol.name] = semantic_metadata - - return results - - def _apply_strategy( - self, - file_data: FileData, - symbol: Symbol, - doc_metadata: DocstringMetadata - ) -> SemanticMetadata: - """应用混合策略""" - - quality = doc_metadata.quality - - if quality == DocstringQuality.HIGH: - # 高质量:直接使用docstring,只用LLM生成keywords - return self._use_docstring_with_llm_keywords(symbol, doc_metadata) - - elif quality == DocstringQuality.MEDIUM: - # 中等质量:让LLM精炼和增强 - return self._refine_with_llm(file_data, symbol, doc_metadata) - - else: # LOW or MISSING - # 低质量或无:完全由LLM生成 - return self._full_llm_generation(file_data, symbol) - - def _use_docstring_with_llm_keywords( - self, - symbol: Symbol, - doc_metadata: DocstringMetadata - ) -> SemanticMetadata: - """策略1:使用docstring,LLM只生成keywords""" - - # 直接使用docstring的摘要 - summary = doc_metadata.summary or doc_metadata.raw_text[:200] - - # 使用LLM生成keywords - keywords = self._generate_keywords_only(summary, symbol.name) - - # 从docstring推断purpose - purpose = self._infer_purpose_from_docstring(doc_metadata) - - return SemanticMetadata( - summary=summary, - keywords=keywords, - purpose=purpose, - file_path=symbol.file_path if hasattr(symbol, 'file_path') else None, - symbol_name=symbol.name, - llm_tool="hybrid_docstring_primary", - ) - - def _refine_with_llm( - self, - file_data: FileData, - symbol: Symbol, - doc_metadata: DocstringMetadata - ) -> SemanticMetadata: - """策略2:让LLM精炼和增强docstring""" - - prompt = f""" -PURPOSE: Refine and enhance an existing docstring for better semantic search -TASK: -- Review the existing docstring -- Generate a concise summary (1-2 sentences) that captures the core purpose -- Extract 8-12 relevant keywords for search -- Identify the functional category/purpose - -EXISTING DOCSTRING: -{doc_metadata.raw_text} - -CODE CONTEXT: -Function: {symbol.name} -```{file_data.language} -{self._get_symbol_code(file_data.content, 
symbol)} -``` - -OUTPUT: JSON format -{{ - "summary": "refined summary based on docstring and code", - "keywords": ["keyword1", "keyword2", ...], - "purpose": "category" -}} -""" - - response = self.llm_enhancer._invoke_ccw_cli(prompt, tool='gemini') - if response['success']: - data = json.loads(self.llm_enhancer._extract_json(response['stdout'])) - return SemanticMetadata( - summary=data.get('summary', doc_metadata.summary), - keywords=data.get('keywords', []), - purpose=data.get('purpose', 'unknown'), - file_path=file_data.path, - symbol_name=symbol.name, - llm_tool="hybrid_llm_refined", - ) - - # Fallback: 使用docstring - return self._use_docstring_with_llm_keywords(symbol, doc_metadata) - - def _full_llm_generation( - self, - file_data: FileData, - symbol: Symbol - ) -> SemanticMetadata: - """策略3:完全由LLM生成(原有流程)""" - - # 复用现有的LLM enhancer - code_snippet = self._get_symbol_code(file_data.content, symbol) - - results = self.llm_enhancer.enhance_files([ - FileData( - path=f"{file_data.path}:{symbol.name}", - content=code_snippet, - language=file_data.language - ) - ]) - - return results.get(f"{file_data.path}:{symbol.name}", SemanticMetadata( - summary="", - keywords=[], - purpose="unknown", - file_path=file_data.path, - symbol_name=symbol.name, - llm_tool="hybrid_llm_full", - )) - - def _generate_keywords_only(self, summary: str, symbol_name: str) -> List[str]: - """仅生成keywords(快速LLM调用)""" - - prompt = f""" -PURPOSE: Generate search keywords for a code function -TASK: Extract 5-8 relevant keywords from the summary - -Summary: {summary} -Function Name: {symbol_name} - -OUTPUT: Comma-separated keywords -""" - - response = self.llm_enhancer._invoke_ccw_cli(prompt, tool='gemini') - if response['success']: - keywords_str = response['stdout'].strip() - return [k.strip() for k in keywords_str.split(',')] - - # Fallback: 从摘要提取关键词 - return self._extract_keywords_heuristic(summary) - - def _extract_keywords_heuristic(self, text: str) -> List[str]: - """启发式关键词提取(无需LLM)""" - - 
# 简单实现:提取名词性词组 - import re - words = re.findall(r'\b[a-z]{4,}\b', text.lower()) - - # 过滤常见词 - stopwords = {'this', 'that', 'with', 'from', 'have', 'will', 'your', 'their'} - keywords = [w for w in words if w not in stopwords] - - return list(set(keywords))[:8] - - def _infer_purpose_from_docstring(self, doc_metadata: DocstringMetadata) -> str: - """从docstring推断purpose(无需LLM)""" - - summary = doc_metadata.summary.lower() - - # 简单规则匹配 - if 'authenticate' in summary or 'login' in summary: - return 'auth' - elif 'validate' in summary or 'check' in summary: - return 'validation' - elif 'parse' in summary or 'format' in summary: - return 'data_processing' - elif 'api' in summary or 'endpoint' in summary: - return 'api' - elif 'database' in summary or 'query' in summary: - return 'data' - elif 'test' in summary: - return 'test' - - return 'util' - - def _get_symbol_code(self, content: str, symbol: Symbol) -> str: - """提取符号的代码""" - - lines = content.splitlines() - start, end = symbol.range - return '\n'.join(lines[start-1:end]) -``` - -### 3.3 成本优化统计 - -```python -@dataclass -class EnhancementStats: - """增强统计""" - total_symbols: int = 0 - used_docstring_only: int = 0 # 只使用docstring - llm_keywords_only: int = 0 # LLM只生成keywords - llm_refined: int = 0 # LLM精炼docstring - llm_full_generation: int = 0 # LLM完全生成 - total_llm_calls: int = 0 - estimated_cost_savings: float = 0.0 # 相比全用LLM节省的成本 - -class CostOptimizedEnhancer(HybridEnhancer): - """带成本统计的增强器""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.stats = EnhancementStats() - - def enhance_with_strategy( - self, - file_data: FileData, - symbols: List[Symbol] - ) -> Dict[str, SemanticMetadata]: - """增强并统计成本""" - - self.stats.total_symbols += len(symbols) - results = super().enhance_with_strategy(file_data, symbols) - - # 统计各策略使用情况 - for metadata in results.values(): - if metadata.llm_tool == "hybrid_docstring_primary": - self.stats.used_docstring_only += 1 - self.stats.llm_keywords_only += 
1 - self.stats.total_llm_calls += 1 - elif metadata.llm_tool == "hybrid_llm_refined": - self.stats.llm_refined += 1 - self.stats.total_llm_calls += 1 - elif metadata.llm_tool == "hybrid_llm_full": - self.stats.llm_full_generation += 1 - self.stats.total_llm_calls += 1 - - # 计算成本节省(假设:keywords-only调用成本为full的20%) - keywords_only_savings = self.stats.llm_keywords_only * 0.8 # 节省80% - full_generation_count = self.stats.total_symbols - self.stats.llm_keywords_only - self.stats.estimated_cost_savings = keywords_only_savings / full_generation_count if full_generation_count > 0 else 0 - - return results - - def print_stats(self): - """打印统计信息""" - - print("=== Enhancement Statistics ===") - print(f"Total Symbols: {self.stats.total_symbols}") - print(f"Used Docstring (with LLM keywords): {self.stats.used_docstring_only} ({self.stats.used_docstring_only/self.stats.total_symbols*100:.1f}%)") - print(f"LLM Refined Docstring: {self.stats.llm_refined} ({self.stats.llm_refined/self.stats.total_symbols*100:.1f}%)") - print(f"LLM Full Generation: {self.stats.llm_full_generation} ({self.stats.llm_full_generation/self.stats.total_symbols*100:.1f}%)") - print(f"Total LLM Calls: {self.stats.total_llm_calls}") - print(f"Estimated Cost Savings: {self.stats.estimated_cost_savings*100:.1f}%") -``` - -## 4. 配置选项 - -```python -@dataclass -class HybridEnhancementConfig: - """混合增强配置""" - - # 是否启用混合策略(False则回退到全LLM模式) - enable_hybrid: bool = True - - # 质量阈值配置 - use_docstring_threshold: DocstringQuality = DocstringQuality.HIGH - refine_docstring_threshold: DocstringQuality = DocstringQuality.MEDIUM - - # 是否为高质量docstring生成keywords - generate_keywords_for_docstring: bool = True - - # LLM配置 - llm_tool: str = "gemini" - llm_timeout: int = 300000 - - # 成本优化 - batch_size: int = 5 # 批量处理大小 - skip_test_files: bool = True # 跳过测试文件(通常docstring较少) - - # 调试选项 - log_strategy_decisions: bool = False # 记录策略决策日志 -``` - -## 5. 
测试策略 - -### 5.1 单元测试 - -```python -import pytest - -class TestDocstringExtractor: - """测试docstring提取""" - - def test_extract_google_style(self): - """测试Google风格docstring提取""" - code = ''' -def calculate_total(items, discount=0): - """Calculate total price with optional discount. - - This function processes a list of items and applies - a discount if specified. - - Args: - items (list): List of item objects with price attribute. - discount (float): Discount percentage (0-1). Defaults to 0. - - Returns: - float: Total price after discount. - - Examples: - >>> calculate_total([item1, item2], discount=0.1) - 90.0 - """ - total = sum(item.price for item in items) - return total * (1 - discount) -''' - extractor = DocstringExtractor() - symbol = Symbol(name='calculate_total', kind='function', range=(1, 18)) - docstring = extractor.extract_from_code(code, symbol) - - assert docstring is not None - metadata = extractor.parse_docstring(docstring) - - assert metadata.quality == DocstringQuality.HIGH - assert 'Calculate total price' in metadata.summary - assert metadata.parameters is not None - assert 'items' in metadata.parameters - assert metadata.returns is not None - assert metadata.examples is not None - - def test_extract_low_quality_docstring(self): - """测试低质量docstring识别""" - code = ''' -def process(): - """TODO""" - pass -''' - extractor = DocstringExtractor() - symbol = Symbol(name='process', kind='function', range=(1, 3)) - docstring = extractor.extract_from_code(code, symbol) - - metadata = extractor.parse_docstring(docstring) - assert metadata.quality == DocstringQuality.LOW - -class TestHybridEnhancer: - """测试混合增强器""" - - def test_high_quality_docstring_strategy(self): - """测试高质量docstring使用策略""" - - extractor = DocstringExtractor() - llm_enhancer = LLMEnhancer(LLMConfig(enabled=True)) - hybrid = HybridEnhancer(llm_enhancer, extractor) - - # 模拟高质量docstring - doc_metadata = DocstringMetadata( - raw_text="Validate user credentials against database.", - 
quality=DocstringQuality.HIGH, - summary="Validate user credentials against database." - ) - - symbol = Symbol(name='validate_user', kind='function', range=(1, 10)) - - result = hybrid._use_docstring_with_llm_keywords(symbol, doc_metadata) - - # 应该使用docstring的摘要 - assert result.summary == doc_metadata.summary - # 应该有keywords(可能由LLM或启发式生成) - assert len(result.keywords) > 0 - - def test_cost_optimization(self): - """测试成本优化效果""" - - enhancer = CostOptimizedEnhancer( - llm_enhancer=LLMEnhancer(LLMConfig(enabled=False)), # Mock - docstring_extractor=DocstringExtractor() - ) - - # 模拟处理10个symbol,其中5个有高质量docstring - # 预期:5个只调用keywords生成,5个完整LLM - # 总调用10次,但成本降低(keywords调用更便宜) - - # 实际测试需要mock LLM调用 - pass -``` - -### 5.2 集成测试 - -```python -class TestHybridEnhancementPipeline: - """测试完整的混合增强流程""" - - def test_full_pipeline(self): - """测试完整流程:代码 -> docstring提取 -> 质量评估 -> 策略选择 -> 增强""" - - code = ''' -def authenticate_user(username, password): - """Authenticate user with username and password. - - Args: - username (str): User's username - password (str): User's password - - Returns: - bool: True if authenticated, False otherwise - """ - # ... 
implementation - pass - -def helper_func(x): - # No docstring - return x * 2 -''' - - file_data = FileData(path='auth.py', content=code, language='python') - symbols = [ - Symbol(name='authenticate_user', kind='function', range=(1, 11)), - Symbol(name='helper_func', kind='function', range=(13, 15)), - ] - - extractor = DocstringExtractor() - llm_enhancer = LLMEnhancer(LLMConfig(enabled=True)) - hybrid = CostOptimizedEnhancer(llm_enhancer, extractor) - - results = hybrid.enhance_with_strategy(file_data, symbols) - - # authenticate_user 应该使用docstring - assert results['authenticate_user'].llm_tool == "hybrid_docstring_primary" - - # helper_func 应该完全LLM生成 - assert results['helper_func'].llm_tool == "hybrid_llm_full" - - # 统计 - assert hybrid.stats.total_symbols == 2 - assert hybrid.stats.used_docstring_only >= 1 - assert hybrid.stats.llm_full_generation >= 1 -``` - -## 6. 实施路线图 - -### Phase 1: 基础设施(1周) -- [x] 设计数据结构(DocstringMetadata, DocstringQuality) -- [ ] 实现DocstringExtractor(提取和解析) -- [ ] 支持Python docstring(Google/NumPy/reStructuredText风格) -- [ ] 单元测试 - -### Phase 2: 质量评估(1周) -- [ ] 实现质量评估算法 -- [ ] 启发式规则优化 -- [ ] 测试不同质量的docstring -- [ ] 调整阈值参数 - -### Phase 3: 混合策略(1-2周) -- [ ] 实现HybridEnhancer -- [ ] 三种策略实现(docstring-only, refine, full-llm) -- [ ] 策略选择逻辑 -- [ ] 集成测试 - -### Phase 4: 成本优化(1周) -- [ ] 实现CostOptimizedEnhancer -- [ ] 统计和监控 -- [ ] 批量处理优化 -- [ ] 性能测试 - -### Phase 5: 多语言支持(1-2周) -- [ ] JavaScript/TypeScript JSDoc -- [ ] Java Javadoc -- [ ] 其他语言docstring格式 - -### Phase 6: 集成与部署(1周) -- [ ] 集成到现有llm_enhancer -- [ ] CLI选项暴露 -- [ ] 配置文件支持 -- [ ] 文档和示例 - -**总计预估时间**:6-8周 - -## 7. 
性能与成本分析 - -### 7.1 预期成本节省 - -假设场景:分析1000个函数 - -| Docstring质量分布 | 占比 | LLM调用策略 | 相对成本 | -|------------------|------|------------|---------| -| High (有详细docstring) | 30% | 只生成keywords | 20% | -| Medium (有基本docstring) | 40% | 精炼增强 | 60% | -| Low/Missing | 30% | 完全生成 | 100% | - -**总成本计算**: -- 纯LLM模式:1000 * 100% = 1000 units -- 混合模式:300*20% + 400*60% + 300*100% = 60 + 240 + 300 = 600 units -- **节省**:40% - -### 7.2 质量对比 - -| 指标 | 纯LLM模式 | 混合模式 | -|------|----------|---------| -| 准确性 | 中(可能有幻觉) | **高**(docstring权威) | -| 一致性 | 中(依赖prompt) | **高**(保留作者风格) | -| 覆盖率 | **高**(全覆盖) | 高(98%+) | -| 成本 | 高 | **低**(节省40%) | -| 速度 | 慢(所有文件) | **快**(减少LLM调用) | - -## 8. 潜在问题与解决方案 - -### 8.1 问题:Docstring过时 - -**现象**:代码已修改,但docstring未更新,导致信息不准确。 - -**解决方案**: -```python -class DocstringFreshnessChecker: - """检查docstring与代码的一致性""" - - def check_freshness( - self, - symbol: Symbol, - code: str, - doc_metadata: DocstringMetadata - ) -> bool: - """检查docstring是否与代码匹配""" - - # 检查1: 参数列表是否匹配 - if doc_metadata.parameters: - actual_params = self._extract_actual_parameters(code) - documented_params = set(doc_metadata.parameters.keys()) - - if actual_params != documented_params: - logger.warning( - f"Parameter mismatch in {symbol.name}: " - f"code has {actual_params}, doc has {documented_params}" - ) - return False - - # 检查2: 使用LLM验证一致性 - # TODO: 构建验证prompt - - return True -``` - -### 8.2 问题:不同docstring风格混用 - -**现象**:同一项目中使用多种docstring风格(Google, NumPy, 自定义)。 - -**解决方案**: -```python -class MultiStyleDocstringParser: - """支持多种docstring风格的解析器""" - - def parse(self, docstring: str) -> DocstringMetadata: - """自动检测并解析不同风格""" - - # 尝试各种解析器 - for parser in [ - GoogleStyleParser(), - NumpyStyleParser(), - ReStructuredTextParser(), - SimpleParser(), # Fallback - ]: - try: - metadata = parser.parse(docstring) - if metadata.quality != DocstringQuality.LOW: - return metadata - except Exception: - continue - - # 如果所有解析器都失败,返回简单解析结果 - return SimpleParser().parse(docstring) -``` - -### 8.3 问题:多语言docstring提取差异 - 
-**现象**:不同语言的docstring格式和位置不同。 - -**解决方案**: -```python -class LanguageSpecificExtractor: - """语言特定的docstring提取器""" - - def extract(self, language: str, code: str, symbol: Symbol) -> Optional[str]: - """根据语言选择合适的提取器""" - - extractors = { - 'python': PythonDocstringExtractor(), - 'javascript': JSDocExtractor(), - 'typescript': TSDocExtractor(), - 'java': JavadocExtractor(), - } - - extractor = extractors.get(language, GenericExtractor()) - return extractor.extract(code, symbol) - -class JSDocExtractor: - """JavaScript/TypeScript JSDoc提取器""" - - def extract(self, code: str, symbol: Symbol) -> Optional[str]: - """提取JSDoc注释""" - - lines = code.splitlines() - start_line = symbol.range[0] - 1 - - # 向上查找 /** ... */ 注释 - for i in range(start_line - 1, max(0, start_line - 20), -1): - if '*/' in lines[i]: - # 找到结束标记,向上提取 - return self._extract_jsdoc_block(lines, i) - - return None -``` - -## 9. 配置示例 - -### 9.1 配置文件 - -```yaml -# .codexlens/hybrid_enhancement.yaml - -hybrid_enhancement: - enabled: true - - # 质量阈值 - quality_thresholds: - use_docstring: high # high/medium/low - refine_docstring: medium - - # LLM选项 - llm: - tool: gemini - fallback: qwen - timeout_ms: 300000 - batch_size: 5 - - # 成本优化 - cost_optimization: - generate_keywords_for_docstring: true - skip_test_files: true - skip_private_methods: false - - # 语言支持 - languages: - python: - styles: [google, numpy, sphinx] - javascript: - styles: [jsdoc] - java: - styles: [javadoc] - - # 监控 - logging: - log_strategy_decisions: false - log_cost_savings: true -``` - -### 9.2 CLI使用 - -```bash -# 使用混合策略增强 -codex-lens enhance . --hybrid --tool gemini - -# 查看成本统计 -codex-lens enhance . --hybrid --show-stats - -# 仅对高质量docstring生成keywords -codex-lens enhance . --hybrid --keywords-only - -# 禁用混合模式,回退到纯LLM -codex-lens enhance . --no-hybrid --tool gemini -``` - -## 10. 成功指标 - -1. **成本节省**:相比纯LLM模式,降低API调用成本40%+ -2. **准确性提升**:使用docstring的符号,元数据准确率>95% -3. **覆盖率**:98%+的符号有语义元数据(docstring或LLM生成) -4. **速度提升**:整体处理速度提升30%+(减少LLM调用) -5. 
**用户满意度**:保留docstring信息,开发者认可度高 - -## 11. 参考资料 - -- [PEP 257 - Docstring Conventions](https://peps.python.org/pep-0257/) -- [Google Python Style Guide - Docstrings](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings) -- [NumPy Docstring Standard](https://numpydoc.readthedocs.io/en/latest/format.html) -- [JSDoc Documentation](https://jsdoc.app/) -- [Javadoc Tool](https://docs.oracle.com/javase/8/docs/technotes/tools/windows/javadoc.html) diff --git a/codex-lens/docs/IMPLEMENTATION_SUMMARY.md b/codex-lens/docs/IMPLEMENTATION_SUMMARY.md index bb0ff5bd..09d46918 100644 --- a/codex-lens/docs/IMPLEMENTATION_SUMMARY.md +++ b/codex-lens/docs/IMPLEMENTATION_SUMMARY.md @@ -394,52 +394,32 @@ results = engine.search( - 指导用户如何生成嵌入 - 集成到搜索引擎日志中 -### ✅ LLM语义增强验证 (2025-12-16) +### ❌ LLM语义增强功能已移除 (2025-12-16) -**测试目标**: 验证LLM增强的向量搜索是否正常工作,对比纯向量搜索效果 +**移除原因**: 简化代码库,减少外部依赖 -**测试基础设施**: -- 创建测试套件 `tests/test_llm_enhanced_search.py` (550+ lines) -- 创建独立测试脚本 `scripts/compare_search_methods.py` (460+ lines) -- 创建完整文档 `docs/LLM_ENHANCED_SEARCH_GUIDE.md` (460+ lines) +**已移除内容**: +- `src/codexlens/semantic/llm_enhancer.py` - LLM增强核心模块 +- `src/codexlens/cli/commands.py` 中的 `enhance` 命令 +- `tests/test_llm_enhancer.py` - LLM增强测试 +- `tests/test_llm_enhanced_search.py` - LLM对比测试 +- `scripts/compare_search_methods.py` - 对比测试脚本 +- `scripts/test_misleading_comments.py` - 误导性注释测试 +- `scripts/show_llm_analysis.py` - LLM分析展示脚本 +- `scripts/inspect_llm_summaries.py` - LLM摘要检查工具 +- `docs/LLM_ENHANCED_SEARCH_GUIDE.md` - LLM使用指南 +- `docs/LLM_ENHANCEMENT_TEST_RESULTS.md` - LLM测试结果 +- `docs/MISLEADING_COMMENTS_TEST_RESULTS.md` - 误导性注释测试结果 +- `docs/CLI_INTEGRATION_SUMMARY.md` - CLI集成文档(包含enhance命令) +- `docs/DOCSTRING_LLM_HYBRID_DESIGN.md` - LLM混合策略设计 -**测试数据**: -- 5个真实Python代码样本 (认证、API、验证、数据库) -- 6个自然语言测试查询 -- 涵盖密码哈希、JWT令牌、用户API、邮箱验证、数据库连接等场景 +**保留功能**: +- ✅ 纯向量搜索 (pure_vector) 完整保留 +- ✅ 语义嵌入生成 (`codexlens embeddings-generate`) +- ✅ 语义嵌入状态检查 (`codexlens embeddings-status`) +- ✅ 
所有核心搜索功能 -**测试结果** (2025-12-16): -``` -数据集: 5个Python文件, 5个查询 -测试工具: Gemini Flash 2.5 - -Setup Time: - - Pure Vector: 2.3秒 (直接嵌入代码) - - LLM-Enhanced: 174.2秒 (通过Gemini生成摘要, 75x slower) - -Accuracy: - - Pure Vector: 5/5 (100%) - 所有查询Rank 1 - - LLM-Enhanced: 5/5 (100%) - 所有查询Rank 1 - - Score: 15 vs 15 (平局) -``` - -**关键发现**: -1. ✅ **LLM增强功能正常工作** - - CCW CLI集成正常 - - Gemini API调用成功 - - 摘要生成和嵌入创建正常 - -2. **性能权衡** - - 索引阶段慢75倍 (LLM API调用开销) - - 查询阶段速度相同 (都是向量相似度搜索) - - 适合离线索引,在线查询场景 - -3. **准确性** - - 测试数据集太简单 (5文件,完美1:1映射) - - 两种方法都达到100%准确率 - - 需要更大、更复杂的代码库来显示差异 - -**结论**: LLM语义增强功能已验证可正常工作,可用于生产环境 +**历史记录**: LLM增强功能在测试中表现良好,但为简化维护和减少外部依赖(CCW CLI, Gemini/Qwen API)而移除。设计文档(DESIGN_EVALUATION_REPORT.md等)保留作为历史参考。 ### P2 - 中期(1-2月) diff --git a/codex-lens/docs/LLM_ENHANCED_SEARCH_GUIDE.md b/codex-lens/docs/LLM_ENHANCED_SEARCH_GUIDE.md deleted file mode 100644 index f4bcebda..00000000 --- a/codex-lens/docs/LLM_ENHANCED_SEARCH_GUIDE.md +++ /dev/null @@ -1,463 +0,0 @@ -# LLM-Enhanced Semantic Search Guide - -**Last Updated**: 2025-12-16 -**Status**: Experimental Feature - ---- - -## Overview - -CodexLens supports two approaches for semantic vector search: - -| Approach | Pipeline | Best For | -|----------|----------|----------| -| **Pure Vector** | Code → fastembed → search | Code pattern matching, exact functionality | -| **LLM-Enhanced** | Code → LLM summary → fastembed → search | Natural language queries, conceptual search | - -### Why LLM Enhancement? - -**Problem**: Raw code embeddings don't match natural language well. - -``` -Query: "How do I hash passwords securely?" -Raw code: def hash_password(password: str) -> str: ... -Mismatch: Low semantic similarity -``` - -**Solution**: LLM generates natural language summaries. - -``` -Query: "How do I hash passwords securely?" -LLM Summary: "Hash a password using bcrypt with specified salt rounds for secure storage" -Match: High semantic similarity ✓ -``` - -## Architecture - -### Pure Vector Search Flow - -``` -1. 
Code File - └→ "def hash_password(password: str): ..." - -2. Chunking - └→ Split into semantic chunks (500-2000 chars) - -3. Embedding (fastembed) - └→ Generate 768-dim vector from raw code - -4. Storage - └→ Store vector in semantic_chunks table - -5. Query - └→ "How to hash passwords" - └→ Generate query vector - └→ Find similar vectors (cosine similarity) -``` - -**Pros**: Fast, no external dependencies, good for code patterns -**Cons**: Poor semantic match for natural language queries - -### LLM-Enhanced Search Flow - -``` -1. Code File - └→ "def hash_password(password: str): ..." - -2. LLM Analysis (Gemini/Qwen via CCW) - └→ Generate summary: "Hash a password using bcrypt..." - └→ Extract keywords: ["password", "hash", "bcrypt", "security"] - └→ Identify purpose: "auth" - -3. Embeddable Text Creation - └→ Combine: summary + keywords + purpose + filename - -4. Embedding (fastembed) - └→ Generate 768-dim vector from LLM text - -5. Storage - └→ Store vector with metadata - -6. Query - └→ "How to hash passwords" - └→ Generate query vector - └→ Find similar vectors → Better match! ✓ -``` - -**Pros**: Excellent semantic match for natural language -**Cons**: Slower, requires CCW CLI and LLM access - -## Setup Requirements - -### 1. Install Dependencies - -```bash -# Install semantic search dependencies -pip install codexlens[semantic] - -# Install CCW CLI for LLM enhancement -npm install -g ccw -``` - -### 2. Configure LLM Tools - -```bash -# Set primary LLM tool (default: gemini) -export CCW_CLI_SECONDARY_TOOL=gemini - -# Set fallback tool (default: qwen) -export CCW_CLI_FALLBACK_TOOL=qwen - -# Configure API keys (see CCW documentation) -ccw config set gemini.apiKey YOUR_API_KEY -``` - -### 3. 
Verify Setup - -```bash -# Check CCW availability -ccw --version - -# Check semantic dependencies -python -c "from codexlens.semantic import SEMANTIC_AVAILABLE; print(SEMANTIC_AVAILABLE)" -``` - -## Running Comparison Tests - -### Method 1: Standalone Script (Recommended) - -```bash -# Run full comparison (pure vector + LLM-enhanced) -python scripts/compare_search_methods.py - -# Use specific LLM tool -python scripts/compare_search_methods.py --tool gemini -python scripts/compare_search_methods.py --tool qwen - -# Skip LLM test (only pure vector) -python scripts/compare_search_methods.py --skip-llm -``` - -**Output Example**: - -``` -====================================================================== -SEMANTIC SEARCH COMPARISON TEST -Pure Vector vs LLM-Enhanced Vector Search -====================================================================== - -Test dataset: 5 Python files -Test queries: 5 natural language questions - -====================================================================== -PURE VECTOR SEARCH (Code → fastembed) -====================================================================== -Setup: 5 files, 23 chunks in 2.3s - -Query Top Result Score ----------------------------------------------------------------------- -✓ How do I securely hash passwords? password_hasher.py 0.723 -✗ Generate JWT token for authentication user_endpoints.py 0.645 -✓ Create new user account via API user_endpoints.py 0.812 -✓ Validate email address format validation.py 0.756 -~ Connect to PostgreSQL database connection.py 0.689 - -====================================================================== -LLM-ENHANCED SEARCH (Code → GEMINI → fastembed) -====================================================================== -Generating LLM summaries for 5 files... -Setup: 5/5 files indexed in 8.7s - -Query Top Result Score ----------------------------------------------------------------------- -✓ How do I securely hash passwords? 
password_hasher.py 0.891 -✓ Generate JWT token for authentication jwt_handler.py 0.867 -✓ Create new user account via API user_endpoints.py 0.923 -✓ Validate email address format validation.py 0.845 -✓ Connect to PostgreSQL database connection.py 0.801 - -====================================================================== -COMPARISON SUMMARY -====================================================================== - -Query Pure LLM ----------------------------------------------------------------------- -How do I securely hash passwords? ✓ Rank 1 ✓ Rank 1 -Generate JWT token for authentication ✗ Miss ✓ Rank 1 -Create new user account via API ✓ Rank 1 ✓ Rank 1 -Validate email address format ✓ Rank 1 ✓ Rank 1 -Connect to PostgreSQL database ~ Rank 2 ✓ Rank 1 ----------------------------------------------------------------------- -TOTAL SCORE 11 15 -====================================================================== - -ANALYSIS: -✓ LLM enhancement improves results by 36.4% - Natural language summaries match queries better than raw code -``` - -### Method 2: Pytest Test Suite - -```bash -# Run full test suite -pytest tests/test_llm_enhanced_search.py -v -s - -# Run specific test -pytest tests/test_llm_enhanced_search.py::TestSearchComparison::test_comparison -v -s - -# Skip LLM tests if CCW not available -pytest tests/test_llm_enhanced_search.py -v -s -k "not llm_enhanced" -``` - -## Using LLM Enhancement in Production - -### Option 1: Enhanced Embeddings Generation (Recommended) - -Create embeddings with LLM enhancement during indexing: - -```python -from pathlib import Path -from codexlens.semantic.llm_enhancer import create_enhanced_indexer, FileData - -# Create enhanced indexer -indexer = create_enhanced_indexer( - vector_store_path=Path("~/.codexlens/indexes/project/_index.db"), - llm_tool="gemini", - llm_enabled=True, -) - -# Prepare file data -files = [ - FileData( - path="auth/password_hasher.py", - content=open("auth/password_hasher.py").read(), - 
language="python" - ), - # ... more files -] - -# Index with LLM enhancement -indexed_count = indexer.index_files(files) -print(f"Indexed {indexed_count} files with LLM enhancement") -``` - -### Option 2: CLI Integration (Coming Soon) - -```bash -# Generate embeddings with LLM enhancement -codexlens embeddings-generate ~/projects/my-app --llm-enhanced --tool gemini - -# Check which strategy was used -codexlens embeddings-status ~/projects/my-app --show-strategies -``` - -**Note**: CLI integration is planned but not yet implemented. Currently use Option 1 (Python API). - -### Option 3: Hybrid Approach - -Combine both strategies for best results: - -```python -# Generate both pure and LLM-enhanced embeddings -# 1. Pure vector for exact code matching -generate_pure_embeddings(files) - -# 2. LLM-enhanced for semantic matching -generate_llm_embeddings(files) - -# Search uses both and ranks by best match -``` - -## Performance Considerations - -### Speed Comparison - -| Approach | Indexing Time (100 files) | Query Time | Cost | -|----------|---------------------------|------------|------| -| Pure Vector | ~30s | ~50ms | Free | -| LLM-Enhanced | ~5-10 min | ~50ms | LLM API costs | - -**LLM indexing is slower** because: -- Calls external LLM API (gemini/qwen) -- Processes files in batches (default: 5 files/batch) -- Waits for LLM response (~2-5s per batch) - -**Query speed is identical** because: -- Both use fastembed for similarity search -- Vector lookup is same speed -- Difference is only in what was embedded - -### Cost Estimation - -**Gemini Flash (via CCW)**: -- ~$0.10 per 1M input tokens -- Average: ~500 tokens per file -- 100 files = ~$0.005 (half a cent) - -**Qwen (local)**: -- Free if running locally -- Slower than Gemini Flash - -### When to Use Each Approach - -| Use Case | Recommendation | -|----------|----------------| -| **Code pattern search** | Pure vector (e.g., "find all REST endpoints") | -| **Natural language queries** | LLM-enhanced (e.g., "how to 
authenticate users") | -| **Large codebase** | Pure vector first, LLM for important modules | -| **Personal projects** | LLM-enhanced (cost is minimal) | -| **Enterprise** | Hybrid approach | - -## Configuration Options - -### LLM Config - -```python -from codexlens.semantic.llm_enhancer import LLMConfig, LLMEnhancer - -config = LLMConfig( - tool="gemini", # Primary LLM tool - fallback_tool="qwen", # Fallback if primary fails - timeout_ms=300000, # 5 minute timeout - batch_size=5, # Files per batch - max_content_chars=8000, # Max chars per file in prompt - enabled=True, # Enable/disable LLM -) - -enhancer = LLMEnhancer(config) -``` - -### Environment Variables - -```bash -# Override default LLM tool -export CCW_CLI_SECONDARY_TOOL=gemini - -# Override fallback tool -export CCW_CLI_FALLBACK_TOOL=qwen - -# Disable LLM enhancement (fall back to pure vector) -export CODEXLENS_LLM_ENABLED=false -``` - -## Troubleshooting - -### Issue 1: CCW CLI Not Found - -**Error**: `CCW CLI not found in PATH, LLM enhancement disabled` - -**Solution**: -```bash -# Install CCW globally -npm install -g ccw - -# Verify installation -ccw --version - -# Check PATH -which ccw # Unix -where ccw # Windows -``` - -### Issue 2: LLM API Errors - -**Error**: `LLM call failed: HTTP 429 Too Many Requests` - -**Solution**: -- Reduce batch size in LLMConfig -- Add delay between batches -- Check API quota/limits -- Try fallback tool (qwen) - -### Issue 3: Poor LLM Summaries - -**Symptom**: LLM summaries are too generic or inaccurate - -**Solution**: -- Try different LLM tool (gemini vs qwen) -- Increase max_content_chars (default 8000) -- Manually review and refine summaries -- Fall back to pure vector for code-heavy files - -### Issue 4: Slow Indexing - -**Symptom**: Indexing takes too long with LLM enhancement - -**Solution**: -```python -# Reduce batch size for faster feedback -config = LLMConfig(batch_size=2) # Default is 5 - -# Or use pure vector for large files -if file_size > 10000: - 
use_pure_vector() -else: - use_llm_enhanced() -``` - -## Example Test Queries - -### Good for LLM-Enhanced Search - -```python -# Natural language, conceptual queries -"How do I authenticate users with JWT?" -"Validate email addresses before saving to database" -"Secure password storage with hashing" -"Create REST API endpoint for user registration" -"Connect to PostgreSQL with connection pooling" -``` - -### Good for Pure Vector Search - -```python -# Code-specific, pattern-matching queries -"bcrypt.hashpw" -"jwt.encode" -"@app.route POST" -"re.match email" -"psycopg2.pool.SimpleConnectionPool" -``` - -### Best: Combine Both - -Use LLM-enhanced for high-level search, then pure vector for refinement: - -```python -# Step 1: LLM-enhanced for semantic search -results = search_llm_enhanced("user authentication with tokens") -# Returns: jwt_handler.py, password_hasher.py, user_endpoints.py - -# Step 2: Pure vector for exact code pattern -results = search_pure_vector("jwt.encode") -# Returns: jwt_handler.py (exact match) -``` - -## Future Improvements - -- [ ] CLI integration for `--llm-enhanced` flag -- [ ] Incremental LLM summary updates -- [ ] Caching LLM summaries to reduce API calls -- [ ] Hybrid search combining both approaches -- [ ] Custom prompt templates for specific domains -- [ ] Local LLM support (ollama, llama.cpp) - -## Related Documentation - -- `PURE_VECTOR_SEARCH_GUIDE.md` - Pure vector search usage -- `IMPLEMENTATION_SUMMARY.md` - Technical implementation details -- `scripts/compare_search_methods.py` - Comparison test script -- `tests/test_llm_enhanced_search.py` - Test suite - -## References - -- **LLM Enhancer Implementation**: `src/codexlens/semantic/llm_enhancer.py` -- **CCW CLI Documentation**: https://github.com/anthropics/ccw -- **Fastembed**: https://github.com/qdrant/fastembed - ---- - -**Questions?** Run the comparison script to see LLM enhancement in action: -```bash -python scripts/compare_search_methods.py -``` diff --git 
a/codex-lens/docs/LLM_ENHANCEMENT_TEST_RESULTS.md b/codex-lens/docs/LLM_ENHANCEMENT_TEST_RESULTS.md deleted file mode 100644 index 5736e3ee..00000000 --- a/codex-lens/docs/LLM_ENHANCEMENT_TEST_RESULTS.md +++ /dev/null @@ -1,232 +0,0 @@ -# LLM语义增强测试结果 - -**测试日期**: 2025-12-16 -**状态**: ✅ 通过 - LLM增强功能正常工作 - ---- - -## 📊 测试结果概览 - -### 测试配置 - -| 项目 | 配置 | -|------|------| -| **测试工具** | Gemini Flash 2.5 (via CCW CLI) | -| **测试数据** | 5个Python代码文件 | -| **查询数量** | 5个自然语言查询 | -| **嵌入模型** | BAAI/bge-small-en-v1.5 (768维) | - -### 性能对比 - -| 指标 | 纯向量搜索 | LLM增强搜索 | 差异 | -|------|-----------|------------|------| -| **索引时间** | 2.3秒 | 174.2秒 | 75倍慢 | -| **查询速度** | ~50ms | ~50ms | 相同 | -| **准确率** | 5/5 (100%) | 5/5 (100%) | 相同 | -| **排名得分** | 15/15 | 15/15 | 平局 | - -### 详细结果 - -所有5个查询都找到了正确的文件 (Rank 1): - -| 查询 | 预期文件 | 纯向量 | LLM增强 | -|------|---------|--------|---------| -| 如何安全地哈希密码? | password_hasher.py | [OK] Rank 1 | [OK] Rank 1 | -| 生成JWT令牌进行认证 | jwt_handler.py | [OK] Rank 1 | [OK] Rank 1 | -| 通过API创建新用户账户 | user_endpoints.py | [OK] Rank 1 | [OK] Rank 1 | -| 验证电子邮件地址格式 | validation.py | [OK] Rank 1 | [OK] Rank 1 | -| 连接到PostgreSQL数据库 | connection.py | [OK] Rank 1 | [OK] Rank 1 | - ---- - -## ✅ 验证结论 - -### 1. LLM增强功能工作正常 - -- ✅ **CCW CLI集成**: 成功调用外部CLI工具 -- ✅ **Gemini API**: API调用成功,无错误 -- ✅ **摘要生成**: LLM成功生成代码摘要和关键词 -- ✅ **嵌入创建**: 从摘要成功生成768维向量 -- ✅ **向量存储**: 正确存储到semantic_chunks表 -- ✅ **搜索准确性**: 100%准确匹配所有查询 - -### 2. 性能权衡分析 - -**优势**: -- 查询速度与纯向量相同 (~50ms) -- 更好的语义理解能力 (理论上) -- 适合自然语言查询 - -**劣势**: -- 索引阶段慢75倍 (174s vs 2.3s) -- 需要外部LLM API (成本) -- 需要安装和配置CCW CLI - -**适用场景**: -- 离线索引,在线查询 -- 个人项目 (成本可忽略) -- 重视自然语言查询体验 - -### 3. 测试数据集局限性 - -**当前测试太简单**: -- 仅5个文件 -- 每个查询完美对应1个文件 -- 没有歧义或相似文件 -- 两种方法都能轻松找到 - -**预期在真实场景**: -- 数百或数千个文件 -- 多个相似功能的文件 -- 模糊或概念性查询 -- LLM增强应该表现更好 - ---- - -## 🛠️ 测试基础设施 - -### 创建的文件 - -1. **测试套件** (`tests/test_llm_enhanced_search.py`) - - 550+ lines - - 完整pytest测试 - - 3个测试类 (纯向量, LLM增强, 对比) - -2. 
**独立脚本** (`scripts/compare_search_methods.py`) - - 460+ lines - - 可直接运行: `python scripts/compare_search_methods.py` - - 支持参数: `--tool gemini|qwen`, `--skip-llm` - - 详细对比报告 - -3. **完整文档** (`docs/LLM_ENHANCED_SEARCH_GUIDE.md`) - - 460+ lines - - 架构对比图 - - 设置说明 - - 使用示例 - - 故障排除 - -### 运行测试 - -```bash -# 方式1: 独立脚本 (推荐) -python scripts/compare_search_methods.py --tool gemini - -# 方式2: Pytest -pytest tests/test_llm_enhanced_search.py::TestSearchComparison::test_comparison -v -s - -# 跳过LLM测试 (仅测试纯向量) -python scripts/compare_search_methods.py --skip-llm -``` - -### 前置要求 - -```bash -# 1. 安装语义搜索依赖 -pip install codexlens[semantic] - -# 2. 安装CCW CLI -npm install -g ccw - -# 3. 配置API密钥 -ccw config set gemini.apiKey YOUR_API_KEY -``` - ---- - -## 🔍 架构对比 - -### 纯向量搜索流程 - -``` -代码文件 → 分块 → fastembed (768维) → semantic_chunks表 → 向量搜索 -``` - -**优点**: 快速、无需外部依赖、直接嵌入代码 -**缺点**: 对自然语言查询理解较弱 - -### LLM增强搜索流程 - -``` -代码文件 → CCW CLI调用Gemini → 生成摘要+关键词 → fastembed (768维) → semantic_chunks表 → 向量搜索 -``` - -**优点**: 更好的语义理解、适合自然语言查询 -**缺点**: 索引慢75倍、需要LLM API、有成本 - ---- - -## 💰 成本估算 - -### Gemini Flash (via CCW) - -- 价格: ~$0.10 / 1M input tokens -- 平均: ~500 tokens / 文件 -- 100文件成本: ~$0.005 (半分钱) - -### Qwen (本地) - -- 价格: 免费 (本地运行) -- 速度: 比Gemini Flash慢 - ---- - -## 📝 修复的问题 - -### 1. Unicode编码问题 - -**问题**: Windows GBK控制台无法显示Unicode符号 (✓, ✗, •) -**修复**: 替换为ASCII符号 ([OK], [X], -) - -**影响文件**: -- `scripts/compare_search_methods.py` -- `tests/test_llm_enhanced_search.py` - -### 2. 数据库文件锁定 - -**问题**: Windows无法删除临时数据库 (PermissionError) -**修复**: 添加垃圾回收和异常处理 - -```python -import gc -gc.collect() # 强制关闭连接 -time.sleep(0.1) # 等待Windows释放文件句柄 -``` - -### 3. 正则表达式警告 - -**问题**: SyntaxWarning about invalid escape sequence `\.` -**状态**: 无害警告,正则表达式正常工作 - ---- - -## 🎯 结论和建议 - -### 核心发现 - -1. ✅ **LLM语义增强功能已验证可用** -2. ✅ **测试基础设施完整** -3. 
⚠️ **测试数据集需扩展** (当前太简单) - -### 使用建议 - -| 场景 | 推荐方案 | -|------|---------| -| 代码模式搜索 | 纯向量 (如 "find all REST endpoints") | -| 自然语言查询 | LLM增强 (如 "how to authenticate users") | -| 大型代码库 | 纯向量优先,重要模块用LLM | -| 个人项目 | LLM增强 (成本可忽略) | -| 企业级应用 | 混合方案 | - -### 后续工作 (可选) - -- [ ] 使用更大的测试数据集 (100+ files) -- [ ] 测试更复杂的查询 (概念性、模糊查询) -- [ ] 性能优化 (批量LLM调用) -- [ ] 成本优化 (缓存LLM摘要) -- [ ] 混合搜索 (结合两种方法) - ---- - -**完成时间**: 2025-12-16 -**测试执行者**: Claude (Sonnet 4.5) -**文档版本**: 1.0 diff --git a/codex-lens/docs/LLM_REMOVAL_SUMMARY.md b/codex-lens/docs/LLM_REMOVAL_SUMMARY.md new file mode 100644 index 00000000..30b090d0 --- /dev/null +++ b/codex-lens/docs/LLM_REMOVAL_SUMMARY.md @@ -0,0 +1,342 @@ +# LLM增强功能移除总结 + +**移除日期**: 2025-12-16 +**执行者**: 用户请求 +**状态**: ✅ 完成 + +--- + +## 📋 移除清单 + +### ✅ 已删除的源代码文件 + +| 文件 | 说明 | +|------|------| +| `src/codexlens/semantic/llm_enhancer.py` | LLM增强核心模块 (900+ lines) | + +### ✅ 已修改的源代码文件 + +| 文件 | 修改内容 | +|------|---------| +| `src/codexlens/cli/commands.py` | 删除 `enhance` 命令 (lines 1050-1227) | +| `src/codexlens/semantic/__init__.py` | 删除LLM相关导出 (lines 35-69) | + +### ✅ 已修改的前端文件(CCW Dashboard) + +| 文件 | 修改内容 | +|------|---------| +| `ccw/src/templates/dashboard-js/components/cli-status.js` | 删除LLM增强设置 (8行)、Semantic Settings Modal (615行)、Metadata Viewer (326行) | +| `ccw/src/templates/dashboard-js/i18n.js` | 删除英文LLM翻译 (26行)、中文LLM翻译 (26行) | +| `ccw/src/templates/dashboard-js/views/cli-manager.js` | 移除LLM badge和设置modal调用 (3行) | + +### ✅ 已删除的测试文件 + +| 文件 | 说明 | +|------|------| +| `tests/test_llm_enhancer.py` | LLM增强单元测试 | +| `tests/test_llm_enhanced_search.py` | LLM vs 纯向量对比测试 (550+ lines) | + +### ✅ 已删除的脚本文件 + +| 文件 | 说明 | +|------|------| +| `scripts/compare_search_methods.py` | 纯向量 vs LLM增强对比脚本 (460+ lines) | +| `scripts/test_misleading_comments.py` | 误导性注释测试脚本 (490+ lines) | +| `scripts/show_llm_analysis.py` | LLM分析展示工具 | +| `scripts/inspect_llm_summaries.py` | LLM摘要检查工具 | + +### ✅ 已删除的文档文件 + +| 文件 | 说明 | +|------|------| +| 
`docs/LLM_ENHANCED_SEARCH_GUIDE.md` | LLM增强使用指南 (460+ lines) | +| `docs/LLM_ENHANCEMENT_TEST_RESULTS.md` | LLM测试结果文档 | +| `docs/MISLEADING_COMMENTS_TEST_RESULTS.md` | 误导性注释测试结果 | +| `docs/CLI_INTEGRATION_SUMMARY.md` | CLI集成文档(包含enhance命令) | +| `docs/DOCSTRING_LLM_HYBRID_DESIGN.md` | Docstring与LLM混合策略设计 | + +### ✅ 已更新的文档 + +| 文件 | 修改内容 | +|------|---------| +| `docs/IMPLEMENTATION_SUMMARY.md` | 添加LLM移除说明,列出已删除内容 | + +### 📚 保留的设计文档(作为历史参考) + +| 文件 | 说明 | +|------|------| +| `docs/DESIGN_EVALUATION_REPORT.md` | 包含LLM混合策略的技术评估报告 | +| `docs/SEMANTIC_GRAPH_DESIGN.md` | 语义图谱设计(可能提及LLM) | +| `docs/MULTILEVEL_CHUNKER_DESIGN.md` | 多层次分词器设计(可能提及LLM) | + +*这些文档保留作为技术历史参考,不影响当前功能。* + +--- + +## 🔒 移除的功能 + +### CLI命令 + +```bash +# 已移除 - 不再可用 +codexlens enhance [PATH] --tool gemini --batch-size 5 + +# 说明:此命令用于通过CCW CLI调用Gemini/Qwen生成代码摘要 +# 移除原因:减少外部依赖,简化维护 +``` + +### Python API + +```python +# 已移除 - 不再可用 +from codexlens.semantic import ( + LLMEnhancer, + LLMConfig, + SemanticMetadata, + FileData, + EnhancedSemanticIndexer, + create_enhancer, + create_enhanced_indexer, +) + +# 移除的类和函数: +# - LLMEnhancer: LLM增强器主类 +# - LLMConfig: LLM配置类 +# - SemanticMetadata: 语义元数据结构 +# - FileData: 文件数据结构 +# - EnhancedSemanticIndexer: LLM增强索引器 +# - create_enhancer(): 创建增强器的工厂函数 +# - create_enhanced_indexer(): 创建增强索引器的工厂函数 +``` + +--- + +## ✅ 保留的功能 + +### 完全保留的核心功能 + +| 功能 | 状态 | +|------|------| +| **纯向量搜索** | ✅ 完整保留 | +| **语义嵌入生成** | ✅ 完整保留 (`codexlens embeddings-generate`) | +| **语义嵌入状态检查** | ✅ 完整保留 (`codexlens embeddings-status`) | +| **混合搜索引擎** | ✅ 完整保留(exact + fuzzy + vector) | +| **向量存储** | ✅ 完整保留 | +| **语义分块** | ✅ 完整保留 | +| **fastembed集成** | ✅ 完整保留 | + +### 可用的CLI命令 + +```bash +# 生成纯向量嵌入(无需LLM) +codexlens embeddings-generate [PATH] + +# 检查嵌入状态 +codexlens embeddings-status [PATH] + +# 所有搜索命令 +codexlens search [QUERY] --index [PATH] + +# 所有索引管理命令 +codexlens init [PATH] +codexlens update [PATH] +codexlens clean [PATH] +``` + +### 可用的Python API + +```python +# 完全可用 - 纯向量搜索 +from 
codexlens.semantic import SEMANTIC_AVAILABLE, SEMANTIC_BACKEND +from codexlens.semantic.embedder import Embedder +from codexlens.semantic.vector_store import VectorStore +from codexlens.semantic.chunker import Chunker, ChunkConfig +from codexlens.search.hybrid_search import HybridSearchEngine + +# 示例:纯向量搜索 +engine = HybridSearchEngine() +results = engine.search( + index_path, + query="your search query", + enable_vector=True, + pure_vector=True, # 纯向量模式 +) +``` + +--- + +## 🎯 移除原因 + +### 1. 简化依赖 + +**移除的外部依赖**: +- CCW CLI (npm package) +- Gemini API (需要API密钥) +- Qwen API (可选) + +**保留的依赖**: +- fastembed (ONNX-based,轻量级) +- numpy +- Python标准库 + +### 2. 减少复杂性 + +- **前**: 两种搜索方式(纯向量 + LLM增强) +- **后**: 一种搜索方式(纯向量) +- 移除了900+ lines的LLM增强代码 +- 移除了CLI命令和相关配置 +- 移除了测试和文档 + +### 3. 性能考虑 + +| 方面 | LLM增强 | 纯向量 | +|------|---------|--------| +| **索引速度** | 慢75倍 | 基准 | +| **查询速度** | 相同 | 相同 | +| **准确率** | 相同* | 基准 | +| **成本** | API费用 | 免费 | + +*在测试数据集上准确率相同(5/5),但LLM增强理论上在更复杂场景下可能更好 + +### 4. 维护负担 + +**移除前**: +- 需要维护CCW CLI集成 +- 需要处理API限流和错误 +- 需要测试多个LLM后端 +- 需要维护批处理逻辑 + +**移除后**: +- 单一嵌入引擎(fastembed) +- 无外部API依赖 +- 更简单的错误处理 +- 更容易测试 + +--- + +## 🔍 验证结果 + +### 导入测试 + +```bash +# ✅ 通过 - 语义模块正常 +python -c "from codexlens.semantic import SEMANTIC_AVAILABLE; print(SEMANTIC_AVAILABLE)" +# Output: True + +# ✅ 通过 - 搜索引擎正常 +python -c "from codexlens.search.hybrid_search import HybridSearchEngine; print('OK')" +# Output: OK +``` + +### 代码清洁度验证 + +```bash +# ✅ 通过 - 无遗留LLM引用 +grep -r "llm_enhancer\|LLMEnhancer\|LLMConfig" src/ --include="*.py" +# Output: (空) +``` + +### 测试结果 + +```bash +# ✅ 5/7通过 - 纯向量搜索基本功能正常 +pytest tests/test_pure_vector_search.py -v +# 通过: 5个基本测试 +# 失败: 2个嵌入测试(已知的模型维度不匹配问题,与LLM移除无关) +``` + +--- + +## 📊 统计 + +### 代码删除统计 + +| 类型 | 删除文件数 | 删除行数(估计) | +|------|-----------|-----------------| +| **源代码** | 1 | ~900 lines | +| **CLI命令** | 1 command | ~180 lines | +| **导出清理** | 1 section | ~35 lines | +| **前端代码** | 3 files | ~1000 lines | +| **测试文件** | 2 | ~600 lines | +| 
**脚本工具** | 4 | ~1500 lines | +| **文档** | 5 | ~2000 lines | +| **总计** | 17 files/sections | ~6200 lines | + +### 依赖简化 + +| 方面 | 移除前 | 移除后 | +|------|--------|--------| +| **外部工具依赖** | CCW CLI, Gemini/Qwen | 无 | +| **Python包依赖** | fastembed, numpy | fastembed, numpy | +| **API依赖** | Gemini/Qwen API | 无 | +| **配置复杂度** | 高(tool, batch_size, API keys) | 低(model profile) | + +--- + +## 🚀 后续建议 + +### 如果需要LLM增强功能 + +1. **从git历史恢复** + ```bash + # 查看删除前的提交 + git log --all --full-history -- "*llm_enhancer*" + + # 恢复特定文件(将 COMMIT_HASH 替换为删除之前的提交哈希) + git checkout COMMIT_HASH -- src/codexlens/semantic/llm_enhancer.py + ``` + +2. **或使用外部工具** + - 在索引前使用独立脚本生成摘要 + - 将摘要作为注释添加到代码中 + - 然后使用纯向量索引(会包含摘要) + +3. **或考虑轻量级替代方案** + - 使用本地小模型(llama.cpp, ggml) + - 使用docstring提取(无需LLM) + - 使用静态分析生成摘要 + +### 代码库维护建议 + +1. ✅ **保持简单** - 继续使用纯向量搜索 +2. ✅ **优化现有功能** - 改进向量搜索准确性 +3. ✅ **增量改进** - 优化分块策略和嵌入质量 +4. ⚠️ **避免重复** - 如需LLM,先评估是否真正必要 + +--- + +## 📝 文件清单 + +### 删除的文件完整列表 + +``` +src/codexlens/semantic/llm_enhancer.py +tests/test_llm_enhancer.py +tests/test_llm_enhanced_search.py +scripts/compare_search_methods.py +scripts/test_misleading_comments.py +scripts/show_llm_analysis.py +scripts/inspect_llm_summaries.py +docs/LLM_ENHANCED_SEARCH_GUIDE.md +docs/LLM_ENHANCEMENT_TEST_RESULTS.md +docs/MISLEADING_COMMENTS_TEST_RESULTS.md +docs/CLI_INTEGRATION_SUMMARY.md +docs/DOCSTRING_LLM_HYBRID_DESIGN.md +``` + +### 修改的文件 + +``` +src/codexlens/cli/commands.py (删除enhance命令) +src/codexlens/semantic/__init__.py (删除LLM导出) +ccw/src/templates/dashboard-js/components/cli-status.js (删除LLM配置、Settings Modal、Metadata Viewer) +ccw/src/templates/dashboard-js/i18n.js (删除LLM翻译字符串) +ccw/src/templates/dashboard-js/views/cli-manager.js (移除LLM badge和modal调用) +docs/IMPLEMENTATION_SUMMARY.md (添加移除说明) +``` + +--- + +**移除完成时间**: 2025-12-16 +**文档版本**: 1.0 +**验证状态**: ✅ 通过 diff --git a/codex-lens/docs/MISLEADING_COMMENTS_TEST_RESULTS.md b/codex-lens/docs/MISLEADING_COMMENTS_TEST_RESULTS.md deleted file mode 100644 index c2fc8afa..00000000 --- 
a/codex-lens/docs/MISLEADING_COMMENTS_TEST_RESULTS.md +++ /dev/null @@ -1,301 +0,0 @@ -# 误导性注释测试结果 - -**测试日期**: 2025-12-16 -**测试目的**: 验证LLM增强搜索是否能克服错误/缺失的代码注释 - ---- - -## 📊 测试结果总结 - -### 性能对比 - -| 方法 | 索引时间 | 准确率 | 得分 | 结论 | -|------|---------|--------|------|------| -| **纯向量搜索** | 2.1秒 | 5/5 (100%) | 15/15 | ✅ 未被误导性注释影响 | -| **LLM增强搜索** | 103.7秒 | 5/5 (100%) | 15/15 | ✅ 正确识别实际功能 | - -**结论**: 平局 - 两种方法都能正确处理误导性注释 - ---- - -## 🧪 测试数据集设计 - -### 误导性代码样本 (5个文件) - -| 文件 | 错误注释 | 实际功能 | 误导程度 | -|------|---------|---------|---------| -| `crypto/hasher.py` | "Simple string utilities" | bcrypt密码哈希 | 高 | -| `auth/token.py` | 无注释,模糊函数名 | JWT令牌生成 | 中 | -| `api/handlers.py` | "Database utilities", 反向docstrings | REST API用户管理 | 极高 | -| `utils/checker.py` | "Math calculation functions" | 邮箱地址验证 | 高 | -| `db/pool.py` | "Email sending service" | PostgreSQL连接池 | 极高 | - -### 具体误导示例 - -#### 示例 1: 完全错误的模块描述 - -```python -"""Email sending service.""" # 错误! -import psycopg2 # 实际是数据库库 -from psycopg2 import pool - -class EmailSender: # 错误的类名 - """SMTP email sender with retry logic.""" # 错误! - - def __init__(self, min_conn: int = 1, max_conn: int = 10): - """Initialize email sender.""" # 错误! - self.pool = psycopg2.pool.SimpleConnectionPool(...) # 实际是DB连接池 -``` - -**实际功能**: PostgreSQL数据库连接池管理器 -**注释声称**: SMTP邮件发送服务 - -#### 示例 2: 反向的函数文档 - -```python -@app.route('/api/items', methods=['POST']) -def create_item(): - """Delete an existing item.""" # 完全相反! 
- data = request.get_json() - # 实际是创建新项目 - return jsonify({'item_id': item_id}), 201 -``` - -### 测试查询 (基于实际功能) - -| 查询 | 预期文件 | 查询难度 | -|------|---------|---------| -| "Hash passwords securely with bcrypt" | `crypto/hasher.py` | 高 - 注释说string utils | -| "Generate JWT authentication token" | `auth/token.py` | 中 - 无注释 | -| "Create user account REST API endpoint" | `api/handlers.py` | 高 - 注释说database | -| "Validate email address format" | `utils/checker.py` | 高 - 注释说math | -| "PostgreSQL database connection pool" | `db/pool.py` | 极高 - 注释说email | - ---- - -## 🔍 LLM分析能力验证 - -### 直接测试: LLM如何理解误导性代码 - -**测试代码**: `db/pool.py` (声称是"Email sending service") - -**Gemini分析结果**: - -``` -Summary: This Python module defines an `EmailSender` class that manages -a PostgreSQL connection pool for an email sending service, using -`psycopg2` for database interactions. It provides a context manager -`send_email` to handle connection acquisition, transaction commitment, -and release back to the pool. - -Purpose: data - -Keywords: psycopg2, connection pool, PostgreSQL, database, email sender, -context manager, python, database connection, transaction -``` - -**分析得分**: -- ✅ **正确识别的术语** (5/5): PostgreSQL, connection pool, database, psycopg2, database connection -- ⚠️ **误导性术语** (2/3): email sender, email sending service (但上下文正确) - -**结论**: LLM正确识别了实际功能(PostgreSQL connection pool),虽然摘要开头提到了错误的module docstring,但核心描述准确。 - ---- - -## 💡 关键发现 - -### 1. 为什么纯向量搜索也能工作? - -**原因**: 代码中的技术关键词权重高于注释 - -```python -# 这些强信号即使有错误注释也能正确匹配 -import bcrypt # 强信号: 密码哈希 -import jwt # 强信号: JWT令牌 -import psycopg2 # 强信号: PostgreSQL -from flask import Flask, request # 强信号: REST API -pattern = r'^[a-zA-Z0-9._%+-]+@' # 强信号: 邮箱验证 -``` - -**嵌入模型的优势**: -- 代码标识符(bcrypt, jwt, psycopg2)具有高度特异性 -- import语句权重高 -- 正则表达式模式具有语义信息 -- 框架API调用(Flask路由)提供明确上下文 - -### 2. LLM增强的价值 - -**LLM分析过程**: -1. ✅ 读取代码逻辑(不仅仅是注释) -2. ✅ 识别import语句和实际使用 -3. ✅ 理解代码流程和数据流 -4. ✅ 生成基于行为的摘要 -5. 
⚠️ 部分参考错误注释(但不完全依赖) - -**示例对比**: - -| 方面 | 纯向量 | LLM增强 | -|------|--------|---------| -| **处理内容** | 代码 + 注释 (整体嵌入) | 代码分析 → 生成摘要 | -| **误导性注释影响** | 低 (代码关键词权重高) | 极低 (理解代码逻辑) | -| **自然语言查询** | 依赖代码词汇匹配 | 理解语义意图 | -| **处理速度** | 快 (2秒) | 慢 (104秒, 52倍差) | - -### 3. 测试数据集的局限性 - -**为什么两种方法都表现完美**: - -1. **文件数量太少** (5个文件) - - 没有相似功能的文件竞争 - - 每个查询有唯一的目标文件 - -2. **代码关键词太强** - - bcrypt → 唯一用于密码 - - jwt → 唯一用于令牌 - - Flask+@app.route → 唯一的API - - psycopg2 → 唯一的数据库 - -3. **查询过于具体** - - "bcrypt password hashing" 直接匹配代码关键词 - - 不是概念性或模糊查询 - -**理想的测试场景**: -- ❌ 5个唯一功能文件 -- ✅ 100+文件,多个相似功能模块 -- ✅ 模糊概念查询: "用户认证"而不是"bcrypt hash" -- ✅ 没有明显关键词的业务逻辑代码 - ---- - -## 🎯 实际应用建议 - -### 何时使用纯向量搜索 - -✅ **推荐场景**: -- 代码库有良好文档 -- 搜索代码模式和API使用 -- 已知技术栈关键词 -- 需要快速索引 - -**示例查询**: -- "bcrypt.hashpw usage" -- "Flask @app.route GET method" -- "jwt.encode algorithm" - -### 何时使用LLM增强搜索 - -✅ **推荐场景**: -- 代码库文档缺失或过时 -- 自然语言概念性查询 -- 业务逻辑搜索 -- 重视搜索准确性 > 索引速度 - -**示例查询**: -- "How to authenticate users?" (概念性) -- "Payment processing workflow" (业务逻辑) -- "Error handling for API requests" (模式搜索) - -### 混合策略 (推荐) - -| 模块类型 | 索引方式 | 原因 | -|---------|---------|------| -| **核心业务逻辑** | LLM增强 | 复杂逻辑,文档可能不完整 | -| **工具函数** | 纯向量 | 代码清晰,关键词明确 | -| **第三方集成** | 纯向量 | API调用已是最好描述 | -| **遗留代码** | LLM增强 | 文档陈旧或缺失 | - ---- - -## 📈 性能与成本 - -### 时间成本 - -| 操作 | 纯向量 | LLM增强 | 差异 | -|------|--------|---------|------| -| **索引5文件** | 2.1秒 | 103.7秒 | 49倍慢 | -| **索引100文件** | ~42秒 | ~35分钟 | ~50倍慢 | -| **查询速度** | ~50ms | ~50ms | 相同 | - -### 金钱成本 (Gemini Flash) - -- **价格**: $0.10 / 1M input tokens -- **平均**: ~500 tokens / 文件 -- **100文件**: $0.005 (半分钱) -- **1000文件**: $0.05 (5分钱) - -**结论**: 金钱成本可忽略,时间成本是主要考虑因素 - ---- - -## 🧪 测试工具 - -### 创建的脚本 - -1. **`scripts/test_misleading_comments.py`** - - 完整对比测试 - - 支持 `--tool gemini|qwen` - - 支持 `--keep-db` 保存结果数据库 - -2. **`scripts/show_llm_analysis.py`** - - 直接显示LLM对单个文件的分析 - - 评估LLM是否被误导 - - 计算正确/误导术语比例 - -3. 
**`scripts/inspect_llm_summaries.py`** - - 检查数据库中的LLM摘要 - - 查看metadata和keywords - -### 运行测试 - -```bash -# 完整对比测试 -python scripts/test_misleading_comments.py --tool gemini - -# 保存数据库用于检查 -python scripts/test_misleading_comments.py --keep-db ./results.db - -# 查看LLM对单个文件的分析 -python scripts/show_llm_analysis.py - -# 检查数据库中的摘要 -python scripts/inspect_llm_summaries.py results.db -``` - ---- - -## 📝 结论 - -### 测试结论 - -1. ✅ **LLM能够克服误导性注释** - - 正确识别实际代码功能 - - 生成基于行为的准确摘要 - - 不完全依赖文档字符串 - -2. ✅ **纯向量搜索也具有抗干扰能力** - - 代码关键词提供强信号 - - 技术栈名称具有高特异性 - - import语句和API调用信息丰富 - -3. ⚠️ **当前测试数据集太简单** - - 需要更大规模测试 (100+文件) - - 需要概念性查询测试 - - 需要相似功能模块对比 - -### 生产使用建议 - -**最佳实践**: 根据代码库特征选择策略 - -| 代码库特征 | 推荐方案 | 理由 | -|-----------|---------|------| -| 良好文档,清晰命名 | 纯向量 | 快速,成本低 | -| 文档缺失/陈旧 | LLM增强 | 理解代码逻辑 | -| 遗留系统 | LLM增强 | 克服历史包袱 | -| 新项目 | 纯向量 | 现代代码通常更清晰 | -| 大型企业代码库 | 混合 | 分模块策略 | - ---- - -**测试完成时间**: 2025-12-16 -**测试工具**: Gemini Flash 2.5, fastembed (BAAI/bge-small-en-v1.5) -**文档版本**: 1.0 diff --git a/codex-lens/scripts/compare_search_methods.py b/codex-lens/scripts/compare_search_methods.py deleted file mode 100644 index 12f39c11..00000000 --- a/codex-lens/scripts/compare_search_methods.py +++ /dev/null @@ -1,465 +0,0 @@ -#!/usr/bin/env python3 -"""Standalone script to compare pure vector vs LLM-enhanced semantic search. - -Usage: - python compare_search_methods.py [--tool gemini|qwen] [--skip-llm] - -This script: -1. Creates a test dataset with sample code -2. Tests pure vector search (code → fastembed → search) -3. Tests LLM-enhanced search (code → LLM summary → fastembed → search) -4. 
Compares results across natural language queries -""" - -import argparse -import sqlite3 -import sys -import tempfile -import time -from pathlib import Path -from typing import Dict, List, Tuple - -# Check dependencies -try: - from codexlens.semantic import SEMANTIC_AVAILABLE - from codexlens.semantic.embedder import Embedder - from codexlens.semantic.vector_store import VectorStore - from codexlens.semantic.chunker import Chunker, ChunkConfig - from codexlens.semantic.llm_enhancer import ( - LLMEnhancer, - LLMConfig, - FileData, - EnhancedSemanticIndexer, - ) - from codexlens.storage.dir_index import DirIndexStore - from codexlens.search.hybrid_search import HybridSearchEngine -except ImportError as e: - print(f"Error: Missing dependencies - {e}") - print("Install with: pip install codexlens[semantic]") - sys.exit(1) - -if not SEMANTIC_AVAILABLE: - print("Error: Semantic search dependencies not available") - print("Install with: pip install codexlens[semantic]") - sys.exit(1) - - -# Test dataset with realistic code samples -TEST_DATASET = { - "auth/password_hasher.py": '''"""Password hashing utilities using bcrypt.""" -import bcrypt - -def hash_password(password: str, salt_rounds: int = 12) -> str: - """Hash a password using bcrypt with specified salt rounds.""" - salt = bcrypt.gensalt(rounds=salt_rounds) - hashed = bcrypt.hashpw(password.encode('utf-8'), salt) - return hashed.decode('utf-8') - -def verify_password(password: str, hashed: str) -> bool: - """Verify a password against its hash.""" - return bcrypt.checkpw(password.encode('utf-8'), hashed.encode('utf-8')) -''', - - "auth/jwt_handler.py": '''"""JWT token generation and validation.""" -import jwt -from datetime import datetime, timedelta - -SECRET_KEY = "your-secret-key" - -def create_token(user_id: int, expires_in: int = 3600) -> str: - """Generate a JWT access token for user authentication.""" - payload = { - 'user_id': user_id, - 'exp': datetime.utcnow() + timedelta(seconds=expires_in), - 'iat': 
datetime.utcnow() - } - return jwt.encode(payload, SECRET_KEY, algorithm='HS256') - -def decode_token(token: str) -> dict: - """Validate and decode JWT token.""" - try: - return jwt.decode(token, SECRET_KEY, algorithms=['HS256']) - except jwt.ExpiredSignatureError: - return None -''', - - "api/user_endpoints.py": '''"""REST API endpoints for user management.""" -from flask import Flask, request, jsonify - -app = Flask(__name__) - -@app.route('/api/users', methods=['POST']) -def create_user(): - """Create a new user account with email and password.""" - data = request.get_json() - if not data.get('email') or not data.get('password'): - return jsonify({'error': 'Email and password required'}), 400 - user_id = 12345 # Database insert - return jsonify({'user_id': user_id, 'success': True}), 201 - -@app.route('/api/users/', methods=['GET']) -def get_user(user_id: int): - """Retrieve user profile information by user ID.""" - user = { - 'id': user_id, - 'email': 'user@example.com', - 'name': 'John Doe' - } - return jsonify(user), 200 -''', - - "utils/validation.py": '''"""Input validation utilities.""" -import re - -def validate_email(email: str) -> bool: - """Check if email address format is valid using regex.""" - pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$' - return bool(re.match(pattern, email)) - -def sanitize_input(text: str, max_length: int = 255) -> str: - """Clean user input by removing special characters.""" - text = re.sub(r'[<>\"\'&]', '', text) - return text.strip()[:max_length] - -def validate_password_strength(password: str) -> tuple: - """Validate password meets security requirements.""" - if len(password) < 8: - return False, "Password must be at least 8 characters" - if not re.search(r'[A-Z]', password): - return False, "Must contain uppercase letter" - return True, None -''', - - "database/connection.py": '''"""Database connection pooling.""" -import psycopg2 -from psycopg2 import pool -from contextlib import contextmanager - -class 
DatabasePool: - """PostgreSQL connection pool manager.""" - - def __init__(self, min_conn: int = 1, max_conn: int = 10): - """Initialize database connection pool.""" - self.pool = psycopg2.pool.SimpleConnectionPool( - min_conn, max_conn, - user='dbuser', host='localhost', database='myapp' - ) - - @contextmanager - def get_connection(self): - """Get a connection from pool as context manager.""" - conn = self.pool.getconn() - try: - yield conn - conn.commit() - finally: - self.pool.putconn(conn) -''', -} - - -# Natural language test queries -TEST_QUERIES = [ - ("How do I securely hash passwords?", "auth/password_hasher.py"), - ("Generate JWT token for authentication", "auth/jwt_handler.py"), - ("Create new user account via API", "api/user_endpoints.py"), - ("Validate email address format", "utils/validation.py"), - ("Connect to PostgreSQL database", "database/connection.py"), -] - - -def create_test_database(db_path: Path) -> None: - """Create and populate test database.""" - store = DirIndexStore(db_path) - store.initialize() - - with store._get_connection() as conn: - for path, content in TEST_DATASET.items(): - name = path.split('/')[-1] - conn.execute( - """INSERT INTO files (name, full_path, content, language, mtime) - VALUES (?, ?, ?, ?, ?)""", - (name, path, content, "python", 0.0) - ) - conn.commit() - - store.close() - - -def test_pure_vector_search(db_path: Path) -> Dict: - """Test pure vector search (raw code embeddings).""" - print("\n" + "="*70) - print("PURE VECTOR SEARCH (Code → fastembed)") - print("="*70) - - start_time = time.time() - - # Generate pure vector embeddings - embedder = Embedder(profile="code") - vector_store = VectorStore(db_path) - chunker = Chunker(config=ChunkConfig(max_chunk_size=2000)) - - with sqlite3.connect(db_path) as conn: - conn.row_factory = sqlite3.Row - rows = conn.execute("SELECT full_path, content FROM files").fetchall() - - chunk_count = 0 - for row in rows: - chunks = chunker.chunk_sliding_window( - row["content"], - 
file_path=row["full_path"], - language="python" - ) - for chunk in chunks: - chunk.embedding = embedder.embed_single(chunk.content) - chunk.metadata["strategy"] = "pure_vector" - if chunks: - vector_store.add_chunks(chunks, row["full_path"]) - chunk_count += len(chunks) - - setup_time = time.time() - start_time - print(f"Setup: {len(rows)} files, {chunk_count} chunks in {setup_time:.1f}s") - - # Test queries - engine = HybridSearchEngine() - results = {} - - print(f"\n{'Query':<45} {'Top Result':<30} {'Score':<8}") - print("-" * 70) - - for query, expected_file in TEST_QUERIES: - search_results = engine.search( - db_path, - query, - limit=3, - enable_vector=True, - pure_vector=True, - ) - - top_file = search_results[0].path if search_results else "No results" - top_score = search_results[0].score if search_results else 0.0 - found = expected_file in [r.path for r in search_results] - rank = None - if found: - for i, r in enumerate(search_results): - if r.path == expected_file: - rank = i + 1 - break - - status = "[OK]" if found and rank == 1 else ("[~]" if found else "[X]") - display_query = query[:42] + "..." 
if len(query) > 45 else query - display_file = top_file.split('/')[-1] if '/' in top_file else top_file - - print(f"{status} {display_query:<43} {display_file:<30} {top_score:.3f}") - - results[query] = { - "found": found, - "rank": rank, - "top_file": top_file, - "score": top_score, - } - - return results - - -def test_llm_enhanced_search(db_path: Path, llm_tool: str = "gemini") -> Dict: - """Test LLM-enhanced search (LLM summaries → fastembed).""" - print("\n" + "="*70) - print(f"LLM-ENHANCED SEARCH (Code → {llm_tool.upper()} → fastembed)") - print("="*70) - - # Check CCW availability - llm_config = LLMConfig(enabled=True, tool=llm_tool, batch_size=2) - enhancer = LLMEnhancer(llm_config) - - if not enhancer.check_available(): - print("[X] CCW CLI not available - skipping LLM-enhanced test") - print(" Install CCW: npm install -g ccw") - return {} - - start_time = time.time() - - # Generate LLM-enhanced embeddings - embedder = Embedder(profile="code") - vector_store = VectorStore(db_path) - indexer = EnhancedSemanticIndexer(enhancer, embedder, vector_store) - - # Prepare file data - file_data_list = [ - FileData(path=path, content=content, language="python") - for path, content in TEST_DATASET.items() - ] - - # Index with LLM enhancement - print(f"Generating LLM summaries for {len(file_data_list)} files...") - indexed = indexer.index_files(file_data_list) - setup_time = time.time() - start_time - - print(f"Setup: {indexed}/{len(file_data_list)} files indexed in {setup_time:.1f}s") - - # Test queries - engine = HybridSearchEngine() - results = {} - - print(f"\n{'Query':<45} {'Top Result':<30} {'Score':<8}") - print("-" * 70) - - for query, expected_file in TEST_QUERIES: - search_results = engine.search( - db_path, - query, - limit=3, - enable_vector=True, - pure_vector=True, - ) - - top_file = search_results[0].path if search_results else "No results" - top_score = search_results[0].score if search_results else 0.0 - found = expected_file in [r.path for r in 
search_results] - rank = None - if found: - for i, r in enumerate(search_results): - if r.path == expected_file: - rank = i + 1 - break - - status = "[OK]" if found and rank == 1 else ("[~]" if found else "[X]") - display_query = query[:42] + "..." if len(query) > 45 else query - display_file = top_file.split('/')[-1] if '/' in top_file else top_file - - print(f"{status} {display_query:<43} {display_file:<30} {top_score:.3f}") - - results[query] = { - "found": found, - "rank": rank, - "top_file": top_file, - "score": top_score, - } - - return results - - -def compare_results(pure_results: Dict, llm_results: Dict) -> None: - """Compare and analyze results from both approaches.""" - print("\n" + "="*70) - print("COMPARISON SUMMARY") - print("="*70) - - if not llm_results: - print("Cannot compare - LLM-enhanced test was skipped") - return - - pure_score = 0 - llm_score = 0 - - print(f"\n{'Query':<45} {'Pure':<10} {'LLM':<10}") - print("-" * 70) - - for query, expected_file in TEST_QUERIES: - pure_res = pure_results.get(query, {}) - llm_res = llm_results.get(query, {}) - - pure_status = f"[OK] Rank {pure_res.get('rank', '?')}" if pure_res.get('found') else "[X] Miss" - llm_status = f"[OK] Rank {llm_res.get('rank', '?')}" if llm_res.get('found') else "[X] Miss" - - # Scoring: Rank 1 = 3 points, Rank 2 = 2 points, Rank 3 = 1 point - if pure_res.get('found') and pure_res.get('rank'): - pure_score += max(0, 4 - pure_res['rank']) - if llm_res.get('found') and llm_res.get('rank'): - llm_score += max(0, 4 - llm_res['rank']) - - display_query = query[:42] + "..." 
if len(query) > 45 else query - print(f"{display_query:<45} {pure_status:<10} {llm_status:<10}") - - print("-" * 70) - print(f"{'TOTAL SCORE':<45} {pure_score:<10} {llm_score:<10}") - print("="*70) - - # Analysis - print("\nANALYSIS:") - if llm_score > pure_score: - improvement = ((llm_score - pure_score) / max(pure_score, 1)) * 100 - print(f"[OK] LLM enhancement improves results by {improvement:.1f}%") - print(" Natural language summaries match queries better than raw code") - elif pure_score > llm_score: - degradation = ((pure_score - llm_score) / max(pure_score, 1)) * 100 - print(f"[X] Pure vector performed {degradation:.1f}% better") - print(" LLM summaries may be too generic or missing key details") - else: - print("= Both approaches performed equally on this test set") - - print("\nKEY FINDINGS:") - print("- Pure Vector: Direct code embeddings, fast but may miss semantic intent") - print("- LLM Enhanced: Natural language summaries, better for human-like queries") - print("- Best Use: Combine both - LLM for natural language, vector for code patterns") - - -def main(): - parser = argparse.ArgumentParser( - description="Compare pure vector vs LLM-enhanced semantic search" - ) - parser.add_argument( - "--tool", - choices=["gemini", "qwen"], - default="gemini", - help="LLM tool to use for enhancement (default: gemini)" - ) - parser.add_argument( - "--skip-llm", - action="store_true", - help="Skip LLM-enhanced test (only run pure vector)" - ) - args = parser.parse_args() - - print("\n" + "="*70) - print("SEMANTIC SEARCH COMPARISON TEST") - print("Pure Vector vs LLM-Enhanced Vector Search") - print("="*70) - - # Create test database - with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: - db_path = Path(f.name) - - try: - print(f"\nTest dataset: {len(TEST_DATASET)} Python files") - print(f"Test queries: {len(TEST_QUERIES)} natural language questions") - - create_test_database(db_path) - - # Test pure vector search - pure_results = 
test_pure_vector_search(db_path) - - # Test LLM-enhanced search - if not args.skip_llm: - # Clear semantic_chunks table for LLM test - with sqlite3.connect(db_path) as conn: - conn.execute("DELETE FROM semantic_chunks") - conn.commit() - - llm_results = test_llm_enhanced_search(db_path, args.tool) - else: - llm_results = {} - print("\n[X] LLM-enhanced test skipped (--skip-llm flag)") - - # Compare results - compare_results(pure_results, llm_results) - - finally: - # Cleanup - ensure all connections are closed - try: - import gc - gc.collect() # Force garbage collection to close any lingering connections - time.sleep(0.1) # Small delay for Windows to release file handle - if db_path.exists(): - db_path.unlink() - except PermissionError: - print(f"\nWarning: Could not delete temporary database: {db_path}") - print("It will be cleaned up on next system restart.") - - print("\n" + "="*70) - print("Test completed successfully!") - print("="*70) - - -if __name__ == "__main__": - main() diff --git a/codex-lens/scripts/inspect_llm_summaries.py b/codex-lens/scripts/inspect_llm_summaries.py deleted file mode 100644 index 26f46a6c..00000000 --- a/codex-lens/scripts/inspect_llm_summaries.py +++ /dev/null @@ -1,88 +0,0 @@ -#!/usr/bin/env python3 -"""Inspect LLM-generated summaries in semantic_chunks table.""" - -import sqlite3 -import sys -from pathlib import Path - -def inspect_summaries(db_path: Path): - """Show LLM-generated summaries from database.""" - if not db_path.exists(): - print(f"Error: Database not found: {db_path}") - return - - with sqlite3.connect(db_path) as conn: - conn.row_factory = sqlite3.Row - - # Check if semantic_chunks table exists - cursor = conn.execute( - "SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_chunks'" - ) - if not cursor.fetchone(): - print("No semantic_chunks table found") - return - - # Get all chunks with metadata - cursor = conn.execute(""" - SELECT file_path, chunk_index, content, - json_extract(metadata, 
'$.llm_summary') as summary, - json_extract(metadata, '$.llm_keywords') as keywords, - json_extract(metadata, '$.llm_purpose') as purpose, - json_extract(metadata, '$.strategy') as strategy - FROM semantic_chunks - ORDER BY file_path, chunk_index - """) - - chunks = cursor.fetchall() - - if not chunks: - print("No chunks found in database") - return - - print("="*80) - print("LLM-GENERATED SUMMARIES INSPECTION") - print("="*80) - - current_file = None - for chunk in chunks: - file_path = chunk['file_path'] - - if file_path != current_file: - print(f"\n{'='*80}") - print(f"FILE: {file_path}") - print(f"{'='*80}") - current_file = file_path - - print(f"\n[Chunk {chunk['chunk_index']}]") - print(f"Strategy: {chunk['strategy']}") - - if chunk['summary']: - print(f"\nLLM Summary:") - print(f" {chunk['summary']}") - - if chunk['keywords']: - print(f"\nKeywords:") - print(f" {chunk['keywords']}") - - if chunk['purpose']: - print(f"\nPurpose:") - print(f" {chunk['purpose']}") - - # Show first 200 chars of content - content = chunk['content'] - if len(content) > 200: - content = content[:200] + "..." 
- print(f"\nOriginal Content (first 200 chars):") - print(f" {content}") - print("-" * 80) - - -if __name__ == "__main__": - if len(sys.argv) < 2: - print("Usage: python inspect_llm_summaries.py ") - print("\nExample:") - print(" python inspect_llm_summaries.py ~/.codexlens/indexes/myproject/_index.db") - sys.exit(1) - - db_path = Path(sys.argv[1]) - inspect_summaries(db_path) diff --git a/codex-lens/scripts/show_llm_analysis.py b/codex-lens/scripts/show_llm_analysis.py deleted file mode 100644 index 066ce49f..00000000 --- a/codex-lens/scripts/show_llm_analysis.py +++ /dev/null @@ -1,112 +0,0 @@ -#!/usr/bin/env python3 -"""Directly show LLM analysis of test code.""" - -from codexlens.semantic.llm_enhancer import LLMEnhancer, LLMConfig, FileData - -# Misleading code example -TEST_CODE = '''"""Email sending service.""" -import psycopg2 -from psycopg2 import pool -from contextlib import contextmanager - -class EmailSender: - """SMTP email sender with retry logic.""" - - def __init__(self, min_conn: int = 1, max_conn: int = 10): - """Initialize email sender.""" - self.pool = psycopg2.pool.SimpleConnectionPool( - min_conn, max_conn, - user='dbuser', host='localhost', database='myapp' - ) - - @contextmanager - def send_email(self): - """Send email message.""" - conn = self.pool.getconn() - try: - yield conn - conn.commit() - finally: - self.pool.putconn(conn) -''' - -print("="*80) -print("LLM ANALYSIS OF MISLEADING CODE") -print("="*80) - -print("\n[Original Code with Misleading Comments]") -print("-"*80) -print(TEST_CODE) -print("-"*80) - -print("\n[Actual Functionality]") -print(" - Imports: psycopg2 (PostgreSQL library)") -print(" - Class: EmailSender (but name is misleading!)") -print(" - Actually: Creates PostgreSQL connection pool") -print(" - Methods: send_email (actually gets DB connection)") - -print("\n[Misleading Documentation]") -print(" - Module docstring: 'Email sending service' (WRONG)") -print(" - Class docstring: 'SMTP email sender' (WRONG)") -print(" - 
Method docstring: 'Send email message' (WRONG)") - -print("\n" + "="*80) -print("TESTING LLM UNDERSTANDING") -print("="*80) - -# Test LLM analysis -config = LLMConfig(enabled=True, tool="gemini", batch_size=1) -enhancer = LLMEnhancer(config) - -if not enhancer.check_available(): - print("\n[X] CCW CLI not available") - print("Install: npm install -g ccw") - exit(1) - -print("\n[Calling Gemini to analyze code...]") -file_data = FileData(path="db/pool.py", content=TEST_CODE, language="python") - -import tempfile -from pathlib import Path - -with tempfile.TemporaryDirectory() as tmpdir: - result = enhancer.enhance_files([file_data], Path(tmpdir)) - - if "db/pool.py" in result: - metadata = result["db/pool.py"] - - print("\n[LLM-Generated Summary]") - print("-"*80) - print(f"Summary: {metadata.summary}") - print(f"\nPurpose: {metadata.purpose}") - print(f"\nKeywords: {', '.join(metadata.keywords)}") - print("-"*80) - - print("\n[Analysis]") - # Check if LLM identified the real functionality - summary_lower = metadata.summary.lower() - keywords_lower = [k.lower() for k in metadata.keywords] - - correct_terms = ['database', 'postgresql', 'connection', 'pool', 'psycopg'] - misleading_terms = ['email', 'smtp', 'send'] - - found_correct = sum(1 for term in correct_terms - if term in summary_lower or any(term in k for k in keywords_lower)) - found_misleading = sum(1 for term in misleading_terms - if term in summary_lower or any(term in k for k in keywords_lower)) - - print(f"Correct terms found: {found_correct}/{len(correct_terms)}") - print(f"Misleading terms found: {found_misleading}/{len(misleading_terms)}") - - if found_correct > found_misleading: - print("\n[OK] LLM correctly identified actual functionality!") - print(" LLM ignored misleading comments and analyzed code behavior") - elif found_misleading > found_correct: - print("\n[X] LLM was misled by incorrect comments") - print(" LLM trusted documentation over code analysis") - else: - print("\n[~] Mixed results - 
LLM found both correct and misleading terms") - else: - print("\n[X] LLM analysis failed - no results returned") - -print("\n" + "="*80) diff --git a/codex-lens/scripts/test_misleading_comments.py b/codex-lens/scripts/test_misleading_comments.py deleted file mode 100644 index 0ac34763..00000000 --- a/codex-lens/scripts/test_misleading_comments.py +++ /dev/null @@ -1,491 +0,0 @@ -#!/usr/bin/env python3 -"""Test pure vector vs LLM-enhanced search with misleading/missing comments. - -This test demonstrates how LLM enhancement can overcome: -1. Missing comments/docstrings -2. Misleading or incorrect comments -3. Outdated documentation - -Usage: - python test_misleading_comments.py --tool gemini -""" - -import argparse -import sqlite3 -import sys -import tempfile -import time -from pathlib import Path -from typing import Dict, List - -# Check dependencies -try: - from codexlens.semantic import SEMANTIC_AVAILABLE - from codexlens.semantic.embedder import Embedder - from codexlens.semantic.vector_store import VectorStore - from codexlens.semantic.chunker import Chunker, ChunkConfig - from codexlens.semantic.llm_enhancer import ( - LLMEnhancer, - LLMConfig, - FileData, - EnhancedSemanticIndexer, - ) - from codexlens.storage.dir_index import DirIndexStore - from codexlens.search.hybrid_search import HybridSearchEngine -except ImportError as e: - print(f"Error: Missing dependencies - {e}") - print("Install with: pip install codexlens[semantic]") - sys.exit(1) - -if not SEMANTIC_AVAILABLE: - print("Error: Semantic search dependencies not available") - sys.exit(1) - - -# Test dataset with MISLEADING or MISSING comments -MISLEADING_DATASET = { - "crypto/hasher.py": '''"""Simple string utilities.""" -import bcrypt - -def process_string(s: str, rounds: int = 12) -> str: - """Convert string to uppercase.""" - salt = bcrypt.gensalt(rounds=rounds) - hashed = bcrypt.hashpw(s.encode('utf-8'), salt) - return hashed.decode('utf-8') - -def check_string(s: str, target: str) -> bool: - 
"""Check if two strings are equal.""" - return bcrypt.checkpw(s.encode('utf-8'), target.encode('utf-8')) -''', - - "auth/token.py": '''import jwt -from datetime import datetime, timedelta - -SECRET_KEY = "key123" - -def make_thing(uid: int, exp: int = 3600) -> str: - payload = { - 'user_id': uid, - 'exp': datetime.utcnow() + timedelta(seconds=exp), - 'iat': datetime.utcnow() - } - return jwt.encode(payload, SECRET_KEY, algorithm='HS256') - -def parse_thing(thing: str) -> dict: - try: - return jwt.decode(thing, SECRET_KEY, algorithms=['HS256']) - except jwt.ExpiredSignatureError: - return None -''', - - "api/handlers.py": '''"""Database connection utilities.""" -from flask import Flask, request, jsonify - -app = Flask(__name__) - -@app.route('/api/items', methods=['POST']) -def create_item(): - """Delete an existing item.""" - data = request.get_json() - if not data.get('email') or not data.get('password'): - return jsonify({'error': 'Missing data'}), 400 - item_id = 12345 - return jsonify({'item_id': item_id, 'success': True}), 201 - -@app.route('/api/items/', methods=['GET']) -def get_item(item_id: int): - """Update item configuration.""" - item = { - 'id': item_id, - 'email': 'user@example.com', - 'name': 'John Doe' - } - return jsonify(item), 200 -''', - - "utils/checker.py": '''"""Math calculation functions.""" -import re - -def calc_sum(email: str) -> bool: - pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$' - return bool(re.match(pattern, email)) - -def format_text(text: str, max_len: int = 255) -> str: - text = re.sub(r'[<>"\\'&]', '', text) - return text.strip()[:max_len] -''', - - "db/pool.py": '''"""Email sending service.""" -import psycopg2 -from psycopg2 import pool -from contextlib import contextmanager - -class EmailSender: - """SMTP email sender with retry logic.""" - - def __init__(self, min_conn: int = 1, max_conn: int = 10): - """Initialize email sender.""" - self.pool = psycopg2.pool.SimpleConnectionPool( - min_conn, max_conn, - 
user='dbuser', host='localhost', database='myapp' - ) - - @contextmanager - def send_email(self): - """Send email message.""" - conn = self.pool.getconn() - try: - yield conn - conn.commit() - finally: - self.pool.putconn(conn) -''', -} - - -# Test queries - natural language based on ACTUAL functionality (not misleading comments) -TEST_QUERIES = [ - ("How to hash passwords securely with bcrypt?", "crypto/hasher.py"), - ("Generate JWT authentication token", "auth/token.py"), - ("Create user account REST API endpoint", "api/handlers.py"), - ("Validate email address format", "utils/checker.py"), - ("PostgreSQL database connection pool", "db/pool.py"), -] - - -def create_test_database(db_path: Path) -> None: - """Create and populate test database.""" - store = DirIndexStore(db_path) - store.initialize() - - with store._get_connection() as conn: - for path, content in MISLEADING_DATASET.items(): - name = path.split('/')[-1] - conn.execute( - """INSERT INTO files (name, full_path, content, language, mtime) - VALUES (?, ?, ?, ?, ?)""", - (name, path, content, "python", 0.0) - ) - conn.commit() - - store.close() - - -def test_pure_vector_search(db_path: Path) -> Dict: - """Test pure vector search (relies on code + misleading comments).""" - print("\n" + "="*70) - print("PURE VECTOR SEARCH (Code + Misleading Comments -> fastembed)") - print("="*70) - - start_time = time.time() - - # Generate pure vector embeddings - embedder = Embedder(profile="code") - vector_store = VectorStore(db_path) - chunker = Chunker(config=ChunkConfig(max_chunk_size=2000)) - - with sqlite3.connect(db_path) as conn: - conn.row_factory = sqlite3.Row - rows = conn.execute("SELECT full_path, content FROM files").fetchall() - - chunk_count = 0 - for row in rows: - chunks = chunker.chunk_sliding_window( - row["content"], - file_path=row["full_path"], - language="python" - ) - for chunk in chunks: - chunk.embedding = embedder.embed_single(chunk.content) - chunk.metadata["strategy"] = "pure_vector" - if 
chunks: - vector_store.add_chunks(chunks, row["full_path"]) - chunk_count += len(chunks) - - setup_time = time.time() - start_time - print(f"Setup: {len(rows)} files, {chunk_count} chunks in {setup_time:.1f}s") - print("Note: Embeddings include misleading comments") - - # Test queries - engine = HybridSearchEngine() - results = {} - - print(f"\n{'Query':<45} {'Top Result':<30} {'Score':<8}") - print("-" * 70) - - for query, expected_file in TEST_QUERIES: - search_results = engine.search( - db_path, - query, - limit=3, - enable_vector=True, - pure_vector=True, - ) - - top_file = search_results[0].path if search_results else "No results" - top_score = search_results[0].score if search_results else 0.0 - found = expected_file in [r.path for r in search_results] - rank = None - if found: - for i, r in enumerate(search_results): - if r.path == expected_file: - rank = i + 1 - break - - status = "[OK]" if found and rank == 1 else ("[~]" if found else "[X]") - display_query = query[:42] + "..." if len(query) > 45 else query - display_file = top_file.split('/')[-1] if '/' in top_file else top_file - - print(f"{status} {display_query:<43} {display_file:<30} {top_score:.3f}") - - results[query] = { - "found": found, - "rank": rank, - "top_file": top_file, - "score": top_score, - } - - return results - - -def test_llm_enhanced_search(db_path: Path, llm_tool: str = "gemini") -> Dict: - """Test LLM-enhanced search (LLM reads code and generates accurate summary).""" - print("\n" + "="*70) - print(f"LLM-ENHANCED SEARCH (Code -> {llm_tool.upper()} Analysis -> fastembed)") - print("="*70) - - # Check CCW availability - llm_config = LLMConfig(enabled=True, tool=llm_tool, batch_size=2) - enhancer = LLMEnhancer(llm_config) - - if not enhancer.check_available(): - print("[X] CCW CLI not available - skipping LLM-enhanced test") - print(" Install CCW: npm install -g ccw") - return {} - - start_time = time.time() - - # Generate LLM-enhanced embeddings - embedder = Embedder(profile="code") 
- vector_store = VectorStore(db_path) - indexer = EnhancedSemanticIndexer(enhancer, embedder, vector_store) - - # Prepare file data - file_data_list = [ - FileData(path=path, content=content, language="python") - for path, content in MISLEADING_DATASET.items() - ] - - # Index with LLM enhancement - print(f"LLM analyzing code (ignoring misleading comments)...") - indexed = indexer.index_files(file_data_list) - setup_time = time.time() - start_time - - print(f"Setup: {indexed}/{len(file_data_list)} files indexed in {setup_time:.1f}s") - print("Note: LLM generates summaries based on actual code logic") - - # Test queries - engine = HybridSearchEngine() - results = {} - - print(f"\n{'Query':<45} {'Top Result':<30} {'Score':<8}") - print("-" * 70) - - for query, expected_file in TEST_QUERIES: - search_results = engine.search( - db_path, - query, - limit=3, - enable_vector=True, - pure_vector=True, - ) - - top_file = search_results[0].path if search_results else "No results" - top_score = search_results[0].score if search_results else 0.0 - found = expected_file in [r.path for r in search_results] - rank = None - if found: - for i, r in enumerate(search_results): - if r.path == expected_file: - rank = i + 1 - break - - status = "[OK]" if found and rank == 1 else ("[~]" if found else "[X]") - display_query = query[:42] + "..." 
if len(query) > 45 else query - display_file = top_file.split('/')[-1] if '/' in top_file else top_file - - print(f"{status} {display_query:<43} {display_file:<30} {top_score:.3f}") - - results[query] = { - "found": found, - "rank": rank, - "top_file": top_file, - "score": top_score, - } - - return results - - -def compare_results(pure_results: Dict, llm_results: Dict) -> None: - """Compare and analyze results from both approaches.""" - print("\n" + "="*70) - print("COMPARISON SUMMARY - MISLEADING COMMENTS TEST") - print("="*70) - - if not llm_results: - print("Cannot compare - LLM-enhanced test was skipped") - return - - pure_score = 0 - llm_score = 0 - - print(f"\n{'Query':<45} {'Pure':<10} {'LLM':<10}") - print("-" * 70) - - for query, expected_file in TEST_QUERIES: - pure_res = pure_results.get(query, {}) - llm_res = llm_results.get(query, {}) - - pure_status = f"[OK] Rank {pure_res.get('rank', '?')}" if pure_res.get('found') else "[X] Miss" - llm_status = f"[OK] Rank {llm_res.get('rank', '?')}" if llm_res.get('found') else "[X] Miss" - - # Scoring: Rank 1 = 3 points, Rank 2 = 2 points, Rank 3 = 1 point - if pure_res.get('found') and pure_res.get('rank'): - pure_score += max(0, 4 - pure_res['rank']) - if llm_res.get('found') and llm_res.get('rank'): - llm_score += max(0, 4 - llm_res['rank']) - - display_query = query[:42] + "..." 
if len(query) > 45 else query - print(f"{display_query:<45} {pure_status:<10} {llm_status:<10}") - - print("-" * 70) - print(f"{'TOTAL SCORE':<45} {pure_score:<10} {llm_score:<10}") - print("="*70) - - # Analysis - print("\nANALYSIS:") - if llm_score > pure_score: - improvement = ((llm_score - pure_score) / max(pure_score, 1)) * 100 - print(f"[OK] LLM enhancement improves results by {improvement:.1f}%") - print(" LLM understands actual code logic despite misleading comments") - print(" Pure vector search misled by incorrect documentation") - elif pure_score > llm_score: - degradation = ((pure_score - llm_score) / max(pure_score, 1)) * 100 - print(f"[X] Pure vector performed {degradation:.1f}% better") - print(" Unexpected: Pure vector wasn't affected by misleading comments") - else: - print("= Both approaches performed equally") - print(" Test dataset may still be too simple to show differences") - - print("\nKEY INSIGHTS:") - print("- Pure Vector: Embeds code + comments together, can be misled") - print("- LLM Enhanced: Analyzes actual code behavior, ignores bad comments") - print("- Best Use: LLM enhancement crucial for poorly documented codebases") - - print("\nMISLEADING COMMENTS IN TEST:") - print("1. 'hasher.py' claims 'string utilities' but does bcrypt hashing") - print("2. 'token.py' has no docstrings, unclear function names") - print("3. 'handlers.py' says 'database utilities' but is REST API") - print("4. 'handlers.py' docstrings opposite (create says delete, etc)") - print("5. 'checker.py' claims 'math functions' but validates emails") - print("6. 
'pool.py' claims 'email sender' but is database pool") - - -def main(): - parser = argparse.ArgumentParser( - description="Test pure vector vs LLM-enhanced with misleading comments" - ) - parser.add_argument( - "--tool", - choices=["gemini", "qwen"], - default="gemini", - help="LLM tool to use (default: gemini)" - ) - parser.add_argument( - "--skip-llm", - action="store_true", - help="Skip LLM-enhanced test" - ) - parser.add_argument( - "--keep-db", - type=str, - help="Save database to specified path for inspection (e.g., ./test_results.db)" - ) - args = parser.parse_args() - - print("\n" + "="*70) - print("MISLEADING COMMENTS TEST") - print("Pure Vector vs LLM-Enhanced with Incorrect Documentation") - print("="*70) - - # Create test database - with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: - db_path = Path(f.name) - - try: - print(f"\nTest dataset: {len(MISLEADING_DATASET)} Python files") - print(f"Test queries: {len(TEST_QUERIES)} natural language questions") - print("\nChallenges:") - print("- Misleading module docstrings") - print("- Incorrect function docstrings") - print("- Missing documentation") - print("- Unclear function names") - - create_test_database(db_path) - - # Test pure vector search - pure_results = test_pure_vector_search(db_path) - - # Test LLM-enhanced search - if not args.skip_llm: - # Clear semantic_chunks table for LLM test - with sqlite3.connect(db_path) as conn: - conn.execute("DELETE FROM semantic_chunks") - conn.commit() - - llm_results = test_llm_enhanced_search(db_path, args.tool) - else: - llm_results = {} - print("\n[X] LLM-enhanced test skipped (--skip-llm flag)") - - # Compare results - compare_results(pure_results, llm_results) - - finally: - # Save or cleanup database - if args.keep_db: - import shutil - save_path = Path(args.keep_db) - try: - import gc - gc.collect() - time.sleep(0.2) - shutil.copy2(db_path, save_path) - print(f"\n[OK] Database saved to: {save_path}") - print(f"Inspect with: python 
scripts/inspect_llm_summaries.py {save_path}") - except Exception as e: - print(f"\n[X] Failed to save database: {e}") - finally: - try: - if db_path.exists(): - db_path.unlink() - except: - pass - else: - # Cleanup - try: - import gc - gc.collect() - time.sleep(0.1) - if db_path.exists(): - db_path.unlink() - except PermissionError: - print(f"\nWarning: Could not delete temporary database: {db_path}") - - print("\n" + "="*70) - print("Test completed!") - print("="*70) - - -if __name__ == "__main__": - main() diff --git a/codex-lens/src/codexlens/cli/commands.py b/codex-lens/src/codexlens/cli/commands.py index ab7948c9..c4bd341f 100644 --- a/codex-lens/src/codexlens/cli/commands.py +++ b/codex-lens/src/codexlens/cli/commands.py @@ -1047,184 +1047,6 @@ def migrate( registry.close() -@app.command() -def enhance( - path: Path = typer.Argument(Path("."), exists=True, file_okay=False, dir_okay=True, help="Project root to enhance."), - tool: str = typer.Option("gemini", "--tool", "-t", help="LLM tool to use (gemini or qwen)."), - batch_size: int = typer.Option(5, "--batch-size", "-b", min=1, max=20, help="Number of files to process per batch."), - force: bool = typer.Option(False, "--force", "-f", help="Regenerate metadata for all files, even if already exists."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."), -) -> None: - """Generate LLM-enhanced semantic metadata for indexed files. - - Uses CCW CLI to generate summaries, keywords, and purpose descriptions. - Requires ccw to be installed and accessible in PATH. 
- """ - _configure_logging(verbose) - base_path = path.expanduser().resolve() - - registry: RegistryStore | None = None - try: - # Check if ccw is available - import subprocess - import shutil - import sys - try: - ccw_cmd = shutil.which("ccw") - if not ccw_cmd: - raise FileNotFoundError("ccw not in PATH") - # On Windows, .cmd files need shell=True - if sys.platform == "win32": - subprocess.run("ccw --version", shell=True, capture_output=True, check=True) - else: - subprocess.run(["ccw", "--version"], capture_output=True, check=True) - except (subprocess.CalledProcessError, FileNotFoundError): - raise CodexLensError("ccw CLI not found. Please install ccw first.") - - # Validate tool - if tool not in ("gemini", "qwen"): - raise CodexLensError(f"Invalid tool: {tool}. Must be 'gemini' or 'qwen'.") - - registry = RegistryStore() - registry.initialize() - mapper = PathMapper() - - # Find project - project_info = registry.get_project(base_path) - if not project_info: - raise CodexLensError(f"No index found for: {base_path}. 
Run 'codex-lens init' first.") - - # Import LLM enhancer - try: - from codexlens.semantic.llm_enhancer import LLMEnhancer, LLMConfig - except ImportError as e: - raise CodexLensError(f"Semantic enhancement requires additional dependencies: {e}") - - # Create enhancer with config - config = LLMConfig(tool=tool, batch_size=batch_size) - enhancer = LLMEnhancer(config=config) - - # Get index directory - index_dir = mapper.source_to_index_dir(base_path) - if not index_dir.exists(): - raise CodexLensError(f"Index directory not found: {index_dir}") - - # Process all index databases recursively - from codexlens.storage.dir_index import DirIndexStore - from pathlib import Path - - total_processed = 0 - total_errors = 0 - - with Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - BarColumn(), - TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), - TimeElapsedColumn(), - console=console, - ) as progress: - # Find all _index.db files - index_files = list(index_dir.rglob("_index.db")) - task = progress.add_task(f"Enhancing {len(index_files)} directories...", total=len(index_files)) - - for db_path in index_files: - try: - store = DirIndexStore(db_path) - store.initialize() - - # Get files to process - if force: - files_to_process = store.list_files() - else: - files_to_process = store.get_files_without_semantic() - - if not files_to_process: - progress.update(task, advance=1) - continue - - # Process files - for file_entry in files_to_process: - try: - # Read file content - with open(file_entry.full_path, "r", encoding="utf-8", errors="ignore") as f: - content = f.read() - - # Generate metadata - metadata = enhancer.enhance_file( - path=str(file_entry.full_path), - content=content, - language=file_entry.language or "unknown" - ) - - # Store metadata - store.add_semantic_metadata( - file_id=file_entry.id, - summary=metadata.summary, - keywords=metadata.keywords, - purpose=metadata.purpose, - llm_tool=tool - ) - - total_processed += 1 
- - except Exception as e: - total_errors += 1 - if verbose: - console.print(f"[yellow]Error processing {file_entry.full_path}: {e}[/yellow]") - - store.close() - - except Exception as e: - total_errors += 1 - if verbose: - console.print(f"[yellow]Error processing {db_path}: {e}[/yellow]") - - progress.update(task, advance=1) - - result = { - "path": str(base_path), - "tool": tool, - "files_processed": total_processed, - "errors": total_errors, - } - - if json_mode: - print_json(success=True, result=result) - else: - console.print(f"[green]Enhanced {total_processed} files using {tool}[/green]") - if total_errors > 0: - console.print(f" [yellow]Errors: {total_errors}[/yellow]") - - except StorageError as exc: - if json_mode: - print_json(success=False, error=f"Storage error: {exc}") - else: - console.print(f"[red]Enhancement failed (storage):[/red] {exc}") - raise typer.Exit(code=1) - except PermissionError as exc: - if json_mode: - print_json(success=False, error=f"Permission denied: {exc}") - else: - console.print(f"[red]Enhancement failed (permission denied):[/red] {exc}") - raise typer.Exit(code=1) - except CodexLensError as exc: - if json_mode: - print_json(success=False, error=str(exc)) - else: - console.print(f"[red]Enhancement failed:[/red] {exc}") - raise typer.Exit(code=1) - except Exception as exc: - if json_mode: - print_json(success=False, error=f"Unexpected error: {exc}") - else: - console.print(f"[red]Enhancement failed (unexpected):[/red] {exc}") - raise typer.Exit(code=1) - finally: - if registry is not None: - registry.close() - @app.command() def clean( path: Optional[Path] = typer.Argument(None, help="Project path to clean (removes project index)."), diff --git a/codex-lens/src/codexlens/semantic/__init__.py b/codex-lens/src/codexlens/semantic/__init__.py index 7b6c5ac9..57c0fcda 100644 --- a/codex-lens/src/codexlens/semantic/__init__.py +++ b/codex-lens/src/codexlens/semantic/__init__.py @@ -32,38 +32,8 @@ def check_semantic_available() -> 
tuple[bool, str | None]: """Check if semantic search dependencies are available.""" return SEMANTIC_AVAILABLE, _import_error -# Export LLM enhancement classes -try: - from .llm_enhancer import ( - LLMEnhancer, - LLMConfig, - SemanticMetadata, - FileData, - EnhancedSemanticIndexer, - create_enhancer, - create_enhanced_indexer, - ) - LLM_AVAILABLE = True -except ImportError: - LLM_AVAILABLE = False - LLMEnhancer = None # type: ignore - LLMConfig = None # type: ignore - SemanticMetadata = None # type: ignore - FileData = None # type: ignore - EnhancedSemanticIndexer = None # type: ignore - create_enhancer = None # type: ignore - create_enhanced_indexer = None # type: ignore - __all__ = [ "SEMANTIC_AVAILABLE", "SEMANTIC_BACKEND", "check_semantic_available", - "LLM_AVAILABLE", - "LLMEnhancer", - "LLMConfig", - "SemanticMetadata", - "FileData", - "EnhancedSemanticIndexer", - "create_enhancer", - "create_enhanced_indexer", ] diff --git a/codex-lens/src/codexlens/semantic/llm_enhancer.py b/codex-lens/src/codexlens/semantic/llm_enhancer.py deleted file mode 100644 index fe964a89..00000000 --- a/codex-lens/src/codexlens/semantic/llm_enhancer.py +++ /dev/null @@ -1,899 +0,0 @@ -"""LLM-based semantic enhancement using CCW CLI. - -This module provides LLM-generated descriptions that are then embedded -by fastembed for improved semantic search. The flow is: - - Code → LLM Summary → fastembed embedding → VectorStore → semantic search - -LLM-generated summaries match natural language queries better than raw code. 
-""" - -from __future__ import annotations - -import json -import logging -import os -import subprocess -import shutil -from dataclasses import dataclass, field -from pathlib import Path -from typing import Any, Dict, List, Optional, TYPE_CHECKING - -from codexlens.entities import SemanticChunk, Symbol - -if TYPE_CHECKING: - from .embedder import Embedder - from .vector_store import VectorStore - - -logger = logging.getLogger(__name__) - - -@dataclass -class SemanticMetadata: - """LLM-generated semantic metadata for a file or symbol.""" - - summary: str - keywords: List[str] - purpose: str - file_path: Optional[str] = None - symbol_name: Optional[str] = None - llm_tool: Optional[str] = None - - -@dataclass -class FileData: - """File data for LLM processing.""" - - path: str - content: str - language: str - symbols: List[Symbol] = field(default_factory=list) - - -@dataclass -class LLMConfig: - """Configuration for LLM enhancement. - - Tool selection can be overridden via environment variables: - - CCW_CLI_SECONDARY_TOOL: Primary tool for LLM calls (default: gemini) - - CCW_CLI_FALLBACK_TOOL: Fallback tool if primary fails (default: qwen) - """ - - tool: str = field(default_factory=lambda: os.environ.get("CCW_CLI_SECONDARY_TOOL", "gemini")) - fallback_tool: str = field(default_factory=lambda: os.environ.get("CCW_CLI_FALLBACK_TOOL", "qwen")) - timeout_ms: int = 300000 - batch_size: int = 5 - max_content_chars: int = 8000 # Max chars per file in batch prompt - enabled: bool = True - - -class LLMEnhancer: - """LLM-based semantic enhancement using CCW CLI. - - Generates code summaries and search keywords by calling - external LLM tools (gemini, qwen) via CCW CLI subprocess. 
- """ - - CHUNK_REFINEMENT_PROMPT = '''PURPOSE: Identify optimal semantic split points in code chunk -TASK: -- Analyze the code structure to find natural semantic boundaries -- Identify logical groupings (functions, classes, related statements) -- Suggest split points that maintain semantic cohesion -MODE: analysis -EXPECTED: JSON format with split positions - -=== CODE CHUNK === -{code_chunk} - -=== OUTPUT FORMAT === -Return ONLY valid JSON (no markdown, no explanation): -{{ - "split_points": [ - {{ - "line": , - "reason": "brief reason for split (e.g., 'start of new function', 'end of class definition')" - }} - ] -}} - -Rules: -- Split at function/class/method boundaries -- Keep related code together (don't split mid-function) -- Aim for chunks between 500-2000 characters -- Return empty split_points if no good splits found''' - - PROMPT_TEMPLATE = '''PURPOSE: Generate semantic summaries and search keywords for code files -TASK: -- For each code block, generate a concise summary (1-2 sentences) -- Extract 5-10 relevant search keywords -- Identify the functional purpose/category -MODE: analysis -EXPECTED: JSON format output - -=== CODE BLOCKS === -{code_blocks} - -=== OUTPUT FORMAT === -Return ONLY valid JSON (no markdown, no explanation): -{{ - "files": {{ - "": {{ - "summary": "Brief description of what this code does", - "keywords": ["keyword1", "keyword2", ...], - "purpose": "category like: auth, api, util, ui, data, config, test" - }} - }} -}}''' - - def __init__(self, config: LLMConfig | None = None) -> None: - """Initialize LLM enhancer. 
- - Args: - config: LLM configuration, uses defaults if None - """ - self.config = config or LLMConfig() - self._ccw_available: Optional[bool] = None - - def check_available(self) -> bool: - """Check if CCW CLI tool is available.""" - if self._ccw_available is not None: - return self._ccw_available - - self._ccw_available = shutil.which("ccw") is not None - if not self._ccw_available: - logger.warning("CCW CLI not found in PATH, LLM enhancement disabled") - return self._ccw_available - - def enhance_files( - self, - files: List[FileData], - working_dir: Optional[Path] = None, - ) -> Dict[str, SemanticMetadata]: - """Enhance multiple files with LLM-generated semantic metadata. - - Processes files in batches to manage token limits and API costs. - - Args: - files: List of file data to process - working_dir: Optional working directory for CCW CLI - - Returns: - Dict mapping file paths to SemanticMetadata - """ - if not self.config.enabled: - logger.debug("LLM enhancement disabled by config") - return {} - - if not self.check_available(): - return {} - - if not files: - return {} - - results: Dict[str, SemanticMetadata] = {} - batch_size = self.config.batch_size - - for i in range(0, len(files), batch_size): - batch = files[i:i + batch_size] - try: - batch_results = self._process_batch(batch, working_dir) - results.update(batch_results) - logger.debug( - "Processed batch %d/%d: %d files enhanced", - i // batch_size + 1, - (len(files) + batch_size - 1) // batch_size, - len(batch_results), - ) - except Exception as e: - logger.warning( - "Batch %d failed, continuing: %s", - i // batch_size + 1, - e, - ) - continue - - return results - - def enhance_file( - - self, - - path: str, - - content: str, - - language: str, - - working_dir: Optional[Path] = None, - - ) -> SemanticMetadata: - - """Enhance a single file with LLM-generated semantic metadata. - - - - Convenience method that wraps enhance_files for single file processing. 
- - - - Args: - - path: File path - - content: File content - - language: Programming language - - working_dir: Optional working directory for CCW CLI - - - - Returns: - - SemanticMetadata for the file - - - - Raises: - - ValueError: If enhancement fails - - """ - - file_data = FileData(path=path, content=content, language=language) - - results = self.enhance_files([file_data], working_dir) - - - - if path not in results: - - # Return default metadata if enhancement failed - - return SemanticMetadata( - - summary=f"Code file written in {language}", - - keywords=[language, "code"], - - purpose="unknown", - - file_path=path, - - llm_tool=self.config.tool, - - ) - - - - return results[path] - - def refine_chunk_boundaries( - self, - chunk: SemanticChunk, - max_chunk_size: int = 2000, - working_dir: Optional[Path] = None, - ) -> List[SemanticChunk]: - """Refine chunk boundaries using LLM for large code chunks. - - Uses LLM to identify semantic split points in large chunks, - breaking them into smaller, more cohesive pieces. 
- - Args: - chunk: Original chunk to refine - max_chunk_size: Maximum characters before triggering refinement - working_dir: Optional working directory for CCW CLI - - Returns: - List of refined chunks (original chunk if no splits or refinement fails) - """ - # Skip if chunk is small enough - if len(chunk.content) <= max_chunk_size: - return [chunk] - - # Skip if LLM enhancement disabled or unavailable - if not self.config.enabled or not self.check_available(): - return [chunk] - - # Skip docstring chunks - only refine code chunks - if chunk.metadata.get("chunk_type") == "docstring": - return [chunk] - - try: - # Build refinement prompt - prompt = self.CHUNK_REFINEMENT_PROMPT.format(code_chunk=chunk.content) - - # Invoke LLM - result = self._invoke_ccw_cli( - prompt, - tool=self.config.tool, - working_dir=working_dir, - ) - - # Fallback if primary tool fails - if not result["success"] and self.config.fallback_tool: - result = self._invoke_ccw_cli( - prompt, - tool=self.config.fallback_tool, - working_dir=working_dir, - ) - - if not result["success"]: - logger.debug("LLM refinement failed, returning original chunk") - return [chunk] - - # Parse split points - split_points = self._parse_split_points(result["stdout"]) - if not split_points: - logger.debug("No split points identified, returning original chunk") - return [chunk] - - # Split chunk at identified boundaries - refined_chunks = self._split_chunk_at_points(chunk, split_points) - logger.debug( - "Refined chunk into %d smaller chunks (was %d chars)", - len(refined_chunks), - len(chunk.content), - ) - return refined_chunks - - except Exception as e: - logger.warning("Chunk refinement error: %s, returning original chunk", e) - return [chunk] - - def _parse_split_points(self, stdout: str) -> List[int]: - """Parse split points from LLM response. 
- - Args: - stdout: Raw stdout from CCW CLI - - Returns: - List of line numbers where splits should occur (sorted) - """ - # Extract JSON from response - json_str = self._extract_json(stdout) - if not json_str: - return [] - - try: - data = json.loads(json_str) - split_points_data = data.get("split_points", []) - - # Extract line numbers - lines = [] - for point in split_points_data: - if isinstance(point, dict) and "line" in point: - line_num = point["line"] - if isinstance(line_num, int) and line_num > 0: - lines.append(line_num) - - return sorted(set(lines)) - - except (json.JSONDecodeError, ValueError, TypeError) as e: - logger.debug("Failed to parse split points: %s", e) - return [] - - def _split_chunk_at_points( - self, - chunk: SemanticChunk, - split_points: List[int], - ) -> List[SemanticChunk]: - """Split chunk at specified line numbers. - - Args: - chunk: Original chunk to split - split_points: Sorted list of line numbers to split at - - Returns: - List of smaller chunks - """ - lines = chunk.content.splitlines(keepends=True) - chunks: List[SemanticChunk] = [] - - # Get original metadata - base_metadata = dict(chunk.metadata) - original_start = base_metadata.get("start_line", 1) - - # Add start and end boundaries - boundaries = [0] + split_points + [len(lines)] - - for i in range(len(boundaries) - 1): - start_idx = boundaries[i] - end_idx = boundaries[i + 1] - - # Skip empty sections - if start_idx >= end_idx: - continue - - # Extract content - section_lines = lines[start_idx:end_idx] - section_content = "".join(section_lines) - - # Skip if too small - if len(section_content.strip()) < 50: - continue - - # Create new chunk with updated metadata - new_metadata = base_metadata.copy() - new_metadata["start_line"] = original_start + start_idx - new_metadata["end_line"] = original_start + end_idx - 1 - new_metadata["refined_by_llm"] = True - new_metadata["original_chunk_size"] = len(chunk.content) - - chunks.append( - SemanticChunk( - content=section_content, 
- embedding=None, # Embeddings will be regenerated - metadata=new_metadata, - ) - ) - - # If no valid chunks created, return original - if not chunks: - return [chunk] - - return chunks - - - - - def _process_batch( - self, - files: List[FileData], - working_dir: Optional[Path] = None, - ) -> Dict[str, SemanticMetadata]: - """Process a single batch of files.""" - prompt = self._build_batch_prompt(files) - - # Try primary tool first - result = self._invoke_ccw_cli( - prompt, - tool=self.config.tool, - working_dir=working_dir, - ) - - # Fallback to secondary tool if primary fails - if not result["success"] and self.config.fallback_tool: - logger.debug( - "Primary tool %s failed, trying fallback %s", - self.config.tool, - self.config.fallback_tool, - ) - result = self._invoke_ccw_cli( - prompt, - tool=self.config.fallback_tool, - working_dir=working_dir, - ) - - if not result["success"]: - logger.warning("LLM call failed: %s", result.get("stderr", "unknown error")) - return {} - - return self._parse_response(result["stdout"], self.config.tool) - - def _build_batch_prompt(self, files: List[FileData]) -> str: - """Build prompt for batch processing.""" - code_blocks_parts: List[str] = [] - - for file_data in files: - # Truncate content if too long - content = file_data.content - if len(content) > self.config.max_content_chars: - content = content[:self.config.max_content_chars] + "\n... [truncated]" - - # Format code block - lang_hint = file_data.language or "text" - code_block = f'''[FILE: {file_data.path}] -```{lang_hint} -{content} -```''' - code_blocks_parts.append(code_block) - - code_blocks = "\n\n".join(code_blocks_parts) - return self.PROMPT_TEMPLATE.format(code_blocks=code_blocks) - - def _invoke_ccw_cli( - self, - prompt: str, - tool: str = "gemini", - working_dir: Optional[Path] = None, - ) -> Dict[str, Any]: - """Invoke CCW CLI tool via subprocess. 
- - Args: - prompt: The prompt to send to LLM - tool: Tool name (gemini, qwen, codex) - working_dir: Optional working directory - - Returns: - Dict with success, stdout, stderr, exit_code - """ - import sys - import os - - timeout_seconds = (self.config.timeout_ms / 1000) + 30 - - # Build base arguments - base_args = [ - "cli", "exec", - prompt, # Direct string argument - "--tool", tool, - "--mode", "analysis", - "--timeout", str(self.config.timeout_ms), - ] - if working_dir: - base_args.extend(["--cd", str(working_dir)]) - - try: - if sys.platform == "win32": - # On Windows, ccw is a .CMD wrapper that requires shell - # Instead, directly invoke node with the ccw.js script - ccw_path = shutil.which("ccw") - if ccw_path and ccw_path.lower().endswith(".cmd"): - # Find the ccw.js script location - npm_dir = Path(ccw_path).parent - ccw_js = npm_dir / "node_modules" / "ccw" / "bin" / "ccw.js" - if ccw_js.exists(): - cmd = ["node", str(ccw_js)] + base_args - else: - # Fallback to shell execution - cmd_str = "ccw " + " ".join(f'"{a}"' if " " in a else a for a in base_args) - result = subprocess.run( - cmd_str, shell=True, capture_output=True, text=True, - timeout=timeout_seconds, cwd=working_dir, - encoding="utf-8", errors="replace", - ) - return { - "success": result.returncode == 0, - "stdout": result.stdout, - "stderr": result.stderr, - "exit_code": result.returncode, - } - else: - cmd = ["ccw"] + base_args - else: - cmd = ["ccw"] + base_args - - result = subprocess.run( - cmd, - capture_output=True, - text=True, - timeout=timeout_seconds, - cwd=working_dir, - encoding="utf-8", - errors="replace", - ) - - return { - "success": result.returncode == 0, - "stdout": result.stdout, - "stderr": result.stderr, - "exit_code": result.returncode, - } - - except subprocess.TimeoutExpired: - logger.warning("CCW CLI timeout after %ds", self.config.timeout_ms / 1000) - return { - "success": False, - "stdout": "", - "stderr": "timeout", - "exit_code": -1, - } - except 
FileNotFoundError: - logger.warning("CCW CLI not found - ensure 'ccw' is in PATH") - return { - "success": False, - "stdout": "", - "stderr": "ccw command not found", - "exit_code": -1, - } - except Exception as e: - logger.warning("CCW CLI invocation failed: %s", e) - return { - "success": False, - "stdout": "", - "stderr": str(e), - "exit_code": -1, - } - - def _parse_response( - self, - stdout: str, - tool: str, - ) -> Dict[str, SemanticMetadata]: - """Parse LLM response into SemanticMetadata objects. - - Args: - stdout: Raw stdout from CCW CLI - tool: Tool name used for generation - - Returns: - Dict mapping file paths to SemanticMetadata - """ - results: Dict[str, SemanticMetadata] = {} - - # Extract JSON from response (may be wrapped in markdown or other text) - json_str = self._extract_json(stdout) - if not json_str: - logger.warning("No JSON found in LLM response") - return results - - try: - data = json.loads(json_str) - except json.JSONDecodeError as e: - logger.warning("Failed to parse LLM response JSON: %s", e) - return results - - # Handle expected format: {"files": {"path": {...}}} - files_data = data.get("files", data) - if not isinstance(files_data, dict): - logger.warning("Unexpected response format: expected dict") - return results - - for file_path, metadata in files_data.items(): - if not isinstance(metadata, dict): - continue - - try: - results[file_path] = SemanticMetadata( - summary=metadata.get("summary", ""), - keywords=metadata.get("keywords", []), - purpose=metadata.get("purpose", ""), - file_path=file_path, - llm_tool=tool, - ) - except Exception as e: - logger.debug("Failed to parse metadata for %s: %s", file_path, e) - continue - - return results - - def _extract_json(self, text: str) -> Optional[str]: - """Extract JSON object from text that may contain markdown or other content.""" - # Try to find JSON object boundaries - text = text.strip() - - # Remove markdown code blocks if present - if text.startswith("```"): - lines = 
text.split("\n") - # Remove first line (```json or ```) - lines = lines[1:] - # Find closing ``` - for i, line in enumerate(lines): - if line.strip() == "```": - lines = lines[:i] - break - text = "\n".join(lines) - - # Find JSON object - start = text.find("{") - if start == -1: - return None - - # Find matching closing brace - depth = 0 - end = start - for i, char in enumerate(text[start:], start): - if char == "{": - depth += 1 - elif char == "}": - depth -= 1 - if depth == 0: - end = i + 1 - break - - if depth != 0: - return None - - return text[start:end] - - -def create_enhancer( - tool: str = "gemini", - timeout_ms: int = 300000, - batch_size: int = 5, - enabled: bool = True, -) -> LLMEnhancer: - """Factory function to create LLM enhancer with custom config.""" - config = LLMConfig( - tool=tool, - timeout_ms=timeout_ms, - batch_size=batch_size, - enabled=enabled, - ) - return LLMEnhancer(config) - - -class EnhancedSemanticIndexer: - """Integrates LLM enhancement with fastembed vector search. - - Flow: - 1. Code files → LLM generates summaries/keywords - 2. Summaries → fastembed generates embeddings - 3. Embeddings → VectorStore for similarity search - - This produces better semantic search because: - - LLM summaries are natural language descriptions - - Natural language queries match summaries better than raw code - - Keywords expand search coverage - """ - - def __init__( - self, - enhancer: LLMEnhancer, - embedder: "Embedder", - vector_store: "VectorStore", - ) -> None: - """Initialize enhanced semantic indexer. - - Args: - enhancer: LLM enhancer for generating summaries - embedder: Fastembed embedder for vector generation - vector_store: Vector storage for similarity search - """ - self.enhancer = enhancer - self.embedder = embedder - self.vector_store = vector_store - - def index_files( - self, - files: List[FileData], - working_dir: Optional[Path] = None, - ) -> int: - """Index files with LLM-enhanced semantic search. 
- - Args: - files: List of file data to index - working_dir: Optional working directory for LLM calls - - Returns: - Number of files successfully indexed - """ - if not files: - return 0 - - # Step 1: Generate LLM summaries - logger.info("Generating LLM summaries for %d files...", len(files)) - metadata_map = self.enhancer.enhance_files(files, working_dir) - - if not metadata_map: - logger.warning("No LLM metadata generated, falling back to raw code") - return self._index_raw_code(files) - - # Step 2: Create semantic chunks from LLM summaries - chunks_to_embed: List[SemanticChunk] = [] - file_paths: List[str] = [] - - for file_data in files: - metadata = metadata_map.get(file_data.path) - if metadata: - # Use LLM-generated summary + keywords for embedding - embeddable_text = self._create_embeddable_text(metadata, file_data) - chunk = SemanticChunk( - content=embeddable_text, - embedding=None, - metadata={ - "file": file_data.path, - "language": file_data.language, - "summary": metadata.summary, - "keywords": metadata.keywords, - "purpose": metadata.purpose, - "llm_tool": metadata.llm_tool, - "strategy": "llm_enhanced", - }, - ) - else: - # Fallback: use truncated raw code - chunk = SemanticChunk( - content=file_data.content[:2000], - embedding=None, - metadata={ - "file": file_data.path, - "language": file_data.language, - "strategy": "raw_code", - }, - ) - - chunks_to_embed.append(chunk) - file_paths.append(file_data.path) - - # Step 3: Generate embeddings - logger.info("Generating embeddings for %d chunks...", len(chunks_to_embed)) - texts = [chunk.content for chunk in chunks_to_embed] - embeddings = self.embedder.embed(texts) - - # Step 4: Store in vector store - indexed_count = 0 - for chunk, embedding, file_path in zip(chunks_to_embed, embeddings, file_paths): - chunk.embedding = embedding - try: - self.vector_store.add_chunk(chunk, file_path) - indexed_count += 1 - except Exception as e: - logger.debug("Failed to store chunk for %s: %s", file_path, e) - - 
logger.info("Successfully indexed %d/%d files", indexed_count, len(files)) - return indexed_count - - def _create_embeddable_text( - self, - metadata: SemanticMetadata, - file_data: FileData, - ) -> str: - """Create text optimized for embedding from LLM metadata. - - Combines summary, keywords, and purpose into a single string - that will produce good semantic matches for natural language queries. - """ - parts = [] - - # Summary is the primary content - if metadata.summary: - parts.append(metadata.summary) - - # Purpose adds categorical context - if metadata.purpose: - parts.append(f"Category: {metadata.purpose}") - - # Keywords expand search coverage - if metadata.keywords: - parts.append(f"Keywords: {', '.join(metadata.keywords)}") - - # Add file name for context - parts.append(f"File: {Path(file_data.path).name}") - - return "\n".join(parts) - - def _index_raw_code(self, files: List[FileData]) -> int: - """Fallback: index raw code without LLM enhancement.""" - indexed_count = 0 - - for file_data in files: - # Truncate to reasonable size - content = file_data.content[:2000] - - chunk = SemanticChunk( - content=content, - embedding=None, - metadata={ - "file": file_data.path, - "language": file_data.language, - "strategy": "raw_code", - }, - ) - - try: - embedding = self.embedder.embed_single(content) - chunk.embedding = embedding - self.vector_store.add_chunk(chunk, file_data.path) - indexed_count += 1 - except Exception as e: - logger.debug("Failed to index %s: %s", file_data.path, e) - - return indexed_count - - -def create_enhanced_indexer( - vector_store_path: Path, - llm_tool: str = "gemini", - llm_enabled: bool = True, -) -> EnhancedSemanticIndexer: - """Factory function to create an enhanced semantic indexer. 
- - Args: - vector_store_path: Path for the vector store database - llm_tool: LLM tool to use (gemini, qwen) - llm_enabled: Whether to enable LLM enhancement - - Returns: - Configured EnhancedSemanticIndexer instance - """ - from .embedder import Embedder - from .vector_store import VectorStore - - enhancer = create_enhancer(tool=llm_tool, enabled=llm_enabled) - embedder = Embedder() - vector_store = VectorStore(vector_store_path) - - return EnhancedSemanticIndexer(enhancer, embedder, vector_store) diff --git a/codex-lens/tests/test_llm_enhanced_search.py b/codex-lens/tests/test_llm_enhanced_search.py deleted file mode 100644 index c5d3235c..00000000 --- a/codex-lens/tests/test_llm_enhanced_search.py +++ /dev/null @@ -1,545 +0,0 @@ -"""Test suite for comparing pure vector search vs LLM-enhanced vector search. - -This test demonstrates the difference between: -1. Pure vector search: Raw code → fastembed → vector search -2. LLM-enhanced search: Code → LLM summary → fastembed → vector search - -LLM-enhanced search should provide better semantic matches for natural language queries. 
-""" - -import pytest -import sqlite3 -import tempfile -from pathlib import Path -from typing import Dict, List - -from codexlens.search.hybrid_search import HybridSearchEngine -from codexlens.storage.dir_index import DirIndexStore - -# Check semantic dependencies -try: - from codexlens.semantic import SEMANTIC_AVAILABLE - from codexlens.semantic.embedder import Embedder - from codexlens.semantic.vector_store import VectorStore - from codexlens.semantic.chunker import Chunker, ChunkConfig - from codexlens.semantic.llm_enhancer import ( - LLMEnhancer, - LLMConfig, - FileData, - EnhancedSemanticIndexer, - SemanticChunk, - ) - from codexlens.entities import SearchResult -except ImportError: - SEMANTIC_AVAILABLE = False - - -# Test code samples representing different functionality -TEST_CODE_SAMPLES = { - "auth/password_hasher.py": '''"""Password hashing utilities using bcrypt.""" -import bcrypt - -def hash_password(password: str, salt_rounds: int = 12) -> str: - """Hash a password using bcrypt with specified salt rounds. - - Args: - password: Plain text password to hash - salt_rounds: Number of salt rounds (default 12) - - Returns: - Hashed password string - """ - salt = bcrypt.gensalt(rounds=salt_rounds) - hashed = bcrypt.hashpw(password.encode('utf-8'), salt) - return hashed.decode('utf-8') - -def verify_password(password: str, hashed: str) -> bool: - """Verify a password against its hash. - - Args: - password: Plain text password to verify - hashed: Previously hashed password - - Returns: - True if password matches hash - """ - return bcrypt.checkpw(password.encode('utf-8'), hashed.encode('utf-8')) -''', - - "auth/jwt_handler.py": '''"""JWT token generation and validation.""" -import jwt -from datetime import datetime, timedelta -from typing import Dict, Optional - -SECRET_KEY = "your-secret-key-here" - -def create_token(user_id: int, expires_in: int = 3600) -> str: - """Generate a JWT access token for user authentication. 
- - Args: - user_id: User ID to encode in token - expires_in: Token expiration in seconds (default 1 hour) - - Returns: - JWT token string - """ - payload = { - 'user_id': user_id, - 'exp': datetime.utcnow() + timedelta(seconds=expires_in), - 'iat': datetime.utcnow() - } - return jwt.encode(payload, SECRET_KEY, algorithm='HS256') - -def decode_token(token: str) -> Optional[Dict]: - """Validate and decode JWT token to extract user information. - - Args: - token: JWT token string to decode - - Returns: - Decoded payload dict or None if invalid - """ - try: - payload = jwt.decode(token, SECRET_KEY, algorithms=['HS256']) - return payload - except jwt.ExpiredSignatureError: - return None - except jwt.InvalidTokenError: - return None -''', - - "api/user_endpoints.py": '''"""REST API endpoints for user management.""" -from flask import Flask, request, jsonify -from typing import Dict - -app = Flask(__name__) - -@app.route('/api/users', methods=['POST']) -def create_user(): - """Create a new user account with email and password. - - Request JSON: - email: User email address - password: User password - name: User full name - - Returns: - JSON with user_id and success status - """ - data = request.get_json() - # Validate input - if not data.get('email') or not data.get('password'): - return jsonify({'error': 'Email and password required'}), 400 - - # Create user (simplified) - user_id = 12345 # Would normally insert into database - return jsonify({'user_id': user_id, 'success': True}), 201 - -@app.route('/api/users/', methods=['GET']) -def get_user(user_id: int): - """Retrieve user profile information by user ID. 
- - Args: - user_id: Unique user identifier - - Returns: - JSON with user profile data - """ - # Simplified user retrieval - user = { - 'id': user_id, - 'email': 'user@example.com', - 'name': 'John Doe', - 'created_at': '2024-01-01' - } - return jsonify(user), 200 -''', - - "utils/validation.py": '''"""Input validation and sanitization utilities.""" -import re -from typing import Optional - -def validate_email(email: str) -> bool: - """Check if email address format is valid using regex pattern. - - Args: - email: Email address string to validate - - Returns: - True if email format is valid - """ - pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$' - return bool(re.match(pattern, email)) - -def sanitize_input(text: str, max_length: int = 255) -> str: - """Clean user input by removing special characters and limiting length. - - Args: - text: Input text to sanitize - max_length: Maximum allowed length - - Returns: - Sanitized text string - """ - # Remove special characters - text = re.sub(r'[<>\"\'&]', '', text) - # Trim whitespace - text = text.strip() - # Limit length - return text[:max_length] - -def validate_password_strength(password: str) -> tuple[bool, Optional[str]]: - """Validate password meets security requirements. 
- - Requirements: - - At least 8 characters - - Contains uppercase and lowercase - - Contains numbers - - Contains special characters - - Args: - password: Password string to validate - - Returns: - Tuple of (is_valid, error_message) - """ - if len(password) < 8: - return False, "Password must be at least 8 characters" - if not re.search(r'[A-Z]', password): - return False, "Password must contain uppercase letter" - if not re.search(r'[a-z]', password): - return False, "Password must contain lowercase letter" - if not re.search(r'[0-9]', password): - return False, "Password must contain number" - if not re.search(r'[!@#$%^&*(),.?":{}|<>]', password): - return False, "Password must contain special character" - return True, None -''', - - "database/connection.py": '''"""Database connection pooling and management.""" -import psycopg2 -from psycopg2 import pool -from typing import Optional -from contextlib import contextmanager - -class DatabasePool: - """PostgreSQL connection pool manager for handling multiple concurrent connections.""" - - def __init__(self, min_conn: int = 1, max_conn: int = 10): - """Initialize database connection pool. - - Args: - min_conn: Minimum number of connections to maintain - max_conn: Maximum number of connections allowed - """ - self.pool = psycopg2.pool.SimpleConnectionPool( - min_conn, - max_conn, - user='dbuser', - password='dbpass', - host='localhost', - port='5432', - database='myapp' - ) - - @contextmanager - def get_connection(self): - """Get a connection from pool as context manager. 
- - Yields: - Database connection object - """ - conn = self.pool.getconn() - try: - yield conn - conn.commit() - except Exception: - conn.rollback() - raise - finally: - self.pool.putconn(conn) - - def close_all(self): - """Close all connections in pool.""" - self.pool.closeall() -''' -} - - -# Natural language queries to test semantic understanding -TEST_QUERIES = [ - { - "query": "How do I securely hash passwords?", - "expected_file": "auth/password_hasher.py", - "description": "Should find password hashing implementation", - }, - { - "query": "Generate JWT token for user authentication", - "expected_file": "auth/jwt_handler.py", - "description": "Should find JWT token creation logic", - }, - { - "query": "Create new user account via REST API", - "expected_file": "api/user_endpoints.py", - "description": "Should find user registration endpoint", - }, - { - "query": "Validate email address format", - "expected_file": "utils/validation.py", - "description": "Should find email validation function", - }, - { - "query": "Connect to PostgreSQL database", - "expected_file": "database/connection.py", - "description": "Should find database connection management", - }, - { - "query": "Check password complexity requirements", - "expected_file": "utils/validation.py", - "description": "Should find password strength validation", - }, -] - - -@pytest.mark.skipif(not SEMANTIC_AVAILABLE, reason="Semantic dependencies not available") -class TestPureVectorSearch: - """Test pure vector search (code → fastembed → search).""" - - @pytest.fixture - def pure_vector_db(self): - """Create database with pure vector embeddings (no LLM).""" - with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: - db_path = Path(f.name) - - # Initialize database - store = DirIndexStore(db_path) - store.initialize() - - # Add test files - with store._get_connection() as conn: - for path, content in TEST_CODE_SAMPLES.items(): - name = path.split('/')[-1] - conn.execute( - """INSERT INTO files 
(name, full_path, content, language, mtime) - VALUES (?, ?, ?, ?, ?)""", - (name, path, content, "python", 0.0) - ) - conn.commit() - - # Generate embeddings using pure vector approach (raw code) - embedder = Embedder(profile="code") - vector_store = VectorStore(db_path) - chunker = Chunker(config=ChunkConfig(max_chunk_size=2000)) - - with sqlite3.connect(db_path) as conn: - conn.row_factory = sqlite3.Row - rows = conn.execute("SELECT full_path, content FROM files").fetchall() - - for row in rows: - # Pure vector: directly chunk and embed raw code - chunks = chunker.chunk_sliding_window( - row["content"], - file_path=row["full_path"], - language="python" - ) - for chunk in chunks: - chunk.embedding = embedder.embed_single(chunk.content) - chunk.metadata["strategy"] = "pure_vector" - if chunks: - vector_store.add_chunks(chunks, row["full_path"]) - - yield db_path - store.close() - if db_path.exists(): - db_path.unlink() - - def test_pure_vector_queries(self, pure_vector_db): - """Test natural language queries with pure vector search.""" - engine = HybridSearchEngine() - results = {} - - for test_case in TEST_QUERIES: - query = test_case["query"] - expected_file = test_case["expected_file"] - - search_results = engine.search( - pure_vector_db, - query, - limit=5, - enable_vector=True, - pure_vector=True, - ) - - # Check if expected file is in top 3 results - top_files = [r.path for r in search_results[:3]] - found = expected_file in top_files - rank = top_files.index(expected_file) + 1 if found else None - - results[query] = { - "found": found, - "rank": rank, - "top_result": search_results[0].path if search_results else None, - "top_score": search_results[0].score if search_results else 0.0, - } - - return results - - -@pytest.mark.skipif(not SEMANTIC_AVAILABLE, reason="Semantic dependencies not available") -class TestLLMEnhancedSearch: - """Test LLM-enhanced vector search (code → LLM → fastembed → search).""" - - @pytest.fixture - def llm_enhanced_db(self): - 
"""Create database with LLM-enhanced embeddings.""" - # Skip if CCW not available - llm_config = LLMConfig(enabled=True, tool="gemini") - enhancer = LLMEnhancer(llm_config) - if not enhancer.check_available(): - pytest.skip("CCW CLI not available for LLM enhancement") - - with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: - db_path = Path(f.name) - - # Initialize database - store = DirIndexStore(db_path) - store.initialize() - - # Add test files - with store._get_connection() as conn: - for path, content in TEST_CODE_SAMPLES.items(): - name = path.split('/')[-1] - conn.execute( - """INSERT INTO files (name, full_path, content, language, mtime) - VALUES (?, ?, ?, ?, ?)""", - (name, path, content, "python", 0.0) - ) - conn.commit() - - # Generate embeddings using LLM-enhanced approach - embedder = Embedder(profile="code") - vector_store = VectorStore(db_path) - - # Create enhanced indexer - indexer = EnhancedSemanticIndexer(enhancer, embedder, vector_store) - - # Prepare file data - file_data_list = [ - FileData(path=path, content=content, language="python") - for path, content in TEST_CODE_SAMPLES.items() - ] - - # Index with LLM enhancement - indexed = indexer.index_files(file_data_list) - print(f"\nLLM-enhanced indexing: {indexed}/{len(file_data_list)} files") - - yield db_path - store.close() - if db_path.exists(): - db_path.unlink() - - def test_llm_enhanced_queries(self, llm_enhanced_db): - """Test natural language queries with LLM-enhanced search.""" - engine = HybridSearchEngine() - results = {} - - for test_case in TEST_QUERIES: - query = test_case["query"] - expected_file = test_case["expected_file"] - - search_results = engine.search( - llm_enhanced_db, - query, - limit=5, - enable_vector=True, - pure_vector=True, - ) - - # Check if expected file is in top 3 results - top_files = [r.path for r in search_results[:3]] - found = expected_file in top_files - rank = top_files.index(expected_file) + 1 if found else None - - results[query] = { - 
"found": found, - "rank": rank, - "top_result": search_results[0].path if search_results else None, - "top_score": search_results[0].score if search_results else 0.0, - } - - return results - - -@pytest.mark.skipif(not SEMANTIC_AVAILABLE, reason="Semantic dependencies not available") -class TestSearchComparison: - """Compare pure vector vs LLM-enhanced search side-by-side.""" - - def test_comparison(self): - """Run comprehensive comparison of both approaches.""" - # This test runs both approaches and compares results - print("\n" + "="*70) - print("SEMANTIC SEARCH COMPARISON TEST") - print("="*70) - - try: - # Test pure vector search - print("\n1. Testing Pure Vector Search (Code → fastembed)") - print("-" * 70) - pure_test = TestPureVectorSearch() - pure_db = next(pure_test.pure_vector_db()) - pure_results = pure_test.test_pure_vector_queries(pure_db) - - # Test LLM-enhanced search - print("\n2. Testing LLM-Enhanced Search (Code → LLM → fastembed)") - print("-" * 70) - llm_test = TestLLMEnhancedSearch() - llm_db = next(llm_test.llm_enhanced_db()) - llm_results = llm_test.test_llm_enhanced_queries(llm_db) - - # Compare results - print("\n3. COMPARISON RESULTS") - print("="*70) - print(f"{'Query':<50} {'Pure Vec':<12} {'LLM Enhanced':<12}") - print("-" * 70) - - pure_score = 0 - llm_score = 0 - - for test_case in TEST_QUERIES: - query = test_case["query"][:47] + "..." 
if len(test_case["query"]) > 50 else test_case["query"] - - pure_res = pure_results.get(test_case["query"], {}) - llm_res = llm_results.get(test_case["query"], {}) - - pure_status = f"[OK] Rank {pure_res.get('rank', '?')}" if pure_res.get('found') else "[X] Not found" - llm_status = f"[OK] Rank {llm_res.get('rank', '?')}" if llm_res.get('found') else "[X] Not found" - - print(f"{query:<50} {pure_status:<12} {llm_status:<12}") - - if pure_res.get('found'): - pure_score += (4 - pure_res['rank']) # 3 points for rank 1, 2 for rank 2, etc - if llm_res.get('found'): - llm_score += (4 - llm_res['rank']) - - print("-" * 70) - print(f"{'TOTAL SCORE':<50} {pure_score:<12} {llm_score:<12}") - print("="*70) - - # Interpretation - print("\nINTERPRETATION:") - if llm_score > pure_score: - improvement = ((llm_score - pure_score) / max(pure_score, 1)) * 100 - print(f"[OK] LLM enhancement improves results by {improvement:.1f}%") - print(" LLM summaries match natural language queries better than raw code") - elif pure_score > llm_score: - print("[X] Pure vector search performed better (unexpected)") - print(" This may indicate LLM summaries are too generic") - else: - print("= Both approaches performed equally") - - except Exception as e: - pytest.fail(f"Comparison test failed: {e}") - - -if __name__ == "__main__": - pytest.main([__file__, "-v", "-s"]) diff --git a/codex-lens/tests/test_llm_enhancer.py b/codex-lens/tests/test_llm_enhancer.py deleted file mode 100644 index de5c8f97..00000000 --- a/codex-lens/tests/test_llm_enhancer.py +++ /dev/null @@ -1,1344 +0,0 @@ -"""Tests for LLM-based semantic enhancement functionality. 
- -Tests cover: -- LLMConfig and data classes -- LLMEnhancer initialization and configuration -- Prompt building and JSON parsing -- Batch processing logic -- CCW CLI invocation (mocked) -- EnhancedSemanticIndexer integration -- Error handling and fallback behavior -""" - -import json -import tempfile -from pathlib import Path -from typing import Dict, Any -from unittest.mock import MagicMock, patch, PropertyMock - -import pytest - -from codexlens.entities import SemanticChunk, Symbol -from codexlens.semantic.llm_enhancer import ( - SemanticMetadata, - FileData, - LLMConfig, - LLMEnhancer, - EnhancedSemanticIndexer, - create_enhancer, - create_enhanced_indexer, -) - - -# === Data Class Tests === - -class TestSemanticMetadata: - """Tests for SemanticMetadata dataclass.""" - - def test_basic_creation(self): - """Test creating SemanticMetadata with required fields.""" - metadata = SemanticMetadata( - summary="Authentication handler", - keywords=["auth", "login", "jwt"], - purpose="auth", - ) - assert metadata.summary == "Authentication handler" - assert metadata.keywords == ["auth", "login", "jwt"] - assert metadata.purpose == "auth" - assert metadata.file_path is None - assert metadata.symbol_name is None - assert metadata.llm_tool is None - - def test_full_creation(self): - """Test creating SemanticMetadata with all fields.""" - metadata = SemanticMetadata( - summary="User login function", - keywords=["login", "user"], - purpose="auth", - file_path="/test/auth.py", - symbol_name="login", - llm_tool="gemini", - ) - assert metadata.file_path == "/test/auth.py" - assert metadata.symbol_name == "login" - assert metadata.llm_tool == "gemini" - - def test_empty_keywords(self): - """Test creating SemanticMetadata with empty keywords.""" - metadata = SemanticMetadata( - summary="Empty", - keywords=[], - purpose="", - ) - assert metadata.keywords == [] - - -class TestFileData: - """Tests for FileData dataclass.""" - - def test_basic_creation(self): - """Test creating 
FileData with required fields.""" - data = FileData( - path="/test/file.py", - content="def hello(): pass", - language="python", - ) - assert data.path == "/test/file.py" - assert data.content == "def hello(): pass" - assert data.language == "python" - assert data.symbols == [] - - def test_with_symbols(self): - """Test creating FileData with symbols.""" - symbols = [ - Symbol(name="hello", kind="function", range=(1, 1)), - Symbol(name="MyClass", kind="class", range=(3, 10)), - ] - data = FileData( - path="/test/file.py", - content="code", - language="python", - symbols=symbols, - ) - assert len(data.symbols) == 2 - assert data.symbols[0].name == "hello" - - -class TestLLMConfig: - """Tests for LLMConfig dataclass.""" - - def test_default_values(self): - """Test default configuration values.""" - config = LLMConfig() - assert config.tool == "gemini" - assert config.fallback_tool == "qwen" - assert config.timeout_ms == 300000 - assert config.batch_size == 5 - assert config.max_content_chars == 8000 - assert config.enabled is True - - def test_custom_values(self): - """Test custom configuration values.""" - config = LLMConfig( - tool="qwen", - fallback_tool="gemini", - timeout_ms=600000, - batch_size=10, - max_content_chars=4000, - enabled=False, - ) - assert config.tool == "qwen" - assert config.fallback_tool == "gemini" - assert config.timeout_ms == 600000 - assert config.batch_size == 10 - assert config.max_content_chars == 4000 - assert config.enabled is False - - @patch.dict("os.environ", {"CCW_CLI_SECONDARY_TOOL": "codex", "CCW_CLI_FALLBACK_TOOL": "gemini"}) - def test_env_override(self): - """Test environment variable override.""" - config = LLMConfig() - assert config.tool == "codex" - assert config.fallback_tool == "gemini" - - -# === LLMEnhancer Tests === - -class TestLLMEnhancerInit: - """Tests for LLMEnhancer initialization.""" - - def test_default_init(self): - """Test default initialization.""" - enhancer = LLMEnhancer() - assert enhancer.config is not 
None - assert enhancer.config.tool == "gemini" - assert enhancer._ccw_available is None - - def test_custom_config(self): - """Test initialization with custom config.""" - config = LLMConfig(tool="qwen", batch_size=3) - enhancer = LLMEnhancer(config) - assert enhancer.config.tool == "qwen" - assert enhancer.config.batch_size == 3 - - -class TestLLMEnhancerAvailability: - """Tests for CCW CLI availability check.""" - - @patch("shutil.which") - def test_ccw_available(self, mock_which): - """Test CCW available returns True.""" - mock_which.return_value = "/usr/bin/ccw" - enhancer = LLMEnhancer() - - result = enhancer.check_available() - - assert result is True - assert enhancer._ccw_available is True - mock_which.assert_called_with("ccw") - - @patch("shutil.which") - def test_ccw_not_available(self, mock_which): - """Test CCW not available returns False.""" - mock_which.return_value = None - enhancer = LLMEnhancer() - - result = enhancer.check_available() - - assert result is False - assert enhancer._ccw_available is False - - @patch("shutil.which") - def test_ccw_availability_cached(self, mock_which): - """Test availability result is cached.""" - mock_which.return_value = "/usr/bin/ccw" - enhancer = LLMEnhancer() - - # First call - enhancer.check_available() - # Second call - enhancer.check_available() - - # which should only be called once - mock_which.assert_called_once() - - -class TestPromptBuilding: - """Tests for prompt building.""" - - def test_build_single_file_prompt(self): - """Test prompt building with single file.""" - enhancer = LLMEnhancer() - files = [ - FileData( - path="/test/auth.py", - content="def login(): pass", - language="python", - ) - ] - - prompt = enhancer._build_batch_prompt(files) - - assert "[FILE: /test/auth.py]" in prompt - assert "```python" in prompt - assert "def login(): pass" in prompt - assert "PURPOSE:" in prompt - assert "JSON format output" in prompt - - def test_build_multiple_files_prompt(self): - """Test prompt building 
with multiple files.""" - enhancer = LLMEnhancer() - files = [ - FileData(path="/test/a.py", content="def a(): pass", language="python"), - FileData(path="/test/b.js", content="function b() {}", language="javascript"), - ] - - prompt = enhancer._build_batch_prompt(files) - - assert "[FILE: /test/a.py]" in prompt - assert "[FILE: /test/b.js]" in prompt - assert "```python" in prompt - assert "```javascript" in prompt - - def test_build_prompt_truncates_long_content(self): - """Test prompt truncates long content.""" - config = LLMConfig(max_content_chars=100) - enhancer = LLMEnhancer(config) - - long_content = "x" * 200 - files = [FileData(path="/test/long.py", content=long_content, language="python")] - - prompt = enhancer._build_batch_prompt(files) - - assert "... [truncated]" in prompt - assert "x" * 200 not in prompt - - -class TestJSONParsing: - """Tests for JSON response parsing.""" - - def test_parse_valid_response(self): - """Test parsing valid JSON response.""" - enhancer = LLMEnhancer() - response = json.dumps({ - "files": { - "/test/auth.py": { - "summary": "Authentication handler", - "keywords": ["auth", "login"], - "purpose": "auth", - } - } - }) - - result = enhancer._parse_response(response, "gemini") - - assert "/test/auth.py" in result - assert result["/test/auth.py"].summary == "Authentication handler" - assert result["/test/auth.py"].keywords == ["auth", "login"] - assert result["/test/auth.py"].purpose == "auth" - assert result["/test/auth.py"].llm_tool == "gemini" - - def test_parse_response_with_markdown(self): - """Test parsing response wrapped in markdown.""" - enhancer = LLMEnhancer() - response = '''```json -{ - "files": { - "/test/file.py": { - "summary": "Test file", - "keywords": ["test"], - "purpose": "test" - } - } -} -```''' - - result = enhancer._parse_response(response, "qwen") - - assert "/test/file.py" in result - assert result["/test/file.py"].summary == "Test file" - - def test_parse_response_multiple_files(self): - """Test 
parsing response with multiple files.""" - enhancer = LLMEnhancer() - response = json.dumps({ - "files": { - "/test/a.py": {"summary": "File A", "keywords": ["a"], "purpose": "util"}, - "/test/b.py": {"summary": "File B", "keywords": ["b"], "purpose": "api"}, - } - }) - - result = enhancer._parse_response(response, "gemini") - - assert len(result) == 2 - assert result["/test/a.py"].summary == "File A" - assert result["/test/b.py"].summary == "File B" - - def test_parse_invalid_json(self): - """Test parsing invalid JSON returns empty dict.""" - enhancer = LLMEnhancer() - response = "not valid json at all" - - result = enhancer._parse_response(response, "gemini") - - assert result == {} - - def test_parse_empty_response(self): - """Test parsing empty response returns empty dict.""" - enhancer = LLMEnhancer() - - result = enhancer._parse_response("", "gemini") - - assert result == {} - - -class TestJSONExtraction: - """Tests for JSON extraction from mixed text.""" - - def test_extract_json_from_plain(self): - """Test extracting JSON from plain text.""" - enhancer = LLMEnhancer() - text = '{"key": "value"}' - - result = enhancer._extract_json(text) - - assert result == '{"key": "value"}' - - def test_extract_json_from_markdown(self): - """Test extracting JSON from markdown code block.""" - enhancer = LLMEnhancer() - text = '''```json -{"key": "value"} -```''' - - result = enhancer._extract_json(text) - - assert result == '{"key": "value"}' - - def test_extract_json_with_surrounding_text(self): - """Test extracting JSON with surrounding text.""" - enhancer = LLMEnhancer() - text = 'Here is the result: {"key": "value"} That is all.' 
- - result = enhancer._extract_json(text) - - assert result == '{"key": "value"}' - - def test_extract_nested_json(self): - """Test extracting nested JSON.""" - enhancer = LLMEnhancer() - text = '{"outer": {"inner": "value"}}' - - result = enhancer._extract_json(text) - - assert '"outer"' in result - assert '"inner"' in result - - def test_extract_no_json(self): - """Test extracting from text without JSON.""" - enhancer = LLMEnhancer() - text = "No JSON here at all" - - result = enhancer._extract_json(text) - - assert result is None - - def test_extract_malformed_json(self): - """Test extracting malformed JSON returns None.""" - enhancer = LLMEnhancer() - text = '{"key": "value"' # Missing closing brace - - result = enhancer._extract_json(text) - - assert result is None - - -class TestEnhanceFiles: - """Tests for enhance_files method.""" - - @patch.object(LLMEnhancer, "check_available", return_value=False) - def test_enhance_files_ccw_not_available(self, mock_check): - """Test enhance_files returns empty when CCW not available.""" - enhancer = LLMEnhancer() - files = [FileData(path="/test/a.py", content="code", language="python")] - - result = enhancer.enhance_files(files) - - assert result == {} - - def test_enhance_files_disabled(self): - """Test enhance_files returns empty when disabled.""" - config = LLMConfig(enabled=False) - enhancer = LLMEnhancer(config) - files = [FileData(path="/test/a.py", content="code", language="python")] - - result = enhancer.enhance_files(files) - - assert result == {} - - @patch.object(LLMEnhancer, "check_available", return_value=True) - def test_enhance_files_empty_list(self, mock_check): - """Test enhance_files with empty list returns empty dict.""" - enhancer = LLMEnhancer() - - result = enhancer.enhance_files([]) - - assert result == {} - - @patch.object(LLMEnhancer, "check_available", return_value=True) - @patch.object(LLMEnhancer, "_invoke_ccw_cli") - def test_enhance_files_success(self, mock_invoke, mock_check): - """Test 
enhance_files successful processing.""" - mock_invoke.return_value = { - "success": True, - "stdout": json.dumps({ - "files": { - "/test/auth.py": { - "summary": "Auth module", - "keywords": ["auth"], - "purpose": "auth", - } - } - }), - "stderr": "", - "exit_code": 0, - } - - enhancer = LLMEnhancer() - files = [FileData(path="/test/auth.py", content="def login(): pass", language="python")] - - result = enhancer.enhance_files(files) - - assert "/test/auth.py" in result - assert result["/test/auth.py"].summary == "Auth module" - - @patch.object(LLMEnhancer, "check_available", return_value=True) - @patch.object(LLMEnhancer, "_invoke_ccw_cli") - def test_enhance_files_fallback(self, mock_invoke, mock_check): - """Test enhance_files falls back to secondary tool.""" - # First call fails, second succeeds - mock_invoke.side_effect = [ - {"success": False, "stdout": "", "stderr": "error", "exit_code": 1}, - { - "success": True, - "stdout": json.dumps({ - "files": { - "/test/file.py": { - "summary": "Fallback result", - "keywords": ["fallback"], - "purpose": "util", - } - } - }), - "stderr": "", - "exit_code": 0, - }, - ] - - enhancer = LLMEnhancer() - files = [FileData(path="/test/file.py", content="code", language="python")] - - result = enhancer.enhance_files(files) - - assert "/test/file.py" in result - assert result["/test/file.py"].summary == "Fallback result" - assert mock_invoke.call_count == 2 - - -class TestEnhanceFile: - """Tests for enhance_file single file method.""" - - @patch.object(LLMEnhancer, "enhance_files") - def test_enhance_file_success(self, mock_enhance_files): - """Test enhance_file returns metadata on success.""" - mock_enhance_files.return_value = { - "/test/auth.py": SemanticMetadata( - summary="Auth module", - keywords=["auth", "login"], - purpose="auth", - file_path="/test/auth.py", - llm_tool="gemini", - ) - } - - enhancer = LLMEnhancer() - result = enhancer.enhance_file("/test/auth.py", "def login(): pass", "python") - - assert result.summary 
== "Auth module" - assert result.keywords == ["auth", "login"] - - @patch.object(LLMEnhancer, "enhance_files") - def test_enhance_file_fallback_on_failure(self, mock_enhance_files): - """Test enhance_file returns default metadata on failure.""" - mock_enhance_files.return_value = {} # Enhancement failed - - enhancer = LLMEnhancer() - result = enhancer.enhance_file("/test/file.py", "code", "python") - - assert "python" in result.summary.lower() - assert "python" in result.keywords - assert result.purpose == "unknown" - - -class TestBatchProcessing: - """Tests for batch processing.""" - - @patch.object(LLMEnhancer, "check_available", return_value=True) - @patch.object(LLMEnhancer, "_process_batch") - def test_batch_processing(self, mock_process, mock_check): - """Test files are processed in batches.""" - mock_process.return_value = {} - - config = LLMConfig(batch_size=2) - enhancer = LLMEnhancer(config) - - files = [ - FileData(path=f"/test/file{i}.py", content="code", language="python") - for i in range(5) - ] - - enhancer.enhance_files(files) - - # 5 files with batch_size=2 should result in 3 batches - assert mock_process.call_count == 3 - - @patch.object(LLMEnhancer, "check_available", return_value=True) - @patch.object(LLMEnhancer, "_process_batch") - def test_batch_continues_on_error(self, mock_process, mock_check): - """Test batch processing continues on error.""" - # First batch fails, second succeeds - mock_process.side_effect = [ - Exception("Batch 1 failed"), - {"/test/file2.py": SemanticMetadata(summary="OK", keywords=[], purpose="")}, - ] - - config = LLMConfig(batch_size=1) - enhancer = LLMEnhancer(config) - - files = [ - FileData(path="/test/file1.py", content="code", language="python"), - FileData(path="/test/file2.py", content="code", language="python"), - ] - - result = enhancer.enhance_files(files) - - # Should still get results from second batch - assert "/test/file2.py" in result - - -# === CCW CLI Invocation Tests === - -class TestCCWInvocation: 
- """Tests for CCW CLI invocation.""" - - @patch("subprocess.run") - @patch("shutil.which", return_value="/usr/bin/ccw") - def test_invoke_success(self, mock_which, mock_run): - """Test successful CCW CLI invocation.""" - mock_run.return_value = MagicMock( - returncode=0, - stdout='{"files": {}}', - stderr="", - ) - - enhancer = LLMEnhancer() - result = enhancer._invoke_ccw_cli("test prompt", tool="gemini") - - assert result["success"] is True - assert result["exit_code"] == 0 - - @patch("subprocess.run") - @patch("shutil.which", return_value="/usr/bin/ccw") - def test_invoke_failure(self, mock_which, mock_run): - """Test failed CCW CLI invocation.""" - mock_run.return_value = MagicMock( - returncode=1, - stdout="", - stderr="Error occurred", - ) - - enhancer = LLMEnhancer() - result = enhancer._invoke_ccw_cli("test prompt", tool="gemini") - - assert result["success"] is False - assert result["exit_code"] == 1 - - @patch("subprocess.run") - @patch("shutil.which", return_value="/usr/bin/ccw") - def test_invoke_timeout(self, mock_which, mock_run): - """Test CCW CLI timeout handling.""" - import subprocess - mock_run.side_effect = subprocess.TimeoutExpired(cmd="ccw", timeout=300) - - enhancer = LLMEnhancer() - result = enhancer._invoke_ccw_cli("test prompt", tool="gemini") - - assert result["success"] is False - assert "timeout" in result["stderr"] - - @patch("subprocess.run") - @patch("shutil.which", return_value=None) - def test_invoke_ccw_not_found(self, mock_which, mock_run): - """Test CCW CLI not found handling.""" - mock_run.side_effect = FileNotFoundError() - - enhancer = LLMEnhancer() - result = enhancer._invoke_ccw_cli("test prompt", tool="gemini") - - assert result["success"] is False - assert "not found" in result["stderr"] - - -# === EnhancedSemanticIndexer Tests === - -class TestEnhancedSemanticIndexer: - """Tests for EnhancedSemanticIndexer integration.""" - - @pytest.fixture - def mock_enhancer(self): - """Create mock LLM enhancer.""" - enhancer = 
MagicMock(spec=LLMEnhancer) - enhancer.enhance_files.return_value = { - "/test/auth.py": SemanticMetadata( - summary="Authentication handler", - keywords=["auth", "login", "jwt"], - purpose="auth", - file_path="/test/auth.py", - llm_tool="gemini", - ) - } - return enhancer - - @pytest.fixture - def mock_embedder(self): - """Create mock embedder.""" - embedder = MagicMock() - embedder.embed.return_value = [[0.1] * 384] - embedder.embed_single.return_value = [0.1] * 384 - return embedder - - @pytest.fixture - def mock_vector_store(self): - """Create mock vector store.""" - store = MagicMock() - store.add_chunk.return_value = 1 - return store - - def test_index_files_empty_list(self, mock_enhancer, mock_embedder, mock_vector_store): - """Test indexing empty file list.""" - indexer = EnhancedSemanticIndexer(mock_enhancer, mock_embedder, mock_vector_store) - - result = indexer.index_files([]) - - assert result == 0 - mock_enhancer.enhance_files.assert_not_called() - - def test_index_files_with_llm_enhancement(self, mock_enhancer, mock_embedder, mock_vector_store): - """Test indexing with LLM enhancement.""" - indexer = EnhancedSemanticIndexer(mock_enhancer, mock_embedder, mock_vector_store) - files = [FileData(path="/test/auth.py", content="def login(): pass", language="python")] - - result = indexer.index_files(files) - - assert result == 1 - mock_enhancer.enhance_files.assert_called_once() - mock_embedder.embed.assert_called_once() - mock_vector_store.add_chunk.assert_called_once() - - def test_index_files_fallback_to_raw_code(self, mock_embedder, mock_vector_store): - """Test indexing falls back to raw code when LLM fails.""" - mock_enhancer = MagicMock(spec=LLMEnhancer) - mock_enhancer.enhance_files.return_value = {} # No enhancement - - indexer = EnhancedSemanticIndexer(mock_enhancer, mock_embedder, mock_vector_store) - files = [FileData(path="/test/file.py", content="code", language="python")] - - result = indexer.index_files(files) - - assert result == 1 - 
mock_embedder.embed_single.assert_called() - - def test_create_embeddable_text(self, mock_enhancer, mock_embedder, mock_vector_store): - """Test embeddable text creation.""" - indexer = EnhancedSemanticIndexer(mock_enhancer, mock_embedder, mock_vector_store) - - metadata = SemanticMetadata( - summary="Handles user authentication", - keywords=["auth", "login", "user"], - purpose="auth", - ) - file_data = FileData(path="/test/auth.py", content="code", language="python") - - text = indexer._create_embeddable_text(metadata, file_data) - - assert "Handles user authentication" in text - assert "auth" in text.lower() - assert "Keywords:" in text - assert "auth.py" in text - - -# === Factory Function Tests === - -class TestFactoryFunctions: - """Tests for factory functions.""" - - def test_create_enhancer_default(self): - """Test create_enhancer with defaults.""" - enhancer = create_enhancer() - - assert enhancer.config.tool == "gemini" - assert enhancer.config.enabled is True - - def test_create_enhancer_custom(self): - """Test create_enhancer with custom params.""" - enhancer = create_enhancer( - tool="qwen", - timeout_ms=600000, - batch_size=10, - enabled=False, - ) - - assert enhancer.config.tool == "qwen" - assert enhancer.config.timeout_ms == 600000 - assert enhancer.config.batch_size == 10 - assert enhancer.config.enabled is False - - @pytest.mark.skipif( - not pytest.importorskip("codexlens.semantic", reason="semantic not available"), - reason="Semantic dependencies not installed" - ) - def test_create_enhanced_indexer(self, tmp_path): - """Test create_enhanced_indexer factory.""" - try: - from codexlens.semantic import SEMANTIC_AVAILABLE - if not SEMANTIC_AVAILABLE: - pytest.skip("Semantic dependencies not installed") - - db_path = tmp_path / "semantic.db" - indexer = create_enhanced_indexer(db_path, llm_tool="gemini", llm_enabled=False) - - assert indexer.enhancer is not None - assert indexer.embedder is not None - assert indexer.vector_store is not None - except 
ImportError: - pytest.skip("Semantic dependencies not installed") - - -# === Edge Cases === - -class TestEdgeCases: - """Tests for edge cases.""" - - def test_semantic_metadata_with_special_chars(self): - """Test metadata with special characters.""" - metadata = SemanticMetadata( - summary='Test "quoted" and \'single\' quotes', - keywords=["special", "chars", "test's"], - purpose="test", - ) - assert '"quoted"' in metadata.summary - assert "test's" in metadata.keywords - - def test_file_data_with_unicode(self): - """Test FileData with unicode content.""" - data = FileData( - path="/test/中文.py", - content="def 你好(): return '世界'", - language="python", - ) - assert "中文" in data.path - assert "你好" in data.content - - @patch.object(LLMEnhancer, "check_available", return_value=True) - @patch.object(LLMEnhancer, "_invoke_ccw_cli") - def test_enhance_with_very_long_content(self, mock_invoke, mock_check): - """Test enhancement with very long content.""" - mock_invoke.return_value = { - "success": True, - "stdout": json.dumps({"files": {}}), - "stderr": "", - "exit_code": 0, - } - - config = LLMConfig(max_content_chars=100) - enhancer = LLMEnhancer(config) - - long_content = "x" * 10000 - files = [FileData(path="/test/long.py", content=long_content, language="python")] - - enhancer.enhance_files(files) - - # Should not crash, content should be truncated in prompt - mock_invoke.assert_called_once() - - def test_parse_response_with_missing_fields(self): - """Test parsing response with missing fields.""" - enhancer = LLMEnhancer() - response = json.dumps({ - "files": { - "/test/file.py": { - "summary": "Only summary provided", - # keywords and purpose missing - } - } - }) - - result = enhancer._parse_response(response, "gemini") - - assert "/test/file.py" in result - assert result["/test/file.py"].summary == "Only summary provided" - assert result["/test/file.py"].keywords == [] - assert result["/test/file.py"].purpose == "" - - -# === Chunk Boundary Refinement Tests === - 
-class TestRefineChunkBoundaries: - """Tests for refine_chunk_boundaries method.""" - - def test_refine_skips_docstring_chunks(self): - """Test that chunks with metadata type='docstring' pass through unchanged.""" - enhancer = LLMEnhancer() - - chunk = SemanticChunk( - content='"""This is a docstring."""\n' * 100, # Large docstring - embedding=None, - metadata={ - "chunk_type": "docstring", - "file": "/test/file.py", - "start_line": 1, - "end_line": 100, - } - ) - - result = enhancer.refine_chunk_boundaries(chunk, max_chunk_size=500) - - # Should return original chunk unchanged - assert len(result) == 1 - assert result[0] is chunk - - def test_refine_skips_small_chunks(self): - """Test that chunks under max_chunk_size pass through unchanged.""" - enhancer = LLMEnhancer() - - small_content = "def small_function():\n return 42" - chunk = SemanticChunk( - content=small_content, - embedding=None, - metadata={ - "chunk_type": "code", - "file": "/test/file.py", - "start_line": 1, - "end_line": 2, - } - ) - - result = enhancer.refine_chunk_boundaries(chunk, max_chunk_size=2000) - - # Small chunk should pass through unchanged - assert len(result) == 1 - assert result[0] is chunk - - @patch.object(LLMEnhancer, "check_available", return_value=True) - @patch.object(LLMEnhancer, "_invoke_ccw_cli") - def test_refine_splits_large_chunks(self, mock_invoke, mock_check): - """Test that chunks over threshold are split at LLM-suggested points.""" - mock_invoke.return_value = { - "success": True, - "stdout": json.dumps({ - "split_points": [ - {"line": 5, "reason": "end of first function"}, - {"line": 10, "reason": "end of second function"} - ] - }), - "stderr": "", - "exit_code": 0, - } - - enhancer = LLMEnhancer() - - # Create large chunk with clear line boundaries - lines = [] - for i in range(15): - lines.append(f"def func{i}():\n") - lines.append(f" return {i}\n") - - large_content = "".join(lines) - - chunk = SemanticChunk( - content=large_content, - embedding=None, - metadata={ 
- "chunk_type": "code", - "file": "/test/file.py", - "start_line": 1, - "end_line": 30, - } - ) - - result = enhancer.refine_chunk_boundaries(chunk, max_chunk_size=100) - - # Should split into multiple chunks - assert len(result) > 1 - # All chunks should have refined_by_llm metadata - assert all(c.metadata.get("refined_by_llm") is True for c in result) - # All chunks should preserve file metadata - assert all(c.metadata.get("file") == "/test/file.py" for c in result) - - @patch.object(LLMEnhancer, "check_available", return_value=True) - @patch.object(LLMEnhancer, "_invoke_ccw_cli") - def test_refine_handles_empty_split_points(self, mock_invoke, mock_check): - """Test graceful handling when LLM returns no split points.""" - mock_invoke.return_value = { - "success": True, - "stdout": json.dumps({"split_points": []}), - "stderr": "", - "exit_code": 0, - } - - enhancer = LLMEnhancer() - - large_content = "x" * 3000 - chunk = SemanticChunk( - content=large_content, - embedding=None, - metadata={ - "chunk_type": "code", - "file": "/test/file.py", - "start_line": 1, - "end_line": 1, - } - ) - - result = enhancer.refine_chunk_boundaries(chunk, max_chunk_size=1000) - - # Should return original chunk when no split points - assert len(result) == 1 - assert result[0].content == large_content - - def test_refine_disabled_returns_unchanged(self): - """Test that when config.enabled=False, refinement returns input unchanged.""" - config = LLMConfig(enabled=False) - enhancer = LLMEnhancer(config) - - large_content = "x" * 3000 - chunk = SemanticChunk( - content=large_content, - embedding=None, - metadata={ - "chunk_type": "code", - "file": "/test/file.py", - } - ) - - result = enhancer.refine_chunk_boundaries(chunk, max_chunk_size=1000) - - # Should return original chunk when disabled - assert len(result) == 1 - assert result[0] is chunk - - @patch.object(LLMEnhancer, "check_available", return_value=False) - def test_refine_ccw_unavailable_returns_unchanged(self, mock_check): - 
"""Test that when CCW is unavailable, refinement returns input unchanged.""" - enhancer = LLMEnhancer() - - large_content = "x" * 3000 - chunk = SemanticChunk( - content=large_content, - embedding=None, - metadata={ - "chunk_type": "code", - "file": "/test/file.py", - } - ) - - result = enhancer.refine_chunk_boundaries(chunk, max_chunk_size=1000) - - # Should return original chunk when CCW unavailable - assert len(result) == 1 - assert result[0] is chunk - - @patch.object(LLMEnhancer, "check_available", return_value=True) - @patch.object(LLMEnhancer, "_invoke_ccw_cli") - def test_refine_fallback_on_primary_failure(self, mock_invoke, mock_check): - """Test that refinement falls back to secondary tool on primary failure.""" - # Primary fails, fallback succeeds - mock_invoke.side_effect = [ - {"success": False, "stdout": "", "stderr": "error", "exit_code": 1}, - { - "success": True, - "stdout": json.dumps({"split_points": [{"line": 5, "reason": "split"}]}), - "stderr": "", - "exit_code": 0, - }, - ] - - enhancer = LLMEnhancer() - - chunk = SemanticChunk( - content="def func():\n pass\n" * 100, - embedding=None, - metadata={ - "chunk_type": "code", - "file": "/test/file.py", - "start_line": 1, - "end_line": 200, - } - ) - - result = enhancer.refine_chunk_boundaries(chunk, max_chunk_size=100) - - # Should use fallback tool - assert mock_invoke.call_count == 2 - # Should successfully split - assert len(result) > 1 - - @patch.object(LLMEnhancer, "check_available", return_value=True) - @patch.object(LLMEnhancer, "_invoke_ccw_cli") - def test_refine_returns_original_on_error(self, mock_invoke, mock_check): - """Test that refinement returns original chunk on error.""" - mock_invoke.side_effect = Exception("Unexpected error") - - enhancer = LLMEnhancer() - - chunk = SemanticChunk( - content="x" * 3000, - embedding=None, - metadata={ - "chunk_type": "code", - "file": "/test/file.py", - } - ) - - result = enhancer.refine_chunk_boundaries(chunk, max_chunk_size=1000) - - # Should 
return original chunk on error - assert len(result) == 1 - assert result[0] is chunk - - -class TestParseSplitPoints: - """Tests for _parse_split_points helper method.""" - - def test_parse_valid_split_points(self): - """Test parsing valid split points from JSON response.""" - enhancer = LLMEnhancer() - - stdout = json.dumps({ - "split_points": [ - {"line": 5, "reason": "end of function"}, - {"line": 10, "reason": "class boundary"}, - {"line": 15, "reason": "method boundary"} - ] - }) - - result = enhancer._parse_split_points(stdout) - - assert result == [5, 10, 15] - - def test_parse_split_points_with_markdown(self): - """Test parsing split points wrapped in markdown.""" - enhancer = LLMEnhancer() - - stdout = '''```json -{ - "split_points": [ - {"line": 5, "reason": "split"}, - {"line": 10, "reason": "split"} - ] -} -```''' - - result = enhancer._parse_split_points(stdout) - - assert result == [5, 10] - - def test_parse_split_points_deduplicates(self): - """Test that duplicate line numbers are deduplicated.""" - enhancer = LLMEnhancer() - - stdout = json.dumps({ - "split_points": [ - {"line": 5, "reason": "split"}, - {"line": 5, "reason": "duplicate"}, - {"line": 10, "reason": "split"} - ] - }) - - result = enhancer._parse_split_points(stdout) - - assert result == [5, 10] - - def test_parse_split_points_sorts(self): - """Test that split points are sorted.""" - enhancer = LLMEnhancer() - - stdout = json.dumps({ - "split_points": [ - {"line": 15, "reason": "split"}, - {"line": 5, "reason": "split"}, - {"line": 10, "reason": "split"} - ] - }) - - result = enhancer._parse_split_points(stdout) - - assert result == [5, 10, 15] - - def test_parse_split_points_ignores_invalid(self): - """Test that invalid split points are ignored.""" - enhancer = LLMEnhancer() - - stdout = json.dumps({ - "split_points": [ - {"line": 5, "reason": "valid"}, - {"line": -1, "reason": "negative"}, - {"line": 0, "reason": "zero"}, - {"line": "not_a_number", "reason": "string"}, - {"reason": 
"missing line field"}, - 10 # Not a dict - ] - }) - - result = enhancer._parse_split_points(stdout) - - assert result == [5] - - def test_parse_split_points_empty_list(self): - """Test parsing empty split points list.""" - enhancer = LLMEnhancer() - - stdout = json.dumps({"split_points": []}) - - result = enhancer._parse_split_points(stdout) - - assert result == [] - - def test_parse_split_points_no_json(self): - """Test parsing when no JSON is found.""" - enhancer = LLMEnhancer() - - stdout = "No JSON here at all" - - result = enhancer._parse_split_points(stdout) - - assert result == [] - - def test_parse_split_points_invalid_json(self): - """Test parsing invalid JSON.""" - enhancer = LLMEnhancer() - - stdout = '{"split_points": [invalid json}' - - result = enhancer._parse_split_points(stdout) - - assert result == [] - - -class TestSplitChunkAtPoints: - """Tests for _split_chunk_at_points helper method.""" - - def test_split_chunk_at_points_correctness(self): - """Test that chunks are split correctly at specified line numbers.""" - enhancer = LLMEnhancer() - - # Create chunk with enough content per section to not be filtered (>50 chars each) - lines = [] - for i in range(1, 16): - lines.append(f"def function_number_{i}(): # This is function {i}\n") - lines.append(f" return value_{i}\n") - content = "".join(lines) # 30 lines total - - chunk = SemanticChunk( - content=content, - embedding=None, - metadata={ - "chunk_type": "code", - "file": "/test/file.py", - "start_line": 1, - "end_line": 30, - } - ) - - # Split at line indices 10 and 20 (boundaries will be [0, 10, 20, 30]) - split_points = [10, 20] - - result = enhancer._split_chunk_at_points(chunk, split_points) - - # Should create 3 chunks with sufficient content - assert len(result) == 3 - - # Verify they all have the refined metadata - assert all(c.metadata.get("refined_by_llm") is True for c in result) - assert all("original_chunk_size" in c.metadata for c in result) - - def 
test_split_chunk_preserves_metadata(self): - """Test that split chunks preserve original metadata.""" - enhancer = LLMEnhancer() - - # Create content with enough characters (>50) in each section - content = "# This is a longer line with enough content\n" * 5 - - chunk = SemanticChunk( - content=content, - embedding=None, - metadata={ - "chunk_type": "code", - "file": "/test/file.py", - "language": "python", - "start_line": 10, - "end_line": 15, - } - ) - - split_points = [2] # Split at line 2 - result = enhancer._split_chunk_at_points(chunk, split_points) - - # At least one chunk should be created - assert len(result) >= 1 - - for new_chunk in result: - assert new_chunk.metadata["chunk_type"] == "code" - assert new_chunk.metadata["file"] == "/test/file.py" - assert new_chunk.metadata["language"] == "python" - assert new_chunk.metadata.get("refined_by_llm") is True - assert "original_chunk_size" in new_chunk.metadata - - def test_split_chunk_skips_tiny_sections(self): - """Test that very small sections are skipped.""" - enhancer = LLMEnhancer() - - # Create content where middle section will be tiny - content = ( - "# Long line with lots of content to exceed 50 chars\n" * 3 + - "x\n" + # Tiny section - "# Another long line with lots of content here too\n" * 3 - ) - - chunk = SemanticChunk( - content=content, - embedding=None, - metadata={ - "chunk_type": "code", - "file": "/test/file.py", - "start_line": 1, - "end_line": 7, - } - ) - - # Split to create tiny middle section - split_points = [3, 4] - result = enhancer._split_chunk_at_points(chunk, split_points) - - # Tiny sections (< 50 chars stripped) should be filtered out - # Should have 2 chunks (first 3 lines and last 3 lines), middle filtered - assert all(len(c.content.strip()) >= 50 for c in result) - - def test_split_chunk_empty_split_points(self): - """Test splitting with empty split points list.""" - enhancer = LLMEnhancer() - - content = "# Content line\n" * 10 - chunk = SemanticChunk( - content=content, - 
embedding=None, - metadata={ - "chunk_type": "code", - "file": "/test/file.py", - "start_line": 1, - "end_line": 10, - } - ) - - result = enhancer._split_chunk_at_points(chunk, []) - - # Should return single chunk (original when content > 50 chars) - assert len(result) == 1 - - def test_split_chunk_sets_embedding_none(self): - """Test that split chunks have embedding set to None.""" - enhancer = LLMEnhancer() - - content = "# This is a longer line with enough content here\n" * 5 - chunk = SemanticChunk( - content=content, - embedding=[0.1] * 384, # Has embedding - metadata={ - "chunk_type": "code", - "file": "/test/file.py", - "start_line": 1, - "end_line": 5, - } - ) - - split_points = [2] - result = enhancer._split_chunk_at_points(chunk, split_points) - - # All split chunks should have None embedding (will be regenerated) - assert len(result) >= 1 - assert all(c.embedding is None for c in result) - - def test_split_chunk_returns_original_if_no_valid_chunks(self): - """Test that original chunk is returned if no valid chunks created.""" - enhancer = LLMEnhancer() - - # Very small content - content = "x" - chunk = SemanticChunk( - content=content, - embedding=None, - metadata={ - "chunk_type": "code", - "file": "/test/file.py", - "start_line": 1, - "end_line": 1, - } - ) - - # Split at invalid point - split_points = [1] - result = enhancer._split_chunk_at_points(chunk, split_points) - - # Should return original chunk when no valid splits - assert len(result) == 1 - assert result[0] is chunk