From b9b2932f50712fa926d8936f6767fdb60ea15ae4 Mon Sep 17 00:00:00 2001 From: catlog22 Date: Sun, 8 Feb 2026 21:54:42 +0800 Subject: [PATCH] Add tests and implement functionality for staged cascade search and LSP expansion - Introduced a new JSON file for verbose output of the Codex Lens search results. - Added unit tests for binary search functionality in `test_stage1_binary_search_uses_chunk_lines.py`. - Implemented regression tests for staged cascade Stage 2 expansion depth in `test_staged_cascade_lsp_depth.py`. - Created unit tests for staged cascade Stage 2 realtime LSP graph expansion in `test_staged_cascade_realtime_lsp.py`. - Enhanced the ChainSearchEngine to respect configuration settings for staged LSP depth and improve search accuracy. --- .../workspace/WorkspaceSelector.tsx | 103 +---- ccw/frontend/src/lib/nativeDialog.ts | 36 ++ ccw/frontend/src/lib/unsplash.ts | 2 +- ccw/frontend/src/locales/en/workspace.json | 7 +- ccw/frontend/src/locales/zh/workspace.json | 7 +- ccw/frontend/src/pages/SettingsPage.tsx | 162 ++----- ccw/src/core/routes/system-routes.ts | 113 +++++ ccw/src/tools/ask-question.ts | 25 +- codex-lens/_tmp_search.json | 415 ++++++++++++++++++ codex-lens/_tmp_search2.json | 415 ++++++++++++++++++ codex-lens/_tmp_verbose.json | 95 ++++ codex-lens/src/codexlens/cli/commands.py | 118 ++++- codex-lens/src/codexlens/config.py | 5 + codex-lens/src/codexlens/lsp/lsp_bridge.py | 15 +- .../src/codexlens/lsp/lsp_graph_builder.py | 11 +- .../src/codexlens/search/chain_search.py | 287 ++++++++++-- ...t_stage1_binary_search_uses_chunk_lines.py | 65 +++ .../tests/test_staged_cascade_lsp_depth.py | 168 +++++++ .../tests/test_staged_cascade_realtime_lsp.py | 98 +++++ .../tests/unit/lsp/test_lsp_edge_cases.py | 18 + 20 files changed, 1882 insertions(+), 283 deletions(-) create mode 100644 ccw/frontend/src/lib/nativeDialog.ts create mode 100644 codex-lens/_tmp_search.json create mode 100644 codex-lens/_tmp_search2.json create mode 100644 codex-lens/_tmp_verbose.json create mode 100644 codex-lens/tests/test_stage1_binary_search_uses_chunk_lines.py create mode 100644 codex-lens/tests/test_staged_cascade_lsp_depth.py create mode 100644 codex-lens/tests/test_staged_cascade_realtime_lsp.py diff --git a/ccw/frontend/src/components/workspace/WorkspaceSelector.tsx b/ccw/frontend/src/components/workspace/WorkspaceSelector.tsx index a45628ac..57eca14c 100644 --- a/ccw/frontend/src/components/workspace/WorkspaceSelector.tsx +++ b/ccw/frontend/src/components/workspace/WorkspaceSelector.tsx @@ -1,12 +1,13 @@ // ======================================== // Workspace Selector Component // ======================================== -// Dropdown for selecting recent workspaces with folder browser and manual path input +// Dropdown for selecting recent workspaces with native folder picker and manual path input -import { useState, useCallback, useRef } from 'react'; +import { useState, useCallback } from 'react'; import { ChevronDown, X, FolderOpen, Check } from 'lucide-react'; import { useIntl } from 'react-intl'; import { cn } from '@/lib/utils'; +import { selectFolder } from '@/lib/nativeDialog'; import { Button } from '@/components/ui/Button'; import { Input } from '@/components/ui/Input'; import { @@ -69,7 +70,7 @@ function truncatePath(path: string, maxChars: number = 40): string { * Workspace selector component * * Provides a dropdown menu for selecting from recent workspace paths, - * a manual path input dialog for entering custom paths, and delete buttons + * a native OS folder picker, a manual 
path input dialog, and delete buttons * for removing paths from recent history. * * @example @@ -86,15 +87,9 @@ export function WorkspaceSelector({ className }: WorkspaceSelectorProps) { // UI state const [isDropdownOpen, setIsDropdownOpen] = useState(false); - const [isBrowseOpen, setIsBrowseOpen] = useState(false); + const [isManualOpen, setIsManualOpen] = useState(false); const [manualPath, setManualPath] = useState(''); - // Hidden file input for folder selection - const folderInputRef = useRef(null); - - /** - * Handle path selection from dropdown - */ const handleSelectPath = useCallback( async (path: string) => { await switchWorkspace(path); @@ -103,77 +98,30 @@ export function WorkspaceSelector({ className }: WorkspaceSelectorProps) { [switchWorkspace] ); - /** - * Handle remove path from recent history - */ const handleRemovePath = useCallback( async (e: React.MouseEvent, path: string) => { - e.stopPropagation(); // Prevent triggering selection + e.stopPropagation(); await removeRecentPath(path); }, [removeRecentPath] ); - /** - * Handle open folder browser - trigger hidden file input click - */ - const handleBrowseFolder = useCallback(() => { + const handleBrowseFolder = useCallback(async () => { setIsDropdownOpen(false); - // Trigger the hidden file input click - folderInputRef.current?.click(); - }, []); + const selected = await selectFolder(projectPath || undefined); + if (selected) { + await switchWorkspace(selected); + } + }, [projectPath, switchWorkspace]); - /** - * Handle folder selection from file input - */ - const handleFolderSelect = useCallback( - async (e: React.ChangeEvent) => { - const files = e.target.files; - if (files && files.length > 0) { - // Get the path from the first file - const firstFile = files[0]; - // The webkitRelativePath contains the full path relative to the selected folder - // We need to get the parent directory path - const relativePath = firstFile.webkitRelativePath; - const folderPath = relativePath.substring(0, relativePath.indexOf('/')); - - // In browser environment, we can't get the full absolute path - // We need to ask the user to confirm or use the folder name - // For now, open the manual dialog with the folder name as hint - setManualPath(folderPath); - setIsBrowseOpen(true); - } - // Reset input value to allow selecting the same folder again - e.target.value = ''; - }, - [] - ); - - /** - * Handle manual path submission - */ const handleManualPathSubmit = useCallback(async () => { const trimmedPath = manualPath.trim(); - if (!trimmedPath) { - return; // TODO: Show validation error - } - + if (!trimmedPath) return; await switchWorkspace(trimmedPath); - setIsBrowseOpen(false); + setIsManualOpen(false); setManualPath(''); }, [manualPath, switchWorkspace]); - /** - * Handle dialog cancel - */ - const handleDialogCancel = useCallback(() => { - setIsBrowseOpen(false); - setManualPath(''); - }, []); - - /** - * Handle keyboard events in dialog input - */ const handleInputKeyDown = useCallback( (e: React.KeyboardEvent) => { if (e.key === 'Enter') { @@ -259,7 +207,7 @@ export function WorkspaceSelector({ className }: WorkspaceSelectorProps) { {recentPaths.length > 0 && } - {/* Browse button to open folder selector */} + {/* Browse button to open native folder selector */} { setIsDropdownOpen(false); - setIsBrowseOpen(true); + setIsManualOpen(true); }} className="cursor-pointer gap-2" > @@ -290,20 +238,8 @@ export function WorkspaceSelector({ className }: WorkspaceSelectorProps) { - {/* Hidden file input for folder selection */} - {/* 
eslint-disable-next-line @typescript-eslint/no-explicit-any */} - - {/* Manual path input dialog */} - + @@ -324,7 +260,10 @@ export function WorkspaceSelector({ className }: WorkspaceSelectorProps) { diff --git a/ccw/frontend/src/lib/nativeDialog.ts b/ccw/frontend/src/lib/nativeDialog.ts new file mode 100644 index 00000000..857067f2 --- /dev/null +++ b/ccw/frontend/src/lib/nativeDialog.ts @@ -0,0 +1,36 @@ +/** + * Native OS dialog helpers + * Calls server-side endpoints that open system-native file/folder picker dialogs. + */ + +export async function selectFolder(initialDir?: string): Promise { + try { + const res = await fetch('/api/dialog/select-folder', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ initialDir }), + }); + if (!res.ok) return null; + const data = await res.json(); + if (data.cancelled) return null; + return data.path || null; + } catch { + return null; + } +} + +export async function selectFile(initialDir?: string): Promise { + try { + const res = await fetch('/api/dialog/select-file', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ initialDir }), + }); + if (!res.ok) return null; + const data = await res.json(); + if (data.cancelled) return null; + return data.path || null; + } catch { + return null; + } +} diff --git a/ccw/frontend/src/lib/unsplash.ts b/ccw/frontend/src/lib/unsplash.ts index b35aaddb..79305759 100644 --- a/ccw/frontend/src/lib/unsplash.ts +++ b/ccw/frontend/src/lib/unsplash.ts @@ -59,7 +59,7 @@ export async function searchUnsplash( export async function uploadBackgroundImage(file: File): Promise<{ url: string; filename: string }> { const headers: Record = { 'Content-Type': file.type, - 'X-Filename': file.name, + 'X-Filename': encodeURIComponent(file.name), }; const csrfToken = getCsrfToken(); if (csrfToken) { diff --git a/ccw/frontend/src/locales/en/workspace.json b/ccw/frontend/src/locales/en/workspace.json index 2d8d7c81..8caaa57e 100644 --- a/ccw/frontend/src/locales/en/workspace.json +++ b/ccw/frontend/src/locales/en/workspace.json @@ -12,7 +12,12 @@ "dialog": { "title": "Select Project Folder", "placeholder": "Enter project path...", - "help": "The path to your project directory" + "help": "The path to your project directory", + "selectCurrent": "Select This Folder", + "parentDir": "Parent Directory", + "loading": "Loading...", + "emptyDir": "Empty directory", + "accessDenied": "Cannot access this directory" } }, "actions": { diff --git a/ccw/frontend/src/locales/zh/workspace.json b/ccw/frontend/src/locales/zh/workspace.json index ccd0f117..1ce1006e 100644 --- a/ccw/frontend/src/locales/zh/workspace.json +++ b/ccw/frontend/src/locales/zh/workspace.json @@ -12,7 +12,12 @@ "dialog": { "title": "选择项目文件夹", "placeholder": "输入项目路径...", - "help": "您的项目目录路径" + "help": "您的项目目录路径", + "selectCurrent": "选择此文件夹", + "parentDir": "上级目录", + "loading": "加载中...", + "emptyDir": "空目录", + "accessDenied": "无法访问此目录" } }, "actions": { diff --git a/ccw/frontend/src/pages/SettingsPage.tsx b/ccw/frontend/src/pages/SettingsPage.tsx index f0410322..58aeb74c 100644 --- a/ccw/frontend/src/pages/SettingsPage.tsx +++ b/ccw/frontend/src/pages/SettingsPage.tsx @@ -35,13 +35,6 @@ import { Card } from '@/components/ui/Card'; import { Button } from '@/components/ui/Button'; import { Input } from '@/components/ui/Input'; import { Badge } from '@/components/ui/Badge'; -import { - Dialog, - DialogContent, - DialogHeader, - DialogTitle, - DialogDescription, -} from '@/components/ui/Dialog'; 
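The new selectFolder helper added in ccw/frontend/src/lib/nativeDialog.ts resolves to null both when the user cancels the native picker and when the /api/dialog/select-folder endpoint fails (for example on a Linux host without zenity or kdialog), so callers that still need a path have to supply their own fallback, as WorkspaceSelector does by keeping the manual path dialog. A minimal caller-side sketch, assuming a hypothetical promptForManualPath callback that is not part of this patch:

// Illustrative sketch only, not part of the patch. `promptForManualPath` is a
// hypothetical callback standing in for whatever manual-entry UI the caller keeps.
import { selectFolder } from '@/lib/nativeDialog';

export async function pickWorkspaceFolder(
  currentPath: string | undefined,
  promptForManualPath: () => Promise<string | null>,
): Promise<string | null> {
  const picked = await selectFolder(currentPath);
  if (picked) {
    return picked;
  }
  // null covers both "user cancelled" and "no native picker available",
  // so offer manual entry in either case.
  return promptForManualPath();
}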
import { ThemeSelector } from '@/components/shared/ThemeSelector'; import { useTheme } from '@/hooks'; import { toast } from 'sonner'; @@ -63,146 +56,43 @@ import { useUpgradeCcwInstallation, } from '@/hooks/useSystemSettings'; -// ========== File Path Input with Browse Dialog ========== - -interface BrowseItem { - name: string; - path: string; - isDirectory: boolean; - isFile: boolean; -} +// ========== File Path Input with Native File Picker ========== interface FilePathInputProps { value: string; onChange: (value: string) => void; placeholder: string; - showHidden?: boolean; } -function FilePathInput({ value, onChange, placeholder, showHidden = true }: FilePathInputProps) { - const [dialogOpen, setDialogOpen] = useState(false); - const [browseItems, setBrowseItems] = useState([]); - const [currentBrowsePath, setCurrentBrowsePath] = useState(''); - const [parentPath, setParentPath] = useState(''); - const [loading, setLoading] = useState(false); - - const browseDirectory = async (dirPath?: string) => { - setLoading(true); - try { - const res = await fetch('/api/dialog/browse', { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ path: dirPath || '~', showHidden }), - }); - if (!res.ok) return; - const data = await res.json(); - setBrowseItems(data.items || []); - setCurrentBrowsePath(data.currentPath || ''); - setParentPath(data.parentPath || ''); - } catch { - // silently fail - } finally { - setLoading(false); +function FilePathInput({ value, onChange, placeholder }: FilePathInputProps) { + const handleBrowse = async () => { + const { selectFile } = await import('@/lib/nativeDialog'); + const initialDir = value ? value.replace(/[/\\][^/\\]*$/, '') : undefined; + const selected = await selectFile(initialDir); + if (selected) { + onChange(selected); } }; - const handleOpen = () => { - setDialogOpen(true); - // If value is set, browse its parent directory; otherwise browse home - const startPath = value ? value.replace(/[/\\][^/\\]*$/, '') : undefined; - browseDirectory(startPath); - }; - - const handleSelectFile = (filePath: string) => { - onChange(filePath); - setDialogOpen(false); - }; - return ( - <> -
- onChange(e.target.value)} - placeholder={placeholder} - className="flex-1" - /> - -
- - - - - - - Browse Files - - - {currentBrowsePath} - - - -
- {loading ? ( -
- -
- ) : ( -
- {/* Parent directory */} - {parentPath && parentPath !== currentBrowsePath && ( - - )} - {browseItems.map((item) => ( - - ))} - {browseItems.length === 0 && ( -
- Empty directory -
- )} -
- )} -
-
-
- +
+ onChange(e.target.value)} + placeholder={placeholder} + className="flex-1" + /> + +
); } diff --git a/ccw/src/core/routes/system-routes.ts b/ccw/src/core/routes/system-routes.ts index b003d48d..db3faea2 100644 --- a/ccw/src/core/routes/system-routes.ts +++ b/ccw/src/core/routes/system-routes.ts @@ -439,6 +439,119 @@ export async function handleSystemRoutes(ctx: SystemRouteContext): Promise { + const { initialDir } = body as { initialDir?: string }; + const os = await import('os'); + const { execFile } = await import('child_process'); + const startDir = initialDir || os.homedir(); + + return new Promise>((resolve) => { + if (process.platform === 'win32') { + const script = `Add-Type -AssemblyName System.Windows.Forms; $d = New-Object System.Windows.Forms.FolderBrowserDialog; $d.SelectedPath = '${startDir.replace(/'/g, "''")}'; $d.ShowNewFolderButton = $true; if ($d.ShowDialog() -eq 'OK') { $d.SelectedPath }`; + execFile('powershell', ['-NoProfile', '-Command', script], + { timeout: 120000 }, + (err, stdout) => { + if (err || !stdout.trim()) { + resolve({ cancelled: true }); + } else { + resolve({ path: stdout.trim() }); + } + } + ); + } else if (process.platform === 'darwin') { + const escapedDir = startDir.replace(/"/g, '\\"'); + const script = `POSIX path of (choose folder with prompt "Select Project Folder" default location POSIX file "${escapedDir}")`; + execFile('osascript', ['-e', script], + { timeout: 120000 }, + (err, stdout) => { + if (err || !stdout.trim()) { + resolve({ cancelled: true }); + } else { + resolve({ path: stdout.trim().replace(/\/$/, '') }); + } + } + ); + } else { + // Linux: try zenity, fallback to kdialog + execFile('zenity', ['--file-selection', '--directory', '--title=Select Project Folder', `--filename=${startDir}/`], + { timeout: 120000 }, + (err, stdout) => { + if (err || !stdout.trim()) { + execFile('kdialog', ['--getexistingdirectory', startDir, '--title', 'Select Project Folder'], + { timeout: 120000 }, + (err2, stdout2) => { + resolve(err2 || !stdout2.trim() ? 
{ cancelled: true } : { path: stdout2.trim() }); + } + ); + } else { + resolve({ path: stdout.trim() }); + } + } + ); + } + }); + }); + return true; + } + + // API: Native OS file selection dialog + if (pathname === '/api/dialog/select-file' && req.method === 'POST') { + handlePostRequest(req, res, async (body) => { + const { initialDir } = body as { initialDir?: string }; + const os = await import('os'); + const { execFile } = await import('child_process'); + const startDir = initialDir || os.homedir(); + + return new Promise>((resolve) => { + if (process.platform === 'win32') { + const script = `Add-Type -AssemblyName System.Windows.Forms; $d = New-Object System.Windows.Forms.OpenFileDialog; $d.InitialDirectory = '${startDir.replace(/'/g, "''")}'; if ($d.ShowDialog() -eq 'OK') { $d.FileName }`; + execFile('powershell', ['-NoProfile', '-Command', script], + { timeout: 120000 }, + (err, stdout) => { + if (err || !stdout.trim()) { + resolve({ cancelled: true }); + } else { + resolve({ path: stdout.trim() }); + } + } + ); + } else if (process.platform === 'darwin') { + const escapedDir = startDir.replace(/"/g, '\\"'); + const script = `POSIX path of (choose file with prompt "Select File" default location POSIX file "${escapedDir}")`; + execFile('osascript', ['-e', script], + { timeout: 120000 }, + (err, stdout) => { + if (err || !stdout.trim()) { + resolve({ cancelled: true }); + } else { + resolve({ path: stdout.trim() }); + } + } + ); + } else { + execFile('zenity', ['--file-selection', '--title=Select File', `--filename=${startDir}/`], + { timeout: 120000 }, + (err, stdout) => { + if (err || !stdout.trim()) { + execFile('kdialog', ['--getopenfilename', startDir, '--title', 'Select File'], + { timeout: 120000 }, + (err2, stdout2) => { + resolve(err2 || !stdout2.trim() ? { cancelled: true } : { path: stdout2.trim() }); + } + ); + } else { + resolve({ path: stdout.trim() }); + } + } + ); + } + }); + }); + return true; + } + // API: File dialog - list directory contents for file browser if (pathname === '/api/dialog/browse' && req.method === 'POST') { handlePostRequest(req, res, async (body) => { diff --git a/ccw/src/tools/ask-question.ts b/ccw/src/tools/ask-question.ts index a49f4a98..857c7ba8 100644 --- a/ccw/src/tools/ask-question.ts +++ b/ccw/src/tools/ask-question.ts @@ -551,15 +551,18 @@ export function handleMultiAnswer(compositeId: string, answers: QuestionAnswer[] * Automatically stops when the questionId is no longer in pendingQuestions (timeout cleanup). 
*/ function startAnswerPolling(questionId: string, isComposite: boolean = false): void { - const path = `/api/a2ui/answer?questionId=${encodeURIComponent(questionId)}&composite=${isComposite}`; + const pollPath = `/api/a2ui/answer?questionId=${encodeURIComponent(questionId)}&composite=${isComposite}`; + + console.error(`[A2UI-Poll] Starting polling for questionId=${questionId}, composite=${isComposite}, port=${DASHBOARD_PORT}`); const poll = () => { // Stop if the question was already resolved or timed out if (!pendingQuestions.has(questionId)) { + console.error(`[A2UI-Poll] Stopping: questionId=${questionId} no longer pending`); return; } - const req = http.get({ hostname: 'localhost', port: DASHBOARD_PORT, path }, (res) => { + const req = http.get({ hostname: '127.0.0.1', port: DASHBOARD_PORT, path: pollPath }, (res) => { let data = ''; res.on('data', (chunk: Buffer) => { data += chunk.toString(); }); res.on('end', () => { @@ -571,23 +574,27 @@ function startAnswerPolling(questionId: string, isComposite: boolean = false): v return; } + console.error(`[A2UI-Poll] Answer received for questionId=${questionId}:`, JSON.stringify(parsed).slice(0, 200)); + if (isComposite && Array.isArray(parsed.answers)) { - handleMultiAnswer(questionId, parsed.answers as QuestionAnswer[]); + const ok = handleMultiAnswer(questionId, parsed.answers as QuestionAnswer[]); + console.error(`[A2UI-Poll] handleMultiAnswer result: ${ok}`); } else if (!isComposite && parsed.answer) { - handleAnswer(parsed.answer as QuestionAnswer); + const ok = handleAnswer(parsed.answer as QuestionAnswer); + console.error(`[A2UI-Poll] handleAnswer result: ${ok}`); } else { - // Unexpected shape, keep polling + console.error(`[A2UI-Poll] Unexpected response shape, keep polling`); setTimeout(poll, POLL_INTERVAL_MS); } - } catch { - // Parse error, keep polling + } catch (e) { + console.error(`[A2UI-Poll] Parse error:`, e); setTimeout(poll, POLL_INTERVAL_MS); } }); }); - req.on('error', () => { - // Network error (Dashboard not reachable), keep trying + req.on('error', (err) => { + console.error(`[A2UI-Poll] Network error: ${err.message}`); if (pendingQuestions.has(questionId)) { setTimeout(poll, POLL_INTERVAL_MS); } diff --git a/codex-lens/_tmp_search.json b/codex-lens/_tmp_search.json new file mode 100644 index 00000000..b056d0c7 --- /dev/null +++ b/codex-lens/_tmp_search.json @@ -0,0 +1,415 @@ +{ + "success": true, + "result": { + "query": "class Config", + "method": "cascade", + "count": 50, + "results": [ + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\hybrid_search\\data_structures.py", + "score": 0.06081658330145309, + "excerpt": " @classmethod\n def from_dict(cls, data: Dict[str, Any]) -> \"CallHierarchyItem\":\n return cls(\n name=data[\"name\"],\n kind=data[\"kind\"],\n file_path=data[\"file...", + "content": " @classmethod\n def from_dict(cls, data: Dict[str, Any]) -> \"CallHierarchyItem\":\n return cls(\n name=data[\"name\"],\n kind=data[\"kind\"],\n file_path=data[\"file_path\"],\n range=Range.from_dict(data[\"range\"]),\n detail=data.get(\"detail\"),\n )\n\n\n@dataclass\nclass CodeSymbolNode:\n\n id: str\n name: str\n kind: str\n file_path: str\n range: Range\n embedding: Optional[List[float]] = None\n raw_code: str = \"\"\n docstring: str = \"\"\n score: float = 0.0\n\n def __post_init__(self) -> None:\n if not self.id:\n raise ValueError(\"id cannot be empty\")\n if not self.name:\n raise ValueError(\"name cannot be empty\")\n if not self.kind:\n raise ValueError(\"kind cannot be empty\")\n if not 
self.file_path:\n raise ValueError(\"file_path cannot be empty\")\n\n def __hash__(self) -> int:\n return hash(self.id)\n\n def __eq__(self, other: object) -> bool:\n if not isinstance(other, CodeSymbolNode):\n return False\n return self.id == other.id\n\n def to_dict(self) -> Dict[str, Any]:\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", + "score": 0.056576452190618645, + "excerpt": "from rich.table import Table\n\nfrom codexlens.config import Config\nfrom codexlens.entities import IndexedFile, SearchResult, Symbol\nfrom codexlens.errors import CodexLensError, ConfigError, ParseError,...", + "content": "import os\nimport shutil\nimport sqlite3\nfrom pathlib import Path\nfrom typing import Annotated, Any, Dict, Iterable, List, Optional\n\nimport typer\nfrom rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn\nfrom rich.table import Table\n\nfrom codexlens.config import Config\nfrom codexlens.entities import IndexedFile, SearchResult, Symbol\nfrom codexlens.errors import CodexLensError, ConfigError, ParseError, StorageError, SearchError\nfrom codexlens.parsers.factory import ParserFactory\nfrom codexlens.storage.path_mapper import PathMapper\nfrom codexlens.storage.registry import RegistryStore, ProjectInfo\nfrom codexlens.storage.index_tree import IndexTreeBuilder\nfrom codexlens.storage.dir_index import DirIndexStore\nfrom codexlens.search.chain_search import ChainSearchEngine, SearchOptions\nfrom codexlens.watcher import WatcherManager, WatcherConfig\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\config.py", + "score": 0.05655744432847353, + "excerpt": "\"\"\"Configuration system for CodexLens.\"\"\"\n\nfrom __future__ import annotations", + "content": "\"\"\"Configuration system for CodexLens.\"\"\"\n\nfrom __future__ import annotations\n\nimport json\nimport logging\nimport os\nfrom dataclasses import dataclass, field\nfrom functools import cached_property\nfrom pathlib import Path\nfrom typing import Any, Dict, List, Optional", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", + "score": 0.049219375000264694, + "excerpt": "\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\nfrom dataclasses import dataclass, field\nfrom pathlib import Path\nfrom typing import List, Optional, Dict, Any, Literal, Tuple, TYPE_CH...", + "content": "\"\"\"Chain search engine for recursive multi-directory searching.\n\nProvides parallel search across directory hierarchies using indexed _index.db files.\nSupports depth-limited traversal, result aggregation, and symbol search.\n\"\"\"\n\nfrom __future__ import annotations\n\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\nfrom dataclasses import dataclass, field\nfrom pathlib import Path\nfrom typing import List, Optional, Dict, Any, Literal, Tuple, TYPE_CHECKING\nimport json\nimport logging\nimport os\nimport time\n\nfrom codexlens.entities import SearchResult, Symbol\n\nif TYPE_CHECKING:", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\indexing\\embedding.py", + "score": 0.047931429239828446, + "excerpt": " def __init__(\n self,\n model_name: Optional[str] = None,\n use_gpu: bool = True,\n expand_dim: bool = True,\n ) -> None:\n from codexlens.semantic import SEMANTIC_...", + "content": " def __init__(\n self,\n model_name: 
Optional[str] = None,\n use_gpu: bool = True,\n expand_dim: bool = True,\n ) -> None:\n from codexlens.semantic import SEMANTIC_AVAILABLE\n\n if not SEMANTIC_AVAILABLE:\n raise ImportError(\n \"Semantic search dependencies not available. \"\n \"Install with: pip install codexlens[semantic]\"\n )\n\n self._model_name = model_name or self.DEFAULT_MODEL\n self._use_gpu = use_gpu\n self._expand_dim = expand_dim\n self._model = None\n self._native_dim: Optional[int] = None\n\n \n self._expansion_matrix: Optional[np.ndarray] = None\n\n @property\n def model_name(self) -> str:\n return self._model_name\n\n @property\n def embedding_dim(self) -> int:\n if self._expand_dim:\n return self.TARGET_DIM\n \n if self._native_dim is not None:\n return self._native_dim\n \n model_dims = {\n \"BAAI/bge-large-en-v1.5\": 1024,\n \"BAAI/bge-base-en-v1.5\": 768,\n \"BAAI/bge-small-en-v1.5\": 384,\n \"intfloat/multilingual-e5-large\": 1024,\n }\n return model_dims.get(self._model_name, 1024)\n\n @property\n def max_tokens(self) -> int:\n return 512 \n\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py", + "score": 0.04283104206542711, + "excerpt": "import threading\nimport time\nfrom dataclasses import dataclass, field\nfrom enum import Enum\nfrom typing import Any, Dict, Iterable, List, Optional", + "content": "Provides intelligent load balancing across multiple LiteLLM embedding endpoints\nto maximize throughput while respecting rate limits.\n\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nimport random\nimport threading\nimport time\nfrom dataclasses import dataclass, field\nfrom enum import Enum\nfrom typing import Any, Dict, Iterable, List, Optional\n\nimport numpy as np\n\nfrom .base import BaseEmbedder\n\nlogger = logging.getLogger(__name__)\n\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py", + "score": 0.036886112765573215, + "excerpt": "- Direct subprocess spawning of language servers\n- JSON-RPC 2.0 communication over stdin/stdout\n- Multi-language support via configuration file (lsp-servers.json)\n- Process lifecycle management with a...", + "content": "\"\"\"Standalone Language Server Manager for direct LSP communication.\n\nThis module provides direct communication with language servers via JSON-RPC over stdio,\neliminating the need for VSCode Bridge. 
Similar to cclsp architecture.\n\nFeatures:\n- Direct subprocess spawning of language servers\n- JSON-RPC 2.0 communication over stdin/stdout\n- Multi-language support via configuration file (lsp-servers.json)\n- Process lifecycle management with auto-restart\n- Compatible interface with existing LspBridge\n\"\"\"\n\nfrom __future__ import annotations\n\nimport asyncio\nimport json\nimport logging\nimport os", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\api\\models.py", + "score": 0.03457410829143062, + "excerpt": " container: Containing class/module (if any)\n score: Match score for ranking\n return {k: v for k, v in asdict(self).items() if v is not None}\n\n\n# =================================...", + "content": " container: Containing class/module (if any)\n score: Match score for ranking\n return {k: v for k, v in asdict(self).items() if v is not None}\n\n\n# =============================================================================\n# Section 4.4: find_references dataclasses\n# =============================================================================\n\n@dataclass\nclass ReferenceResult:\n file_path: str\n line: int\n column: int\n context_line: str\n relationship: str # call | import | type_annotation | inheritance\n\n def to_dict(self) -> dict:\n return asdict(self)\n\n\n@dataclass\nclass GroupedReferences:\n definition: DefinitionResult\n references: List[ReferenceResult] = field(default_factory=list)\n\n def to_dict(self) -> dict:\n return {\n \"definition\": self.definition.to_dict(),\n \"references\": [r.to_dict() for r in self.references],\n }\n\n\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py", + "score": 0.03341093379138448, + "excerpt": "\n if TREE_SITTER_AVAILABLE:\n self._initialize_parser()\n\n def _initialize_parser(self) -> None:\n if TreeSitterParser is None or TreeSitterLanguage is None:\n retur...", + "content": "\n if TREE_SITTER_AVAILABLE:\n self._initialize_parser()\n\n def _initialize_parser(self) -> None:\n if TreeSitterParser is None or TreeSitterLanguage is None:\n return\n\n try:\n \n if self.language_id == \"python\":\n import tree_sitter_python\n self._language = TreeSitterLanguage(tree_sitter_python.language())\n elif self.language_id == \"javascript\":\n import tree_sitter_javascript\n self._language = TreeSitterLanguage(tree_sitter_javascript.language())\n elif self.language_id == \"typescript\":\n import tree_sitter_typescript\n \n if self.path is not None and self.path.suffix.lower() == \".tsx\":\n self._language = TreeSitterLanguage(tree_sitter_typescript.language_tsx())\n else:\n self._language = TreeSitterLanguage(tree_sitter_typescript.language_typescript())\n else:\n return\n\n \n self._parser = TreeSitterParser()\n if hasattr(self._parser, \"set_language\"):\n self._parser.set_language(self._language) \n else:\n self._parser.language = self._language \n\n except Exception:\n \n self._parser = None\n self._language = None\n\n def is_available(self) -> bool:\n return self._parser is not None and self._language is not None\n\n def _parse_tree(self, text: str) -> Optional[tuple[bytes, TreeSitterNode]]:\n if not self.is_available() or self._parser is None:\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\watcher\\incremental_indexer.py", + "score": 0.029568673189485736, + "excerpt": "\nimport logging\nfrom dataclasses import dataclass\nfrom pathlib 
import Path\nfrom typing import List, Optional", + "content": "\"\"\"Incremental indexer for processing file changes.\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import List, Optional\n\nfrom codexlens.config import Config\nfrom codexlens.parsers.factory import ParserFactory\nfrom codexlens.storage.dir_index import DirIndexStore\nfrom codexlens.storage.global_index import GlobalSymbolIndex\nfrom codexlens.storage.path_mapper import PathMapper\nfrom codexlens.storage.registry import RegistryStore\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py", + "score": 0.029334400167733504, + "excerpt": "\nfrom collections import defaultdict\nfrom dataclasses import dataclass\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Literal\n", + "content": "\nUse cases:\n- Prioritize commonly called methods/functions\n- Filter out one-off results that may be less relevant\n- Deduplicate results pointing to the same symbol from different locations\n\"\"\"\n\nfrom __future__ import annotations\n\nfrom collections import defaultdict\nfrom dataclasses import dataclass\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Literal\n\nfrom .base import BaseClusteringStrategy, ClusteringConfig\n\nif TYPE_CHECKING:\n import numpy as np\n from codexlens.entities import SearchResult\n\n\n@dataclass", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\association_tree\\data_structures.py", + "score": 0.027925539288870704, + "excerpt": "\n def __len__(self) -> int:\n return len(self.all_nodes)\n\n def __repr__(self) -> str:\n return (\n f\"CallTree(roots={len(self.roots)}, nodes={len(self.all_nodes)}, \"\n ...", + "content": "\n def __len__(self) -> int:\n return len(self.all_nodes)\n\n def __repr__(self) -> str:\n return (\n f\"CallTree(roots={len(self.roots)}, nodes={len(self.all_nodes)}, \"\n f\"depth={self.depth_reached})\"\n )\n\n\n@dataclass\nclass UniqueNode:\n\n file_path: str\n name: str\n kind: str\n range: Range\n min_depth: int = 0\n occurrences: int = 1\n paths: List[List[str]] = field(default_factory=list)\n context_nodes: List[str] = field(default_factory=list)\n score: float = 0.0\n\n @property\n def node_key(self) -> tuple[str, int, int]:\n return (\n self.file_path,\n self.range.start_line,\n self.range.end_line,\n )\n\n def add_path(self, path: List[str]) -> None:\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py", + "score": 0.024369821963687643, + "excerpt": "def _get_configured_index_root() -> Path:\n \"\"\"Get the index root from environment or config file.\n", + "content": "def _get_configured_index_root() -> Path:\n \"\"\"Get the index root from environment or config file.\n\n Priority order:\n 1. CODEXLENS_INDEX_DIR environment variable\n 2. index_dir from ~/.codexlens/config.json\n 3. 
Default: ~/.codexlens/indexes\n \"\"\"\n env_override = os.getenv(\"CODEXLENS_INDEX_DIR\")\n if env_override:\n return Path(env_override).expanduser().resolve()\n\n config_file = Path.home() / \".codexlens\" / \"config.json\"\n if config_file.exists():\n try:\n cfg = json.loads(config_file.read_text(encoding=\"utf-8\"))\n if \"index_dir\" in cfg:\n return Path(cfg[\"index_dir\"]).expanduser().resolve()\n except (json.JSONDecodeError, OSError):\n pass\n\n return Path.home() / \".codexlens\" / \"indexes\"", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", + "score": 0.023949795081967214, + "excerpt": "class VectorStore:\n \"\"\"SQLite-based vector storage with HNSW-accelerated similarity search.\n", + "content": "class VectorStore:\n \"\"\"SQLite-based vector storage with HNSW-accelerated similarity search.\n\n Performance optimizations:\n - HNSW index for O(log N) approximate nearest neighbor search\n - Embedding matrix cached in memory for batch similarity computation (fallback)\n - NumPy vectorized operations instead of Python loops (fallback)\n - Lazy content loading - only fetch full content for top-k results\n - Thread-safe cache invalidation\n - Bulk insert mode for efficient batch operations\n \"\"\"\n\n # Default embedding dimension (used when creating new index)\n DEFAULT_DIM = 768\n\n def __init__(self, db_path: str | Path) -> None:\n if not NUMPY_AVAILABLE:\n raise ImportError(\n \"Semantic search dependencies not available. \"\n \"Install with: pip install codexlens[semantic]\"\n )\n\n self.db_path = Path(db_path)\n self.db_path.parent.mkdir(parents=True, exist_ok=True)\n\n # Embedding cache for fast similarity search (fallback)\n self._cache_lock = threading.RLock()\n self._embedding_matrix: Optional[np.ndarray] = None\n self._embedding_norms: Optional[np.ndarray] = None\n self._chunk_ids: Optional[List[int]] = None\n self._cache_version: int = 0\n\n # ANN index for O(log N) search\n self._ann_index: Optional[ANNIndex] = None\n self._ann_dim: Optional[int] = None\n self._ann_write_lock = threading.Lock() # Protects ANN index modifications\n\n # Bulk insert mode tracking\n self._bulk_insert_mode: bool = False\n self._bulk_insert_ids: List[int] = []\n self._bulk_insert_embeddings: List[np.ndarray] = []\n\n self._init_schema()\n self._init_ann_index()\n\n def _init_schema(self) -> None:\n \"\"\"Initialize vector storage schema.\"\"\"\n with sqlite3.connect(self.db_path) as conn:\n # Enable memory mapping for faster reads\n conn.execute(\"PRAGMA mmap_size = 30000000000\") # 30GB limit\n conn.execute(\"\"\"\n CREATE TABLE IF NOT EXISTS semantic_chunks (\n id INTEGER PRIMARY KEY AUTOINCREMENT,\n file_path TEXT NOT NULL,\n content TEXT NOT NULL,\n embedding BLOB NOT NULL,\n metadata TEXT,\n category TEXT DEFAULT 'code',\n created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP\n )\n \"\"\")\n conn.execute(\"\"\"\n CREATE INDEX IF NOT EXISTS idx_chunks_file\n ON semantic_chunks(file_path)\n \"\"\")\n conn.execute(\"\"\"\n CREATE INDEX IF NOT EXISTS idx_chunks_category\n ON semantic_chunks(category)\n \"\"\")\n # Model configuration table - tracks which model generated the embeddings\n conn.execute(\"\"\"\n CREATE TABLE IF NOT EXISTS embeddings_config (\n id INTEGER PRIMARY KEY CHECK (id = 1),\n model_profile TEXT NOT NULL,\n model_name TEXT NOT NULL,\n embedding_dim INTEGER NOT NULL,\n backend TEXT NOT NULL DEFAULT 'fastembed',\n created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,\n updated_at TIMESTAMP DEFAULT 
CURRENT_TIMESTAMP\n )\n \"\"\")\n\n # Migration: Add backend column to existing tables\n self._migrate_backend_column(conn)\n # Migration: Add category column\n self._migrate_category_column(conn)\n\n conn.commit()\n\n def _migrate_backend_column(self, conn: sqlite3.Connection) -> None:\n \"\"\"Add backend column to existing embeddings_config table if not present.\n\n Args:\n conn: Active SQLite connection\n \"\"\"\n # Check if backend column exists\n cursor = conn.execute(\"PRAGMA table_info(embeddings_config)\")\n columns = [row[1] for row in cursor.fetchall()]\n\n if 'backend' not in columns:\n logger.info(\"Migrating embeddings_config table: adding backend column\")\n conn.execute(\"\"\"\n ALTER TABLE embeddings_config\n ADD COLUMN backend TEXT NOT NULL DEFAULT 'fastembed'\n \"\"\")\n\n def _migrate_category_column(self, conn: sqlite3.Connection) -> None:\n \"\"\"Add category column to existing semantic_chunks table if not present.\n\n Args:\n conn: Active SQLite connection\n \"\"\"\n # Check if category column exists\n cursor = conn.execute(\"PRAGMA table_info(semantic_chunks)\")\n columns = [row[1] for row in cursor.fetchall()]\n\n if 'category' not in columns:\n logger.info(\"Migrating semantic_chunks table: adding category column\")\n conn.execute(\"\"\"\n ALTER TABLE semantic_chunks\n ADD COLUMN category TEXT DEFAULT 'code'\n \"\"\")\n # Create index for fast category filtering\n conn.execute(\"\"\"\n CREATE INDEX IF NOT EXISTS idx_chunks_category\n ON semantic_chunks(category)\n \"\"\")\n\n def _init_ann_index(self) -> None:\n \"\"\"Initialize ANN index (lazy loading from existing data).\"\"\"\n if not HNSWLIB_AVAILABLE:\n logger.debug(\"hnswlib not available, using brute-force search\")\n return\n\n # Try to detect embedding dimension from existing data\n dim = self._detect_embedding_dim()\n if dim is None:\n # No data yet, will initialize on first add\n logger.debug(\"No embeddings found, ANN index will be created on first add\")\n return\n\n self._ann_dim = dim\n\n try:\n self._ann_index = ANNIndex(self.db_path, dim)\n if self._ann_index.load():\n logger.debug(\n \"Loaded ANN index with %d vectors\", self._ann_index.count()\n )\n else:\n # Index file doesn't exist, try to build from SQLite data\n logger.debug(\"ANN index file not found, rebuilding from SQLite\")\n self._rebuild_ann_index_internal()\n except Exception as e:\n logger.warning(\"Failed to initialize ANN index: %s\", e)\n self._ann_index = None\n\n def _detect_embedding_dim(self) -> Optional[int]:\n \"\"\"Detect embedding dimension from existing data.\"\"\"\n with sqlite3.connect(self.db_path) as conn:\n row = conn.execute(\n \"SELECT embedding FROM semantic_chunks LIMIT 1\"\n ).fetchone()\n if row and row[0]:\n # Embedding is stored as float32 blob\n blob = row[0]\n return len(blob) // np.dtype(np.float32).itemsize\n return None\n\n @property\n def dimension(self) -> Optional[int]:\n \"\"\"Return the dimension of embeddings in the store.\n\n Returns:\n Embedding dimension if available, None if store is empty.\n \"\"\"\n if self._ann_dim is not None:\n return self._ann_dim\n self._ann_dim = self._detect_embedding_dim()\n return self._ann_dim\n\n def _rebuild_ann_index_internal(self) -> int:\n \"\"\"Internal method to rebuild ANN index from SQLite data.\"\"\"\n if self._ann_index is None:\n return 0\n\n with sqlite3.connect(self.db_path) as conn:\n conn.execute(\"PRAGMA mmap_size = 30000000000\")\n rows = conn.execute(\n \"SELECT id, embedding FROM semantic_chunks\"\n ).fetchall()\n\n if not rows:\n return 0\n\n # 
Extract IDs and embeddings\n ids = [r[0] for r in rows]\n embeddings = np.vstack([\n np.frombuffer(r[1], dtype=np.float32) for r in rows\n ])\n\n # Add to ANN index\n self._ann_index.add_vectors(ids, embeddings)\n self._ann_index.save()\n\n logger.info(\"Rebuilt ANN index with %d vectors\", len(ids))\n return len(ids)\n\n def rebuild_ann_index(self) -> int:\n \"\"\"Rebuild HNSW index from all chunks in SQLite.\n\n Use this method to:\n - Migrate existing data to use ANN search\n - Repair corrupted index\n - Reclaim space after many deletions\n\n Returns:\n Number of vectors indexed.\n \"\"\"\n if not HNSWLIB_AVAILABLE:\n logger.warning(\"hnswlib not available, cannot rebuild ANN index\")\n return 0\n\n # Detect dimension\n dim = self._detect_embedding_dim()\n if dim is None:\n logger.warning(\"No embeddings found, cannot rebuild ANN index\")\n return 0\n\n self._ann_dim = dim\n\n # Create new index\n try:\n self._ann_index = ANNIndex(self.db_path, dim)\n return self._rebuild_ann_index_internal()\n except Exception as e:\n logger.error(\"Failed to rebuild ANN index: %s\", e)\n self._ann_index = None\n return 0\n\n def _invalidate_cache(self) -> None:\n \"\"\"Invalidate the embedding cache (thread-safe).\"\"\"\n with self._cache_lock:\n self._embedding_matrix = None\n self._embedding_norms = None\n self._chunk_ids = None\n self._cache_version += 1\n\n def _refresh_cache(self) -> bool:\n \"\"\"Load embeddings into numpy matrix for fast similarity search.\n\n Returns:\n True if cache was refreshed successfully, False if no data.\n \"\"\"\n with self._cache_lock:\n with sqlite3.connect(self.db_path) as conn:\n conn.execute(\"PRAGMA mmap_size = 30000000000\")\n rows = conn.execute(\n \"SELECT id, embedding FROM semantic_chunks\"\n ).fetchall()\n\n if not rows:\n self._embedding_matrix = None\n self._embedding_norms = None\n self._chunk_ids = None\n return False\n\n # Extract IDs and embeddings\n self._chunk_ids = [r[0] for r in rows]\n\n # Bulk convert binary blobs to numpy matrix\n embeddings = [\n np.frombuffer(r[1], dtype=np.float32) for r in rows\n ]\n self._embedding_matrix = np.vstack(embeddings)\n\n # Pre-compute norms for faster similarity calculation\n self._embedding_norms = np.linalg.norm(\n self._embedding_matrix, axis=1, keepdims=True\n )\n # Avoid division by zero\n self._embedding_norms = np.where(\n self._embedding_norms == 0, EPSILON, self._embedding_norms\n )\n\n return True\n\n def _ensure_ann_index(self, dim: int) -> bool:\n \"\"\"Ensure ANN index is initialized with correct dimension.\n\n This method is thread-safe and uses double-checked locking.\n\n Args:\n dim: Embedding dimension\n\n Returns:\n True if ANN index is ready, False otherwise\n \"\"\"\n if not HNSWLIB_AVAILABLE:\n return False\n\n # Fast path: index already initialized (no lock needed)\n if self._ann_index is not None:\n return True\n\n # Slow path: acquire lock for initialization\n with self._ann_write_lock:\n # Double-check after acquiring lock\n if self._ann_index is not None:\n return True\n\n try:\n self._ann_dim = dim\n self._ann_index = ANNIndex(self.db_path, dim)\n self._ann_index.load() # Try to load existing\n return True\n except Exception as e:\n logger.warning(\"Failed to initialize ANN index: %s\", e)\n self._ann_index = None\n return False\n\n def add_chunk(\n self, chunk: SemanticChunk, file_path: str, category: str = \"code\"\n ) -> int:\n \"\"\"Add a single chunk with its embedding.\n\n Args:\n chunk: SemanticChunk with embedding\n file_path: Path to the source file\n category: File category 
('code' or 'doc'), default 'code'\n\n Returns:\n The inserted chunk ID.\n \"\"\"\n if chunk.embedding is None:\n raise ValueError(\"Chunk must have embedding before adding to store\")\n\n embedding_arr = np.array(chunk.embedding, dtype=np.float32)\n embedding_blob = embedding_arr.tobytes()\n metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None\n\n with sqlite3.connect(self.db_path) as conn:\n cursor = conn.execute(\n \"\"\"\n INSERT INTO semantic_chunks (file_path, content, embedding, metadata, category)\n VALUES (?, ?, ?, ?, ?)\n \"\"\",\n (file_path, chunk.content, embedding_blob, metadata_json, category)\n )\n conn.commit()\n chunk_id = cursor.lastrowid or 0\n\n # Add to ANN index\n if self._ensure_ann_index(len(chunk.embedding)):\n with self._ann_write_lock:\n try:\n self._ann_index.add_vectors([chunk_id], embedding_arr.reshape(1, -1))\n self._ann_index.save()\n except Exception as e:\n logger.warning(\"Failed to add to ANN index: %s\", e)\n\n # Invalidate cache after modification\n self._invalidate_cache()\n return chunk_id\n\n def add_chunks(\n self, chunks: List[SemanticChunk], file_path: str, category: str = \"code\"\n ) -> List[int]:\n \"\"\"Add multiple chunks with embeddings (batch insert).\n\n Args:\n chunks: List of SemanticChunk objects with embeddings\n file_path: Path to the source file\n category: File category ('code' or 'doc'), default 'code'\n\n Returns:\n List of inserted chunk IDs.\n \"\"\"\n if not chunks:\n return []\n\n # Prepare batch data\n batch_data = []\n embeddings_list = []\n for chunk in chunks:\n if chunk.embedding is None:\n raise ValueError(\"All chunks must have embeddings\")\n embedding_arr = np.array(chunk.embedding, dtype=np.float32)\n embedding_blob = embedding_arr.tobytes()\n metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None\n batch_data.append((file_path, chunk.content, embedding_blob, metadata_json, category))\n embeddings_list.append(embedding_arr)\n\n # Batch insert to SQLite\n with sqlite3.connect(self.db_path) as conn:\n # Get starting ID before insert\n row = conn.execute(\"SELECT MAX(id) FROM semantic_chunks\").fetchone()\n start_id = (row[0] or 0) + 1\n\n conn.executemany(\n \"\"\"\n INSERT INTO semantic_chunks (file_path, content, embedding, metadata, category)\n VALUES (?, ?, ?, ?, ?)\n \"\"\",\n batch_data\n )\n conn.commit()\n # Calculate inserted IDs based on starting ID\n ids = list(range(start_id, start_id + len(chunks)))\n\n # Add to ANN index\n if embeddings_list and self._ensure_ann_index(len(embeddings_list[0])):\n with self._ann_write_lock:\n try:\n embeddings_matrix = np.vstack(embeddings_list)\n self._ann_index.add_vectors(ids, embeddings_matrix)\n self._ann_index.save()\n except Exception as e:\n logger.warning(\"Failed to add batch to ANN index: %s\", e)\n\n # Invalidate cache after modification\n self._invalidate_cache()\n return ids\n\n def add_chunks_batch(\n self,\n chunks_with_paths: List[Tuple[SemanticChunk, str]],\n update_ann: bool = True,\n auto_save_ann: bool = True,\n categories: Optional[List[str]] = None,\n ) -> List[int]:\n \"\"\"Batch insert chunks from multiple files in a single transaction.\n\n This method is optimized for bulk operations during index generation.\n\n Args:\n chunks_with_paths: List of (chunk, file_path) tuples\n update_ann: If True, update ANN index with new vectors (default: True)\n auto_save_ann: If True, save ANN index after update (default: True).\n Set to False for bulk inserts to reduce I/O overhead.\n categories: Optional list of categories per 
chunk. If None, defaults to 'code'.\n If provided, must match length of chunks_with_paths.\n\n Returns:\n List of inserted chunk IDs\n \"\"\"\n if not chunks_with_paths:\n return []\n\n batch_size = len(chunks_with_paths)\n\n # Validate categories if provided\n if categories is not None and len(categories) != batch_size:\n raise ValueError(\n f\"categories length ({len(categories)}) must match \"\n f\"chunks_with_paths length ({batch_size})\"\n )\n\n # Prepare batch data\n batch_data = []\n embeddings_list = []\n for i, (chunk, file_path) in enumerate(chunks_with_paths):\n if chunk.embedding is None:\n raise ValueError(\"All chunks must have embeddings\")\n # Optimize: avoid repeated np.array() if already numpy\n if isinstance(chunk.embedding, np.ndarray):\n embedding_arr = chunk.embedding.astype(np.float32)\n else:\n embedding_arr = np.array(chunk.embedding, dtype=np.float32)\n embedding_blob = embedding_arr.tobytes()\n metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None\n category = categories[i] if categories else \"code\"\n batch_data.append((file_path, chunk.content, embedding_blob, metadata_json, category))\n embeddings_list.append(embedding_arr)\n\n # Batch insert to SQLite in single transaction\n with sqlite3.connect(self.db_path) as conn:\n # Get starting ID before insert\n row = conn.execute(\"SELECT MAX(id) FROM semantic_chunks\").fetchone()\n start_id = (row[0] or 0) + 1\n\n _validate_chunk_id_range(start_id, batch_size)\n\n conn.executemany(\n \"\"\"\n INSERT INTO semantic_chunks (file_path, content, embedding, metadata, category)\n VALUES (?, ?, ?, ?, ?)\n \"\"\",\n batch_data\n )\n conn.commit()\n # Calculate inserted IDs based on starting ID\n ids = list(range(start_id, start_id + batch_size))\n\n # Handle ANN index updates\n if embeddings_list and update_ann and self._ensure_ann_index(len(embeddings_list[0])):\n with self._ann_write_lock:\n # In bulk insert mode, accumulate for later batch update\n if self._bulk_insert_mode:\n self._bulk_insert_ids.extend(ids)\n self._bulk_insert_embeddings.extend(embeddings_list)\n else:\n # Normal mode: update immediately\n try:\n embeddings_matrix = np.vstack(embeddings_list)\n self._ann_index.add_vectors(ids, embeddings_matrix)\n if auto_save_ann:\n self._ann_index.save()\n except Exception as e:\n logger.warning(\"Failed to add batch to ANN index: %s\", e)\n\n # Invalidate cache after modification\n self._invalidate_cache()\n return ids\n\n def add_chunks_batch_numpy(\n self,\n chunks_with_paths: List[Tuple[SemanticChunk, str]],\n embeddings_matrix: np.ndarray,\n update_ann: bool = True,\n auto_save_ann: bool = True,\n categories: Optional[List[str]] = None,\n ) -> List[int]:\n \"\"\"Batch insert chunks with pre-computed numpy embeddings matrix.\n\n This method accepts embeddings as a numpy matrix to avoid list->array conversions.\n Useful when embeddings are already in numpy format from batch encoding.\n\n Args:\n chunks_with_paths: List of (chunk, file_path) tuples (embeddings can be None)\n embeddings_matrix: Pre-computed embeddings as (N, D) numpy array\n update_ann: If True, update ANN index with new vectors (default: True)\n auto_save_ann: If True, save ANN index after update (default: True)\n categories: Optional list of categories per chunk. 
If None, defaults to 'code'.\n\n Returns:\n List of inserted chunk IDs\n \"\"\"\n if not chunks_with_paths:\n return []\n\n batch_size = len(chunks_with_paths)\n\n if len(chunks_with_paths) != embeddings_matrix.shape[0]:\n raise ValueError(\n f\"Mismatch: {len(chunks_with_paths)} chunks but \"\n f\"{embeddings_matrix.shape[0]} embeddings\"\n )\n\n # Validate categories if provided\n if categories is not None and len(categories) != batch_size:\n raise ValueError(\n f\"categories length ({len(categories)}) must match \"\n f\"chunks_with_paths length ({batch_size})\"\n )\n\n # Ensure float32 format\n embeddings_matrix = embeddings_matrix.astype(np.float32)\n\n # Prepare batch data\n batch_data = []\n for i, (chunk, file_path) in enumerate(chunks_with_paths):\n embedding_arr = embeddings_matrix[i]\n embedding_blob = embedding_arr.tobytes()\n metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None\n category = categories[i] if categories else \"code\"\n batch_data.append((file_path, chunk.content, embedding_blob, metadata_json, category))\n\n # Batch insert to SQLite in single transaction\n with sqlite3.connect(self.db_path) as conn:\n # Get starting ID before insert\n row = conn.execute(\"SELECT MAX(id) FROM semantic_chunks\").fetchone()\n start_id = (row[0] or 0) + 1\n\n _validate_chunk_id_range(start_id, batch_size)\n\n conn.executemany(\n \"\"\"\n INSERT INTO semantic_chunks (file_path, content, embedding, metadata, category)\n VALUES (?, ?, ?, ?, ?)\n \"\"\",\n batch_data\n )\n conn.commit()\n # Calculate inserted IDs based on starting ID\n ids = list(range(start_id, start_id + batch_size))\n\n # Handle ANN index updates\n if update_ann and self._ensure_ann_index(embeddings_matrix.shape[1]):\n with self._ann_write_lock:\n # In bulk insert mode, accumulate for later batch update\n if self._bulk_insert_mode:\n self._bulk_insert_ids.extend(ids)\n # Split matrix into individual arrays for accumulation\n self._bulk_insert_embeddings.extend([embeddings_matrix[i] for i in range(len(ids))])\n else:\n # Normal mode: update immediately\n try:\n self._ann_index.add_vectors(ids, embeddings_matrix)\n if auto_save_ann:\n self._ann_index.save()\n except Exception as e:\n logger.warning(\"Failed to add batch to ANN index: %s\", e)\n\n # Invalidate cache after modification\n self._invalidate_cache()\n return ids\n\n def begin_bulk_insert(self) -> None:\n \"\"\"Begin bulk insert mode - disable ANN auto-update for better performance.\n\n Usage:\n store.begin_bulk_insert()\n try:\n for batch in batches:\n store.add_chunks_batch(batch, auto_save_ann=False)\n finally:\n store.end_bulk_insert()\n\n Or use context manager:\n with store.bulk_insert():\n for batch in batches:\n store.add_chunks_batch(batch)\n \"\"\"\n with self._ann_write_lock:\n self._bulk_insert_mode = True\n self._bulk_insert_ids.clear()\n self._bulk_insert_embeddings.clear()\n logger.debug(\"Entered bulk insert mode\")\n\n def end_bulk_insert(self) -> None:\n \"\"\"End bulk insert mode and rebuild ANN index from accumulated data.\n\n This method should be called after all bulk inserts are complete to\n update the ANN index in a single batch operation.\n \"\"\"\n with self._ann_write_lock:\n if not self._bulk_insert_mode:\n logger.warning(\"end_bulk_insert called but not in bulk insert mode\")\n return\n\n self._bulk_insert_mode = False\n bulk_ids = list(self._bulk_insert_ids)\n bulk_embeddings = list(self._bulk_insert_embeddings)\n self._bulk_insert_ids.clear()\n self._bulk_insert_embeddings.clear()\n\n # Update ANN index with 
accumulated data.\n if bulk_ids and bulk_embeddings:\n if self._ensure_ann_index(len(bulk_embeddings[0])):\n with self._ann_write_lock:\n try:\n embeddings_matrix = np.vstack(bulk_embeddings)\n self._ann_index.add_vectors(bulk_ids, embeddings_matrix)\n self._ann_index.save()\n logger.info(\n \"Bulk insert complete: added %d vectors to ANN index\",\n len(bulk_ids),\n )\n except Exception as e:\n logger.error(\"Failed to update ANN index after bulk insert: %s\", e)\n\n logger.debug(\"Exited bulk insert mode\")\n\n class BulkInsertContext:\n \"\"\"Context manager for bulk insert operations.\"\"\"\n\n def __init__(self, store: \"VectorStore\") -> None:\n self.store = store\n\n def __enter__(self) -> \"VectorStore\":\n self.store.begin_bulk_insert()\n return self.store\n\n def __exit__(self, exc_type, exc_val, exc_tb) -> None:\n self.store.end_bulk_insert()\n\n def bulk_insert(self) -> \"VectorStore.BulkInsertContext\":\n \"\"\"Return a context manager for bulk insert operations.\n\n Usage:\n with store.bulk_insert():\n for batch in batches:\n store.add_chunks_batch(batch)\n \"\"\"\n return self.BulkInsertContext(self)\n\n def delete_file_chunks(self, file_path: str) -> int:\n \"\"\"Delete all chunks for a file.\n\n Returns:\n Number of deleted chunks.\n \"\"\"\n # Get chunk IDs before deletion (for ANN index)\n chunk_ids_to_delete = []\n if self._ann_index is not None:\n with sqlite3.connect(self.db_path) as conn:\n rows = conn.execute(\n \"SELECT id FROM semantic_chunks WHERE file_path = ?\",\n (file_path,)\n ).fetchall()\n chunk_ids_to_delete = [r[0] for r in rows]\n\n # Delete from SQLite\n with sqlite3.connect(self.db_path) as conn:\n cursor = conn.execute(\n \"DELETE FROM semantic_chunks WHERE file_path = ?\",\n (file_path,)\n )\n conn.commit()\n deleted = cursor.rowcount\n\n # Remove from ANN index\n if deleted > 0 and self._ann_index is not None and chunk_ids_to_delete:\n with self._ann_write_lock:\n try:\n self._ann_index.remove_vectors(chunk_ids_to_delete)\n self._ann_index.save()\n except Exception as e:\n logger.warning(\"Failed to remove from ANN index: %s\", e)\n\n if deleted > 0:\n self._invalidate_cache()\n return deleted\n\n def search_similar(\n self,\n query_embedding: List[float],\n top_k: int = 10,\n min_score: float = 0.0,\n return_full_content: bool = True,\n category: Optional[str] = None,\n ) -> List[SearchResult]:\n \"\"\"Find chunks most similar to query embedding.\n\n Uses HNSW index for O(log N) search when available, falls back to\n brute-force NumPy search otherwise.\n\n Args:\n query_embedding: Query vector.\n top_k: Maximum results to return.\n min_score: Minimum cosine similarity score in [0.0, 1.0].\n return_full_content: If True, return full code block content.\n category: Optional category filter ('code' or 'doc'). If None, returns all.\n\n Returns:\n List of SearchResult ordered by similarity (highest first).\n \"\"\"\n query_vec = np.array(query_embedding, dtype=np.float32)\n\n if not 0.0 <= min_score <= 1.0:\n raise ValueError(\n f\"Invalid min_score: {min_score}. 
Must be within [0.0, 1.0] for cosine similarity.\"\n )\n\n # Try HNSW search first (O(log N))\n if (\n HNSWLIB_AVAILABLE\n and self._ann_index is not None\n and self._ann_index.is_loaded\n and self._ann_index.count() > 0\n ):\n try:\n return self._search_with_ann(\n query_vec, top_k, min_score, return_full_content, category\n )\n except Exception as e:\n logger.warning(\"ANN search failed, falling back to brute-force: %s\", e)\n\n # Fallback to brute-force search (O(N))\n return self._search_brute_force(\n query_vec, top_k, min_score, return_full_content, category\n )\n\n def _search_with_ann(\n self,\n query_vec: np.ndarray,\n top_k: int,\n min_score: float,\n return_full_content: bool,\n category: Optional[str] = None,\n ) -> List[SearchResult]:\n \"\"\"Search using HNSW index (O(log N)).\n\n Args:\n query_vec: Query vector as numpy array\n top_k: Maximum results to return\n min_score: Minimum cosine similarity score in [0.0, 1.0]\n return_full_content: If True, return full code block content\n category: Optional category filter ('code' or 'doc')\n\n Returns:\n List of SearchResult ordered by similarity (highest first)\n \"\"\"\n # Limit top_k to available vectors to prevent hnswlib error\n ann_count = self._ann_index.count()\n # When category filtering, fetch more candidates to compensate for filtering\n fetch_k = top_k * 3 if category else top_k\n effective_top_k = min(fetch_k, ann_count) if ann_count > 0 else 0\n\n if effective_top_k == 0:\n return []\n\n # HNSW search returns (ids, distances)\n # For cosine space: distance = 1 - similarity\n ids, distances = self._ann_index.search(query_vec, effective_top_k)\n\n if ids is None or distances is None:\n logger.debug(\n \"ANN search returned null results (ids=%s, distances=%s)\",\n ids,\n distances,\n )\n return []\n\n if len(ids) == 0 or len(distances) == 0:\n logger.debug(\n \"ANN search returned empty results (ids=%s, distances=%s)\",\n ids,\n distances,\n )\n return []\n\n if len(ids) != len(distances):\n logger.warning(\n \"ANN search returned mismatched result lengths (%d ids, %d distances)\",\n len(ids),\n len(distances),\n )\n return []\n\n # Convert distances to similarity scores\n scores = [1.0 - d for d in distances]\n\n # Filter by min_score\n filtered = [\n (chunk_id, score)\n for chunk_id, score in zip(ids, scores)\n if score >= min_score\n ]\n\n if not filtered:\n return []\n\n top_ids = [f[0] for f in filtered]\n top_scores = [f[1] for f in filtered]\n\n # Fetch content from SQLite with category filtering\n results = self._fetch_results_by_ids(\n top_ids, top_scores, return_full_content, category\n )\n # Apply final limit after category filtering\n return results[:top_k]\n\n def _search_brute_force(\n self,\n query_vec: np.ndarray,\n top_k: int,\n min_score: float,\n return_full_content: bool,\n category: Optional[str] = None,\n ) -> List[SearchResult]:\n \"\"\"Brute-force search using NumPy (O(N) fallback).\n\n Args:\n query_vec: Query vector as numpy array\n top_k: Maximum results to return\n min_score: Minimum cosine similarity score in [0.0, 1.0]\n return_full_content: If True, return full code block content\n category: Optional category filter ('code' or 'doc')\n\n Returns:\n List of SearchResult ordered by similarity (highest first)\n \"\"\"\n logger.warning(\n \"Using brute-force vector search (hnswlib not available). \"\n \"This may cause high memory usage for large indexes. 
\"\n \"Install hnswlib for better performance: pip install hnswlib\"\n )\n\n with self._cache_lock:\n # Refresh cache if needed\n if self._embedding_matrix is None:\n if not self._refresh_cache():\n return [] # No data\n\n # Vectorized cosine similarity\n query_vec = query_vec.reshape(1, -1)\n query_norm = np.linalg.norm(query_vec)\n if query_norm == 0:\n return []\n\n # Compute all similarities at once: (N,) scores\n # similarity = (A @ B.T) / (||A|| * ||B||)\n dot_products = np.dot(self._embedding_matrix, query_vec.T).flatten()\n scores = dot_products / (self._embedding_norms.flatten() * query_norm)\n\n # Filter by min_score and get top-k indices\n valid_mask = scores >= min_score\n valid_indices = np.where(valid_mask)[0]\n\n if len(valid_indices) == 0:\n return []\n\n # When category filtering, fetch more candidates to compensate for filtering\n fetch_k = top_k * 3 if category else top_k\n\n # Sort by score descending and take top candidates\n valid_scores = scores[valid_indices]\n sorted_order = np.argsort(valid_scores)[::-1][:fetch_k]\n top_indices = valid_indices[sorted_order]\n top_scores = valid_scores[sorted_order]\n\n # Get chunk IDs for top results\n top_ids = [self._chunk_ids[i] for i in top_indices]\n\n # Fetch content only for top-k results (lazy loading) with category filtering\n results = self._fetch_results_by_ids(\n top_ids, top_scores.tolist(), return_full_content, category\n )\n # Apply final limit after category filtering\n return results[:top_k]\n\n def _fetch_results_by_ids(\n self,\n chunk_ids: List[int],\n scores: List[float],\n return_full_content: bool,\n category: Optional[str] = None,\n ) -> List[SearchResult]:\n \"\"\"Fetch full result data for specific chunk IDs.\n\n Args:\n chunk_ids: List of chunk IDs to fetch.\n scores: Corresponding similarity scores.\n return_full_content: Whether to include full content.\n category: Optional category filter ('code' or 'doc').\n\n Returns:\n List of SearchResult objects.\n \"\"\"\n if not chunk_ids:\n return []\n\n # Build parameterized query for IN clause\n placeholders = \",\".join(\"?\" * len(chunk_ids))\n _validate_sql_placeholders(placeholders, len(chunk_ids))\n\n # SQL injection prevention:\n # - Only a validated placeholders string (commas + '?') is interpolated into the query.\n # - User-provided values are passed separately via sqlite3 parameters.\n # - Category filter is added as a separate parameter\n if category:\n query = \"\"\"\n SELECT id, file_path, content, metadata\n FROM semantic_chunks\n WHERE id IN ({placeholders}) AND category = ?\n \"\"\".format(placeholders=placeholders)\n params = list(chunk_ids) + [category]\n else:\n query = \"\"\"\n SELECT id, file_path, content, metadata\n FROM semantic_chunks\n WHERE id IN ({placeholders})\n \"\"\".format(placeholders=placeholders)\n params = chunk_ids\n\n with sqlite3.connect(self.db_path) as conn:\n conn.execute(\"PRAGMA mmap_size = 30000000000\")\n rows = conn.execute(query, params).fetchall()\n\n # Build ID -> row mapping\n id_to_row = {r[0]: r for r in rows}\n\n results = []\n for chunk_id, score in zip(chunk_ids, scores):\n row = id_to_row.get(chunk_id)\n if not row:\n continue\n\n _, file_path, content, metadata_json = row\n metadata = json.loads(metadata_json) if metadata_json else {}\n\n # Build excerpt (short preview)\n excerpt = content[:200] + \"...\" if len(content) > 200 else content\n\n # Extract symbol information from metadata\n symbol_name = metadata.get(\"symbol_name\")\n symbol_kind = metadata.get(\"symbol_kind\")\n start_line = 
metadata.get(\"start_line\")\n end_line = metadata.get(\"end_line\")\n\n # Build Symbol object if we have symbol info\n symbol = None\n if symbol_name and symbol_kind and start_line and end_line:\n try:\n from codexlens.entities import Symbol\n symbol = Symbol(\n name=symbol_name,\n kind=symbol_kind,\n range=(start_line, end_line)\n )\n except Exception:\n pass\n\n results.append(SearchResult(\n path=file_path,\n score=score,\n excerpt=excerpt,\n content=content if return_full_content else None,\n symbol=symbol,\n metadata=metadata,\n start_line=start_line,\n end_line=end_line,\n symbol_name=symbol_name,\n symbol_kind=symbol_kind,\n ))\n\n return results\n\n def count_chunks(self) -> int:\n \"\"\"Count total chunks in store.\"\"\"\n with sqlite3.connect(self.db_path) as conn:\n row = conn.execute(\"SELECT COUNT(*) FROM semantic_chunks\").fetchone()\n return row[0] if row else 0\n\n def get_all_chunks(self) -> List[SemanticChunk]:\n \"\"\"Get all chunks from the store.\n\n Returns:\n List of SemanticChunk objects with id and content.\n \"\"\"\n with sqlite3.connect(self.db_path) as conn:\n conn.row_factory = sqlite3.Row\n rows = conn.execute(\n \"SELECT id, file_path, content, metadata FROM semantic_chunks\"\n ).fetchall()\n\n chunks = []\n for row in rows:\n chunks.append(SemanticChunk(\n id=row[\"id\"],\n content=row[\"content\"],\n file_path=row[\"file_path\"],\n metadata=json.loads(row[\"metadata\"]) if row[\"metadata\"] else None,\n ))\n return chunks\n\n def clear_cache(self) -> None:\n \"\"\"Manually clear the embedding cache.\"\"\"\n self._invalidate_cache()\n\n @property\n def ann_available(self) -> bool:\n \"\"\"Check if ANN index is available and ready.\"\"\"\n return (\n HNSWLIB_AVAILABLE\n and self._ann_index is not None\n and self._ann_index.is_loaded\n )\n\n @property\n def ann_count(self) -> int:\n \"\"\"Get number of vectors in ANN index.\"\"\"\n if self._ann_index is not None:\n return self._ann_index.count()\n return 0\n\n def get_model_config(self) -> Optional[Dict[str, Any]]:\n \"\"\"Get the model configuration used for embeddings in this store.\n\n Returns:\n Dictionary with model_profile, model_name, embedding_dim, backend, or None if not set.\n \"\"\"\n with sqlite3.connect(self.db_path) as conn:\n row = conn.execute(\n \"SELECT model_profile, model_name, embedding_dim, backend, created_at, updated_at \"\n \"FROM embeddings_config WHERE id = 1\"\n ).fetchone()\n if row:\n return {\n \"model_profile\": row[0],\n \"model_name\": row[1],\n \"embedding_dim\": row[2],\n \"backend\": row[3],\n \"created_at\": row[4],\n \"updated_at\": row[5],\n }\n return None\n\n def set_model_config(\n self, model_profile: str, model_name: str, embedding_dim: int, backend: str = 'fastembed'\n ) -> None:\n \"\"\"Set the model configuration for embeddings in this store.\n\n This should be called when generating new embeddings. 
If a different\n model was previously used, this will update the configuration.\n\n Args:\n model_profile: Model profile name (fast, code, minilm, etc.)\n model_name: Full model name (e.g., jinaai/jina-embeddings-v2-base-code)\n embedding_dim: Embedding dimension (e.g., 768)\n backend: Backend used for embeddings (fastembed or litellm, default: fastembed)\n \"\"\"\n with sqlite3.connect(self.db_path) as conn:\n conn.execute(\n \"\"\"\n INSERT INTO embeddings_config (id, model_profile, model_name, embedding_dim, backend)\n VALUES (1, ?, ?, ?, ?)\n ON CONFLICT(id) DO UPDATE SET\n model_profile = excluded.model_profile,\n model_name = excluded.model_name,\n embedding_dim = excluded.embedding_dim,\n backend = excluded.backend,\n updated_at = CURRENT_TIMESTAMP\n \"\"\",\n (model_profile, model_name, embedding_dim, backend)\n )\n conn.commit()\n\n def check_model_compatibility(\n self, model_profile: str, model_name: str, embedding_dim: int\n ) -> Tuple[bool, Optional[str]]:\n \"\"\"Check if the given model is compatible with existing embeddings.\n\n Args:\n model_profile: Model profile to check\n model_name: Model name to check\n embedding_dim: Embedding dimension to check\n\n Returns:\n Tuple of (is_compatible, warning_message).\n is_compatible is True if no existing config or configs match.\n warning_message is a user-friendly message if incompatible.\n \"\"\"\n existing = self.get_model_config()\n if existing is None:\n return True, None\n\n # Check dimension first (most critical)\n if existing[\"embedding_dim\"] != embedding_dim:\n return False, (\n f\"Dimension mismatch: existing embeddings use {existing['embedding_dim']}d \"\n f\"({existing['model_profile']}), but requested model uses {embedding_dim}d \"\n f\"({model_profile}). Use --force to regenerate all embeddings.\"\n )\n\n # Check model (different models with same dimension may have different semantic spaces)\n if existing[\"model_profile\"] != model_profile:\n return False, (\n f\"Model mismatch: existing embeddings use '{existing['model_profile']}' \"\n f\"({existing['model_name']}), but requested '{model_profile}' \"\n f\"({model_name}). Use --force to regenerate all embeddings.\"\n )\n\n return True, None\n\n def close(self) -> None:\n \"\"\"Close the vector store and release resources.\n\n This ensures SQLite connections are closed and ANN index is cleared,\n allowing temporary files to be deleted on Windows.\n \"\"\"\n with self._cache_lock:\n self._embedding_matrix = None\n self._embedding_norms = None\n self._chunk_ids = None\n\n with self._ann_write_lock:\n self._ann_index = None\n\n def __enter__(self) -> \"VectorStore\":\n \"\"\"Context manager entry.\"\"\"\n return self\n\n def __exit__(self, exc_type, exc_val, exc_tb) -> None:\n \"\"\"Context manager exit - close resources.\"\"\"\n self.close()", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py", + "score": 0.02356190140431283, + "excerpt": "\"\"\"Base classes for clustering strategies in the hybrid search pipeline.\n\nThis module defines the abstract base class for clustering strategies used", + "content": "\"\"\"Base classes for clustering strategies in the hybrid search pipeline.\n\nThis module defines the abstract base class for clustering strategies used\nin the staged hybrid search pipeline. 
Strategies cluster search results\nbased on their embeddings and select representative results from each cluster.\n\"\"\"\n\nfrom __future__ import annotations\n\nfrom abc import ABC, abstractmethod\nfrom dataclasses import dataclass, field", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py", + "score": 0.022717150737751757, + "excerpt": "\nimport re\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import Dict, List, Optional, Protocol", + "content": "\"\"\"Parser factory for CodexLens.\n\nPython and JavaScript/TypeScript parsing use Tree-Sitter grammars when\navailable. Regex fallbacks are retained to preserve the existing parser\ninterface and behavior in minimal environments.\n\"\"\"\n\nfrom __future__ import annotations\n\nimport re\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import Dict, List, Optional, Protocol\n\nfrom codexlens.config import Config\nfrom codexlens.entities import CodeRelationship, IndexedFile, RelationshipType, Symbol\nfrom codexlens.parsers.treesitter_parser import TreeSitterSymbolParser\n\n\nclass Parser(Protocol):\n def parse(self, text: str, path: Path) -> IndexedFile: ...", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py", + "score": 0.022282698690396483, + "excerpt": "\"\"\"codex-lens LSP Server implementation using pygls.\n\nThis module provides the main Language Server class and entry point.\n\"\"\"\n", + "content": "\"\"\"codex-lens LSP Server implementation using pygls.\n\nThis module provides the main Language Server class and entry point.\n\"\"\"\n\nfrom __future__ import annotations\n\nimport argparse\nimport logging\nimport sys\nfrom pathlib import Path\nfrom typing import Optional\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", + "score": 0.022258499170812605, + "excerpt": " logger.log(level, \"[TIMING] %s: %.2fms\", name, elapsed_ms)\n\nfrom codexlens.config import Config\nfrom codexlens.config import VECTORS_HNSW_NAME\nfrom codexlens.entities import SearchResult", + "content": " logger: Logger instance to use\n level: Logging level (default DEBUG)\n \"\"\"\n start = time.perf_counter()\n try:\n yield\n finally:\n elapsed_ms = (time.perf_counter() - start) * 1000\n logger.log(level, \"[TIMING] %s: %.2fms\", name, elapsed_ms)\n\nfrom codexlens.config import Config\nfrom codexlens.config import VECTORS_HNSW_NAME\nfrom codexlens.entities import SearchResult\nfrom codexlens.search.ranking import (\n DEFAULT_WEIGHTS,\n FTS_FALLBACK_WEIGHTS,\n QueryIntent,\n apply_symbol_boost,\n cross_encoder_rerank,\n detect_query_intent,\n filter_results_by_category,", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", + "score": 0.022204010428648113, + "excerpt": "import threading\nimport time\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import Any, Dict, List, Optional, Tuple", + "content": "\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nimport hashlib\nimport re\nimport sqlite3\nimport threading\nimport time\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import Any, Dict, List, Optional, Tuple\n\nfrom codexlens.config import Config\nfrom codexlens.entities import CodeRelationship, SearchResult, Symbol\nfrom codexlens.errors import StorageError\nfrom 
codexlens.storage.global_index import GlobalSymbolIndex\n\n\n@dataclass", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py", + "score": 0.022191896701700627, + "excerpt": "from typing import Callable, List, Optional\n\nfrom codexlens.config import Config\nfrom codexlens.storage.path_mapper import PathMapper\nfrom codexlens.storage.registry import RegistryStore", + "content": "from __future__ import annotations\n\nimport json\nimport logging\nimport signal\nimport threading\nimport time\nfrom pathlib import Path\nfrom typing import Callable, List, Optional\n\nfrom codexlens.config import Config\nfrom codexlens.storage.path_mapper import PathMapper\nfrom codexlens.storage.registry import RegistryStore\n\nfrom .events import FileEvent, IndexResult, PendingQueueStatus, WatcherConfig, WatcherStats\nfrom .file_watcher import FileWatcher\nfrom .incremental_indexer import IncrementalIndexer\n\nlogger = logging.getLogger(__name__)\n\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py", + "score": 0.021943278996721462, + "excerpt": "\nThis module provides the semantic_search() function for combining\nvector, structural, and keyword search with configurable fusion strategies.\n\"\"\"\n", + "content": "\"\"\"Semantic search API with RRF fusion.\n\nThis module provides the semantic_search() function for combining\nvector, structural, and keyword search with configurable fusion strategies.\n\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nfrom pathlib import Path\nfrom typing import List, Optional\n\nfrom .models import SemanticResult\nfrom .utils import resolve_project", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\watcher\\file_watcher.py", + "score": 0.021943278996721462, + "excerpt": "from watchdog.events import FileSystemEventHandler\n\nfrom .events import ChangeType, FileEvent, WatcherConfig, PendingQueueStatus\nfrom ..config import Config\n", + "content": "\nimport logging\nimport threading\nimport time\nfrom pathlib import Path\nfrom typing import Callable, Dict, List, Optional\n\nfrom watchdog.observers import Observer\nfrom watchdog.events import FileSystemEventHandler\n\nfrom .events import ChangeType, FileEvent, WatcherConfig, PendingQueueStatus\nfrom ..config import Config\n\nlogger = logging.getLogger(__name__)\n\n# Maximum queue size to prevent unbounded memory growth\n# When exceeded, forces immediate flush to avoid memory exhaustion\nMAX_QUEUE_SIZE = 50000\n\n\nclass _CodexLensHandler(FileSystemEventHandler):", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\api_reranker.py", + "score": 0.02150910700179165, + "excerpt": "\n \n env_api_base = _get_env_with_fallback(\"RERANKER_API_BASE\", self._workspace_root)\n self.api_base = (api_base or env_api_base or defaults[\"api_base\"]).strip().rstrip(\"/\")\n ...", + "content": "\n \n env_api_base = _get_env_with_fallback(\"RERANKER_API_BASE\", self._workspace_root)\n self.api_base = (api_base or env_api_base or defaults[\"api_base\"]).strip().rstrip(\"/\")\n self.endpoint = defaults[\"endpoint\"]\n\n \n env_model = _get_env_with_fallback(\"RERANKER_MODEL\", self._workspace_root)\n self.model_name = (model_name or env_model or defaults[\"default_model\"]).strip()\n if not self.model_name:\n raise ValueError(\"model_name cannot be blank\")\n\n \n resolved_key = api_key 
or _get_env_with_fallback(env_api_key, self._workspace_root) or \"\"\n resolved_key = resolved_key.strip()\n if not resolved_key:\n raise ValueError(\n f\"Missing API key for reranker provider '{self.provider}'. \"\n f\"Pass api_key=... or set ${env_api_key}.\"\n )\n self._api_key = resolved_key\n\n self.timeout_s = float(timeout) if timeout and float(timeout) > 0 else 30.0\n self.max_retries = int(max_retries) if max_retries and int(max_retries) >= 0 else 3\n self.backoff_base_s = float(backoff_base_s) if backoff_base_s and float(backoff_base_s) > 0 else 0.5\n self.backoff_max_s = float(backoff_max_s) if backoff_max_s and float(backoff_max_s) > 0 else 8.0\n\n headers = {\n \"Authorization\": f\"Bearer {self._api_key}\",\n \"Content-Type\": \"application/json\",\n }\n if self.provider == \"cohere\":\n headers.setdefault(\"Cohere-Version\", \"2022-12-06\")\n\n self._client = httpx.Client(\n base_url=self.api_base,\n headers=headers,\n timeout=self.timeout_s,\n )\n\n \n if max_input_tokens is not None:\n self._max_input_tokens = max_input_tokens\n else:\n \n model_lower = self.model_name.lower()\n if '8b' in model_lower or 'large' in model_lower:\n self._max_input_tokens = 32768\n else:\n self._max_input_tokens = 8192\n\n @property\n def max_input_tokens(self) -> int:\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", + "score": 0.02051605801605802, + "excerpt": " locked_config = get_locked_model_config()\n\n if locked_config is None:\n return {\n \"is_locked\": False,\n \"has_conflict\": False,\n \"locked_config\": None,\n ...", + "content": " locked_config = get_locked_model_config()\n\n if locked_config is None:\n return {\n \"is_locked\": False,\n \"has_conflict\": False,\n \"locked_config\": None,\n \"target_config\": {\"backend\": target_backend, \"model\": target_model},\n }\n\n has_conflict = (\n locked_config[\"backend\"] != target_backend or\n locked_config[\"model\"] != target_model\n )\n\n return {\n \"is_locked\": True,\n \"has_conflict\": has_conflict,\n \"locked_config\": locked_config,\n \"target_config\": {\"backend\": target_backend, \"model\": target_model},\n }", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\mcp\\provider.py", + "score": 0.020229904287875303, + "excerpt": "\nfrom __future__ import annotations\n\nimport logging\nfrom pathlib import Path\nfrom typing import Optional, List, TYPE_CHECKING\n\nfrom codexlens.mcp.schema import (\n MCPContext,\n SymbolInfo,\n Re...", + "content": "\nfrom __future__ import annotations\n\nimport logging\nfrom pathlib import Path\nfrom typing import Optional, List, TYPE_CHECKING\n\nfrom codexlens.mcp.schema import (\n MCPContext,\n SymbolInfo,\n ReferenceInfo,\n RelatedSymbol,\n)\n\nif TYPE_CHECKING:\n from codexlens.storage.global_index import GlobalSymbolIndex\n from codexlens.storage.registry import RegistryStore\n from codexlens.search.chain_search import ChainSearchEngine\n\nlogger = logging.getLogger(__name__)\n\n\nclass MCPProvider:\n\n def __init__(\n self,\n global_index: \"GlobalSymbolIndex\",\n search_engine: \"ChainSearchEngine\",\n registry: \"RegistryStore\",\n ) -> None:\n self.global_index = global_index\n self.search_engine = search_engine\n self.registry = registry\n\n def build_context(\n self,\n symbol_name: str,\n context_type: str = \"symbol_explanation\",\n include_references: bool = True,\n include_related: bool = True,\n max_references: int = 10,\n ) -> 
Optional[MCPContext]:\n \n symbols = self.global_index.search(symbol_name, prefix_mode=False, limit=1)\n\n if not symbols:\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\mcp\\hooks.py", + "score": 0.020007053720837744, + "excerpt": "\nfrom __future__ import annotations\n\nimport logging\nfrom pathlib import Path\nfrom typing import Any, Dict, Optional, Callable, TYPE_CHECKING\n\nfrom codexlens.mcp.schema import MCPContext\n\nif TYPE_CHECK...", + "content": "\nfrom __future__ import annotations\n\nimport logging\nfrom pathlib import Path\nfrom typing import Any, Dict, Optional, Callable, TYPE_CHECKING\n\nfrom codexlens.mcp.schema import MCPContext\n\nif TYPE_CHECKING:\n from codexlens.mcp.provider import MCPProvider\n\nlogger = logging.getLogger(__name__)\n\n\nclass HookManager:\n\n def __init__(self, mcp_provider: \"MCPProvider\") -> None:\n self.mcp_provider = mcp_provider\n self._pre_hooks: Dict[str, Callable] = {}\n self._post_hooks: Dict[str, Callable] = {}\n\n \n self._register_default_hooks()\n\n def _register_default_hooks(self) -> None:\n self._pre_hooks[\"explain\"] = self._pre_explain_hook\n self._pre_hooks[\"refactor\"] = self._pre_refactor_hook\n self._pre_hooks[\"document\"] = self._pre_document_hook\n\n def execute_pre_hook(\n self,\n action: str,\n params: Dict[str, Any],\n ) -> Optional[MCPContext]:\n hook = self._pre_hooks.get(action)\n\n if not hook:\n logger.debug(f\"No pre-hook for action: {action}\")\n return None\n\n try:\n return hook(params)\n except Exception as e:\n logger.error(f\"Pre-hook failed for {action}: {e}\")\n return None\n\n def execute_post_hook(\n self,\n action: str,\n result: Any,\n ) -> None:\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\association_tree\\deduplicator.py", + "score": 0.019921615989390927, + "excerpt": "\nfrom __future__ import annotations\n\nimport logging\nfrom typing import Dict, List, Optional\n\nfrom .data_structures import (\n CallTree,\n TreeNode,\n UniqueNode,\n)\n\nlogger = logging.getLogger(__...", + "content": "\nfrom __future__ import annotations\n\nimport logging\nfrom typing import Dict, List, Optional\n\nfrom .data_structures import (\n CallTree,\n TreeNode,\n UniqueNode,\n)\n\nlogger = logging.getLogger(__name__)\n\n\n\nKIND_WEIGHTS: Dict[str, float] = {\n \n \"function\": 1.0,\n \"method\": 1.0,\n \"12\": 1.0, \n \"6\": 1.0, \n \n \"class\": 0.8,\n \"5\": 0.8, \n \n \"interface\": 0.7,\n \"11\": 0.7, \n \"type\": 0.6,\n \n \"constructor\": 0.9,\n \"9\": 0.9, \n \n \"variable\": 0.4,\n \"13\": 0.4, \n \"constant\": 0.5,\n \"14\": 0.5, \n \n \"unknown\": 0.3,\n}\n\n\nclass ResultDeduplicator:\n\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\factory.py", + "score": 0.01962803701934137, + "excerpt": " if not ok:\n raise ImportError(err)\n\n from .api_reranker import APIReranker\n\n _ = device \n resolved_model_name = (model_name or \"\").strip() or None\n retu...", + "content": " if not ok:\n raise ImportError(err)\n\n from .api_reranker import APIReranker\n\n _ = device \n resolved_model_name = (model_name or \"\").strip() or None\n return APIReranker(model_name=resolved_model_name, **kwargs)\n\n raise ValueError(\n f\"Unknown backend: {backend}. 
Supported backends: 'fastembed', 'onnx', 'api', 'litellm', 'legacy'\"\n )\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", + "score": 0.015740967294674172, + "excerpt": "import time\nfrom concurrent.futures import ProcessPoolExecutor, as_completed\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import Dict, List, Optional, Set, Tuple", + "content": "\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nimport os\nimport re\nimport sqlite3\nimport time\nfrom concurrent.futures import ProcessPoolExecutor, as_completed\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import Dict, List, Optional, Set, Tuple\n\nfrom codexlens.config import Config\nfrom codexlens.parsers.factory import ParserFactory\nfrom codexlens.storage.dir_index import DirIndexStore\nfrom codexlens.storage.global_index import GlobalSymbolIndex\nfrom codexlens.storage.path_mapper import PathMapper\nfrom codexlens.storage.registry import ProjectInfo, RegistryStore\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", + "score": 0.01569458021070924, + "excerpt": "\nLightweight Mode:\n The ChunkConfig supports a `skip_token_count` option for performance optimization.\n When enabled, token counting uses a fast character-based estimation (char/4)\n instead o...", + "content": "\"\"\"Code chunking strategies for semantic search.\n\nThis module provides various chunking strategies for breaking down source code\ninto semantic chunks suitable for embedding and search.\n\nLightweight Mode:\n The ChunkConfig supports a `skip_token_count` option for performance optimization.\n When enabled, token counting uses a fast character-based estimation (char/4)\n instead of expensive tiktoken encoding.\n\n Use cases for lightweight mode:\n - Large-scale indexing where speed is critical\n - Scenarios where approximate token counts are acceptable\n - Memory-constrained environments\n - Initial prototyping and development\n\n Example:", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py", + "score": 0.015496521189120809, + "excerpt": "from typing import TYPE_CHECKING, List, Optional\n\nfrom .base import BaseClusteringStrategy, ClusteringConfig\n\nif TYPE_CHECKING:", + "content": "\"\"\"No-op clustering strategy for search results.\n\nNoOpStrategy returns all results ungrouped when clustering dependencies\nare not available or clustering is disabled.\n\"\"\"\n\nfrom __future__ import annotations\n\nfrom typing import TYPE_CHECKING, List, Optional\n\nfrom .base import BaseClusteringStrategy, ClusteringConfig\n\nif TYPE_CHECKING:\n import numpy as np\n from codexlens.entities import SearchResult\n\n\nclass NoOpStrategy(BaseClusteringStrategy):\n \"\"\"No-op clustering strategy that returns all results ungrouped.\n\n This strategy is used as a final fallback when no clustering dependencies", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py", + "score": 0.014896214896214899, + "excerpt": "from typing import TYPE_CHECKING, List, Optional\n\nfrom .base import BaseClusteringStrategy, ClusteringConfig\n\nif TYPE_CHECKING:", + "content": "\"\"\"DBSCAN-based clustering strategy for search results.\n\nDBSCAN (Density-Based Spatial Clustering of Applications with Noise)\nis the 
fallback clustering strategy when HDBSCAN is not available.\n\"\"\"\n\nfrom __future__ import annotations\n\nfrom typing import TYPE_CHECKING, List, Optional\n\nfrom .base import BaseClusteringStrategy, ClusteringConfig\n\nif TYPE_CHECKING:\n import numpy as np\n from codexlens.entities import SearchResult\n\n\nclass DBSCANStrategy(BaseClusteringStrategy):\n \"\"\"DBSCAN-based clustering strategy.\n\n Uses sklearn's DBSCAN algorithm as a fallback when HDBSCAN is not available.", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\mcp\\schema.py", + "score": 0.014112903225806453, + "excerpt": " definition: Optional[str] = None\n references: List[ReferenceInfo] = field(default_factory=list)\n related_symbols: List[RelatedSymbol] = field(default_factory=list)\n metadata: dict = field...", + "content": " definition: Optional[str] = None\n references: List[ReferenceInfo] = field(default_factory=list)\n related_symbols: List[RelatedSymbol] = field(default_factory=list)\n metadata: dict = field(default_factory=dict)\n\n def to_dict(self) -> dict:\n result = {\n \"version\": self.version,\n \"context_type\": self.context_type,\n \"metadata\": self.metadata,\n }\n\n if self.symbol:\n result[\"symbol\"] = self.symbol.to_dict()\n if self.definition:\n result[\"definition\"] = self.definition\n if self.references:\n result[\"references\"] = [r.to_dict() for r in self.references]\n if self.related_symbols:\n result[\"related_symbols\"] = [s.to_dict() for s in self.related_symbols]\n\n return result\n\n def to_json(self, indent: int = 2) -> str:\n return json.dumps(self.to_dict(), indent=indent)\n\n def to_prompt_injection(self) -> str:\n parts = [\"\"]\n\n if self.symbol:\n parts.append(f\"## Symbol: {self.symbol.name}\")\n parts.append(f\"Type: {self.symbol.kind}\")\n parts.append(f\"Location: {self.symbol.file_path}:{self.symbol.line_start}\")\n\n if self.definition:\n parts.append(\"\\n## Definition\")\n parts.append(f\"```\\n{self.definition}\\n```\")\n\n if self.references:\n parts.append(f\"\\n## References ({len(self.references)} found)\")\n for ref in self.references[:5]: \n parts.append(f\"- {ref.file_path}:{ref.line} ({ref.relationship_type})\")\n parts.append(f\" ```\\n {ref.context}\\n ```\")\n\n if self.related_symbols:\n parts.append(\"\\n## Related Symbols\")\n for sym in self.related_symbols[:10]: \n parts.append(f\"- {sym.name} ({sym.relationship})\")\n\n parts.append(\"\")\n return \"\\n\".join(parts)\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", + "score": 0.013999118165784833, + "excerpt": "\nclass LspBridge:\n \n DEFAULT_BRIDGE_URL = \"http://127.0.0.1:3457\"\n DEFAULT_TIMEOUT = 30.0 \n DEFAULT_CACHE_TTL = 300 \n DEFAULT_MAX_CACHE_SIZE = 1000 \n\n def __init__(\n self,\n...", + "content": "\nclass LspBridge:\n \n DEFAULT_BRIDGE_URL = \"http://127.0.0.1:3457\"\n DEFAULT_TIMEOUT = 30.0 \n DEFAULT_CACHE_TTL = 300 \n DEFAULT_MAX_CACHE_SIZE = 1000 \n\n def __init__(\n self,\n bridge_url: str = DEFAULT_BRIDGE_URL,\n timeout: float = DEFAULT_TIMEOUT,\n cache_ttl: int = DEFAULT_CACHE_TTL,\n max_cache_size: int = DEFAULT_MAX_CACHE_SIZE,\n use_vscode_bridge: bool = False,\n workspace_root: Optional[str] = None,\n config_file: Optional[str] = None,\n ):\n self.bridge_url = bridge_url\n self.timeout = timeout\n self.cache_ttl = cache_ttl\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\watcher\\events.py", + 
"score": 0.013999118165784833, + "excerpt": "\nfrom __future__ import annotations\n\nimport time\nfrom dataclasses import dataclass, field\nfrom enum import Enum\nfrom pathlib import Path\nfrom typing import List, Optional, Set\n\n\nclass ChangeType(Enum)...", + "content": "\nfrom __future__ import annotations\n\nimport time\nfrom dataclasses import dataclass, field\nfrom enum import Enum\nfrom pathlib import Path\nfrom typing import List, Optional, Set\n\n\nclass ChangeType(Enum):\n CREATED = \"created\"\n MODIFIED = \"modified\"\n DELETED = \"deleted\"\n MOVED = \"moved\"\n\n\n@dataclass\nclass FileEvent:\n path: Path\n change_type: ChangeType\n timestamp: float\n old_path: Optional[Path] = None \n\n\n@dataclass\nclass WatcherConfig:\n debounce_ms: int = 60000 \n ignored_patterns: Set[str] = field(default_factory=lambda: {\n \n \".git\", \".svn\", \".hg\",\n \n \".venv\", \"venv\", \"env\", \"__pycache__\", \".pytest_cache\", \".mypy_cache\", \".ruff_cache\",\n \n \"node_modules\", \"bower_components\", \".npm\", \".yarn\",\n \n \"dist\", \"build\", \"out\", \"target\", \"bin\", \"obj\", \"_build\", \"coverage\", \"htmlcov\",\n \n \".idea\", \".vscode\", \".vs\", \".eclipse\",\n \n \".codexlens\",\n \n \".cache\", \".parcel-cache\", \".turbo\", \".next\", \".nuxt\",\n \n \"logs\", \"tmp\", \"temp\",\n })\n languages: Optional[List[str]] = None \n\n\n@dataclass\nclass PendingQueueStatus:\n file_count: int = 0\n files: List[str] = field(default_factory=list) \n countdown_seconds: int = 0\n last_event_time: Optional[float] = None\n\n\n@dataclass\nclass IndexResult:\n files_indexed: int = 0\n files_removed: int = 0\n symbols_added: int = 0\n symbols_removed: int = 0\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", + "score": 0.013902465515368743, + "excerpt": "\nfrom __future__ import annotations\n\nimport platform\nimport sqlite3\nimport threading\nimport time\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import Any, Dict, List, Optional...", + "content": "\nfrom __future__ import annotations\n\nimport platform\nimport sqlite3\nimport threading\nimport time\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import Any, Dict, List, Optional\n\nfrom codexlens.errors import StorageError\n\n\n@dataclass\nclass ProjectInfo:\n\n id: int\n source_root: Path\n index_root: Path\n created_at: float\n last_indexed: float\n total_files: int\n total_dirs: int\n status: str\n\n\n@dataclass\nclass DirMapping:\n\n id: int\n project_id: int\n source_path: Path\n index_path: Path\n depth: int\n files_count: int\n last_updated: float\n\n\nclass RegistryStore:\n\n DEFAULT_DB_PATH = Path.home() / \".codexlens\" / \"registry.db\"\n\n def __init__(self, db_path: Path | None = None) -> None:\n self.db_path = (db_path or self.DEFAULT_DB_PATH).resolve()\n self._lock = threading.RLock()\n self._local = threading.local()\n self._pool_lock = threading.Lock()\n self._pool: Dict[int, sqlite3.Connection] = {}\n self._pool_generation = 0\n\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_001_normalize_keywords.py", + "score": 0.013678451178451179, + "excerpt": " PRIMARY KEY (file_id, keyword_id),\n FOREIGN KEY (file_id) REFERENCES files (id) ON DELETE CASCADE,\n FOREIGN KEY (keyword_id) REFERENCES keywords (id) ON DELETE CASCAD...", + "content": " PRIMARY KEY (file_id, keyword_id),\n FOREIGN KEY (file_id) 
REFERENCES files (id) ON DELETE CASCADE,\n FOREIGN KEY (keyword_id) REFERENCES keywords (id) ON DELETE CASCADE\n )\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py", + "score": 0.013661202185792351, + "excerpt": " project_root: Project root directory path\n symbol_name: Name of the symbol to find references for\n symbol_kind: Optional symbol kind filter (e.g., 'function', 'class')\n in...", + "content": "def find_references(\n project_root: str,\n symbol_name: str,\n symbol_kind: Optional[str] = None,\n include_definition: bool = True,\n group_by_definition: bool = True,\n limit: int = 100,\n) -> List[GroupedReferences]:\n \"\"\"Find all reference locations for a symbol.\n\n Multi-definition case returns grouped results to resolve ambiguity.\n\n This function wraps ChainSearchEngine.search_references() and groups\n the results by definition location. Each GroupedReferences contains\n a definition and all references that point to it.\n\n Args:\n project_root: Project root directory path\n symbol_name: Name of the symbol to find references for\n symbol_kind: Optional symbol kind filter (e.g., 'function', 'class')\n include_definition: Whether to include the definition location\n in the result (default True)\n group_by_definition: Whether to group references by definition.\n If False, returns a single group with all references.\n (default True)\n limit: Maximum number of references to return (default 100)\n\n Returns:\n List of GroupedReferences. Each group contains:\n - definition: The DefinitionResult for this symbol definition\n - references: List of ReferenceResult pointing to this definition\n\n Raises:\n ValueError: If project_root does not exist or is not a directory\n\n Examples:\n >>> refs = find_references(\"/path/to/project\", \"authenticate\")\n >>> for group in refs:\n ... print(f\"Definition: {group.definition.file_path}:{group.definition.line}\")\n ... for ref in group.references:\n ... 
print(f\" Reference: {ref.file_path}:{ref.line} ({ref.relationship})\")\n\n Note:\n Reference relationship types are normalized:\n - 'calls' -> 'call'\n - 'imports' -> 'import'\n - 'inherits' -> 'inheritance'\n \"\"\"\n # Validate and resolve project root\n project_path = resolve_project(project_root)\n\n # Import here to avoid circular imports\n from codexlens.config import Config\n from codexlens.storage.registry import RegistryStore\n from codexlens.storage.path_mapper import PathMapper\n from codexlens.storage.global_index import GlobalSymbolIndex\n from codexlens.search.chain_search import ChainSearchEngine\n from codexlens.search.chain_search import ReferenceResult as RawReferenceResult\n from codexlens.entities import Symbol\n\n # Initialize infrastructure\n config = Config()\n registry = RegistryStore()\n mapper = PathMapper(config.index_dir)\n\n # Create chain search engine\n engine = ChainSearchEngine(registry, mapper, config=config)\n\n try:\n # Step 1: Find definitions for the symbol\n definitions: List[DefinitionResult] = []\n\n if include_definition or group_by_definition:\n # Search for symbol definitions\n symbols = engine.search_symbols(\n name=symbol_name,\n source_path=project_path,\n kind=symbol_kind,\n )\n\n # Convert Symbol to DefinitionResult\n for sym in symbols:\n # Only include exact name matches for definitions\n if sym.name != symbol_name:\n continue\n\n # Optionally filter by kind\n if symbol_kind and sym.kind != symbol_kind:\n continue\n\n definitions.append(DefinitionResult(\n name=sym.name,\n kind=sym.kind,\n file_path=sym.file or \"\",\n line=sym.range[0] if sym.range else 1,\n end_line=sym.range[1] if sym.range else 1,\n signature=None, # Not available from Symbol\n container=None, # Not available from Symbol\n score=1.0,\n ))\n\n # Step 2: Get all references using ChainSearchEngine\n raw_references = engine.search_references(\n symbol_name=symbol_name,\n source_path=project_path,\n depth=-1,\n limit=limit,\n )\n\n # Step 3: Transform raw references to API ReferenceResult\n api_references: List[ReferenceResult] = []\n for raw_ref in raw_references:\n api_ref = _transform_to_reference_result(raw_ref)\n api_references.append(api_ref)\n\n # Step 4: Group references by definition\n if group_by_definition and definitions:\n return _group_references_by_definition(\n definitions=definitions,\n references=api_references,\n include_definition=include_definition,\n )\n else:\n # Return single group with placeholder definition or first definition\n if definitions:\n definition = definitions[0]\n else:\n # Create placeholder definition when no definition found\n definition = DefinitionResult(\n name=symbol_name,\n kind=symbol_kind or \"unknown\",\n file_path=\"\",\n line=0,\n end_line=0,\n signature=None,\n container=None,\n score=0.0,\n )\n\n return [GroupedReferences(\n definition=definition,\n references=api_references,\n )]\n\n finally:\n engine.close()", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\env_config.py", + "score": 0.01359062143375869, + "excerpt": " \n return default\n\n\ndef get_api_config(\n prefix: str,\n *,\n workspace_root: Path | None = None,\n defaults: Dict[str, Any] | None = None,\n) -> Dict[str, Any]:\n defaults = defaults o...", + "content": " \n return default\n\n\ndef get_api_config(\n prefix: str,\n *,\n workspace_root: Path | None = None,\n defaults: Dict[str, Any] | None = None,\n) -> Dict[str, Any]:\n defaults = defaults or {}\n \n config: Dict[str, Any] = {}\n \n \n field_mapping 
= {\n \"api_key\": f\"{prefix}_API_KEY\",\n \"api_base\": f\"{prefix}_API_BASE\",\n \"model\": f\"{prefix}_MODEL\",\n \"provider\": f\"{prefix}_PROVIDER\",\n \"timeout\": f\"{prefix}_TIMEOUT\",\n }\n \n for field, env_key in field_mapping.items():\n value = get_env(env_key, workspace_root=workspace_root)\n if value is not None:\n \n if field == \"timeout\":\n try:\n config[field] = float(value)\n except ValueError:\n pass\n else:\n config[field] = value\n elif field in defaults:\n config[field] = defaults[field]\n \n return config\n\n\ndef generate_env_example() -> str:\n lines = [\n \"# CodexLens Environment Configuration\",\n \"# Copy this file to .codexlens/.env and fill in your values\",\n \"\",\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\api\\__init__.py", + "score": 0.01359062143375869, + "excerpt": " \"FileContextResult\",\n \"DefinitionResult\",\n \"ReferenceResult\",\n \"GroupedReferences\",\n \"SymbolInfo\",\n \"HoverInfo\",\n \"SemanticResult\",\n \n \"resolve_project\",\n \"normalize_rel...", + "content": " \"FileContextResult\",\n \"DefinitionResult\",\n \"ReferenceResult\",\n \"GroupedReferences\",\n \"SymbolInfo\",\n \"HoverInfo\",\n \"SemanticResult\",\n \n \"resolve_project\",\n \"normalize_relationship_type\",\n \"rank_by_proximity\",\n \"rank_by_score\",\n \n \"find_definition\",\n \"workspace_symbols\",\n \"get_hover\",\n \"file_context\",\n \"find_references\",\n \"semantic_search\",\n]\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_005_cleanup_unused_fields.py", + "score": 0.013517665130568358, + "excerpt": "\nimport logging\nfrom sqlite3 import Connection\n\nlog = logging.getLogger(__name__)\n\n\ndef upgrade(db_conn: Connection):\n cursor = db_conn.cursor()\n\n \n log.info(\"Checking semantic_metadata.keywo...", + "content": "\nimport logging\nfrom sqlite3 import Connection\n\nlog = logging.getLogger(__name__)\n\n\ndef upgrade(db_conn: Connection):\n cursor = db_conn.cursor()\n\n \n log.info(\"Checking semantic_metadata.keywords column...\")\n\n cursor.execute(\n \"SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_metadata'\"\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py", + "score": 0.013495897553868569, + "excerpt": " >>> \n >>> from codexlens.search.clustering import FrequencyConfig\n >>> freq_config = FrequencyConfig(min_frequency=2, group_by=\"symbol\")\n >>> strategy = get_strategy(\"freq...", + "content": " >>> \n >>> from codexlens.search.clustering import FrequencyConfig\n >>> freq_config = FrequencyConfig(min_frequency=2, group_by=\"symbol\")\n >>> strategy = get_strategy(\"frequency\", freq_config)\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\onnx_reranker.py", + "score": 0.013480392156862746, + "excerpt": "\n from optimum.onnxruntime import ORTModelForSequenceClassification\n from transformers import AutoTokenizer\n\n if self.providers is None:\n from ..gpu_sup...", + "content": "\n from optimum.onnxruntime import ORTModelForSequenceClassification\n from transformers import AutoTokenizer\n\n if self.providers is None:\n from ..gpu_support import get_optimal_providers\n\n \n self.providers = get_optimal_providers(\n use_gpu=self.use_gpu, with_device_options=True\n )\n\n \n \n model_kwargs: dict[str, Any] = {}\n try:\n params = 
signature(ORTModelForSequenceClassification.from_pretrained).parameters\n if \"providers\" in params:\n model_kwargs[\"providers\"] = self.providers\n elif \"provider\" in params:\n provider_name = \"CPUExecutionProvider\"\n if self.providers:\n first = self.providers[0]\n provider_name = first[0] if isinstance(first, tuple) else str(first)\n model_kwargs[\"provider\"] = provider_name\n except Exception:\n model_kwargs = {}\n\n try:\n self._model = ORTModelForSequenceClassification.from_pretrained(\n self.model_name,\n **model_kwargs,\n )\n except TypeError:\n \n self._model = ORTModelForSequenceClassification.from_pretrained(self.model_name)\n\n self._tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_fast=True)\n\n \n input_names: set[str] | None = None\n for attr in (\"input_names\", \"model_input_names\"):\n names = getattr(self._model, attr, None)\n if isinstance(names, (list, tuple)) and names:\n input_names = {str(n) for n in names}\n break\n if input_names is None:\n try:\n session = getattr(self._model, \"model\", None)\n if session is not None and hasattr(session, \"get_inputs\"):\n input_names = {i.name for i in session.get_inputs()}\n except Exception:\n input_names = None\n self._model_input_names = input_names\n\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py", + "score": 0.013403880070546739, + "excerpt": " return True\n\n except Exception as e:\n raise StorageError(f\"Failed to load ANN index: {e}\")\n\n def count(self) -> int:\n with self._lock:\n r...", + "content": " return True\n\n except Exception as e:\n raise StorageError(f\"Failed to load ANN index: {e}\")\n\n def count(self) -> int:\n with self._lock:\n return self._current_count\n\n @property\n def capacity(self) -> int:\n with self._lock:\n return self._max_elements\n\n @property\n def usage_ratio(self) -> float:\n with self._lock:\n if self._max_elements == 0:\n return 0.0\n return self._current_count / self._max_elements\n\n @property\n def is_loaded(self) -> bool:\n with self._lock:\n return self._index is not None and self._current_count > 0\n\n\n\nclass BinaryANNIndex:\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\litellm_embedder.py", + "score": 0.01322751322751323, + "excerpt": "class LiteLLMEmbedderWrapper(BaseEmbedder):\n \"\"\"Wrapper for ccw-litellm LiteLLMEmbedder.\n", + "content": "class LiteLLMEmbedderWrapper(BaseEmbedder):\n \"\"\"Wrapper for ccw-litellm LiteLLMEmbedder.\n\n This wrapper adapts the ccw-litellm LiteLLMEmbedder to the CodexLens\n BaseEmbedder interface, enabling seamless integration with CodexLens\n semantic search functionality.\n\n Args:\n model: Model identifier for LiteLLM (default: \"default\")\n **kwargs: Additional arguments passed to LiteLLMEmbedder\n\n Raises:\n ImportError: If ccw-litellm package is not installed\n \"\"\"\n\n def __init__(self, model: str = \"default\", **kwargs) -> None:\n \"\"\"Initialize LiteLLM embedder wrapper.\n\n Args:\n model: Model identifier for LiteLLM (default: \"default\")\n **kwargs: Additional arguments passed to LiteLLMEmbedder\n\n Raises:\n ImportError: If ccw-litellm package is not installed\n \"\"\"\n try:\n from ccw_litellm import LiteLLMEmbedder\n self._embedder = LiteLLMEmbedder(model=model, **kwargs)\n except ImportError as e:\n raise ImportError(\n \"ccw-litellm not installed. 
Install with: pip install ccw-litellm\"\n ) from e\n\n @property\n def embedding_dim(self) -> int:\n \"\"\"Return embedding dimensions from LiteLLMEmbedder.\n\n Returns:\n int: Dimension of the embedding vectors.\n \"\"\"\n return self._embedder.dimensions\n\n @property\n def model_name(self) -> str:\n \"\"\"Return model name from LiteLLMEmbedder.\n\n Returns:\n str: Name or identifier of the underlying model.\n \"\"\"\n return self._embedder.model_name\n\n @property\n def max_tokens(self) -> int:\n \"\"\"Return maximum token limit for the embedding model.\n\n Returns:\n int: Maximum number of tokens that can be embedded at once.\n Reads from LiteLLM config's max_input_tokens property.\n \"\"\"\n # Get from LiteLLM embedder's max_input_tokens property (now exposed)\n if hasattr(self._embedder, 'max_input_tokens'):\n return self._embedder.max_input_tokens\n\n # Fallback: infer from model name\n model_name_lower = self.model_name.lower()\n\n # Large models (8B or \"large\" in name)\n if '8b' in model_name_lower or 'large' in model_name_lower:\n return 32768\n\n # OpenAI text-embedding-3-* models\n if 'text-embedding-3' in model_name_lower:\n return 8191\n\n # Default fallback\n return 8192\n\n def _sanitize_text(self, text: str) -> str:\n \"\"\"Sanitize text to work around ModelScope API routing bug.\n\n ModelScope incorrectly routes text starting with lowercase 'import'\n to an Ollama endpoint, causing failures. This adds a leading space\n to work around the issue without affecting embedding quality.\n\n Args:\n text: Text to sanitize.\n\n Returns:\n Sanitized text safe for embedding API.\n \"\"\"\n if text.startswith('import'):\n return ' ' + text\n return text\n\n def embed_to_numpy(self, texts: str | Iterable[str], **kwargs) -> np.ndarray:\n \"\"\"Embed texts to numpy array using LiteLLMEmbedder.\n\n Args:\n texts: Single text or iterable of texts to embed.\n **kwargs: Additional arguments (ignored for LiteLLM backend).\n Accepts batch_size for API compatibility with fastembed.\n\n Returns:\n numpy.ndarray: Array of shape (n_texts, embedding_dim) containing embeddings.\n \"\"\"\n if isinstance(texts, str):\n texts = [texts]\n else:\n texts = list(texts)\n\n # Sanitize texts to avoid ModelScope routing bug\n texts = [self._sanitize_text(t) for t in texts]\n\n # LiteLLM handles batching internally, ignore batch_size parameter\n return self._embedder.embed(texts)\n\n def embed_single(self, text: str) -> list[float]:\n \"\"\"Generate embedding for a single text.\n\n Args:\n text: Text to embed.\n\n Returns:\n list[float]: Embedding vector as a list of floats.\n \"\"\"\n # Sanitize text before embedding\n sanitized = self._sanitize_text(text)\n embedding = self._embedder.embed([sanitized])\n return embedding[0].tolist()", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\association_tree\\builder.py", + "score": 0.013083213083213086, + "excerpt": "\nfrom __future__ import annotations\n\nimport asyncio\nimport logging\nfrom pathlib import Path\nfrom typing import Dict, List, Optional, Set\n\nfrom codexlens.hybrid_search.data_structures import CallHierar...", + "content": "\nfrom __future__ import annotations\n\nimport asyncio\nimport logging\nfrom pathlib import Path\nfrom typing import Dict, List, Optional, Set\n\nfrom codexlens.hybrid_search.data_structures import CallHierarchyItem, Range\nfrom codexlens.lsp.standalone_manager import StandaloneLspManager\nfrom .data_structures import CallTree, TreeNode\n\nlogger = 
logging.getLogger(__name__)\n\n\nclass AssociationTreeBuilder:\n\n def __init__(\n self,\n lsp_manager: StandaloneLspManager,\n timeout: float = 5.0,\n ):\n self.lsp_manager = lsp_manager\n self.timeout = timeout\n self.visited: Set[str] = set()\n\n async def build_tree(\n self,\n seed_file_path: str,\n seed_line: int,\n seed_character: int = 1,\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py", + "score": 0.012885154061624651, + "excerpt": "import re\nimport sqlite3\nfrom pathlib import Path\nfrom typing import Any, Dict, List, Optional, Tuple\n\ntry:\n from codexlens.parsers.treesitter_parser import TreeSitterSymbolParser\nexcept Exception:...", + "content": "import re\nimport sqlite3\nfrom pathlib import Path\nfrom typing import Any, Dict, List, Optional, Tuple\n\ntry:\n from codexlens.parsers.treesitter_parser import TreeSitterSymbolParser\nexcept Exception: \n TreeSitterSymbolParser = None \n\n\nclass SymbolExtractor:\n\n \n PATTERNS = {\n 'python': {\n 'function': r'^(?:async\\s+)?def\\s+(\\w+)\\s*\\(',\n 'class': r'^class\\s+(\\w+)\\s*[:\\(]',\n 'import': r'^(?:from\\s+([\\w.]+)\\s+)?import\\s+([\\w.,\\s]+)',\n 'call': r'(? Tuple[str, Optional[str], Optional[int], Optional[int]]:\n return (result.path, result.symbol_name, result.start_line, result.end_line)\n\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py", + "score": 0.008680555555555556, + "excerpt": "from typing import TYPE_CHECKING, List, Optional\n\nfrom .base import BaseClusteringStrategy, ClusteringConfig\n\nif TYPE_CHECKING:", + "content": "\"\"\"HDBSCAN-based clustering strategy for search results.\n\nHDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise)\nis the primary clustering strategy for grouping similar search results.\n\"\"\"\n\nfrom __future__ import annotations\n\nfrom typing import TYPE_CHECKING, List, Optional\n\nfrom .base import BaseClusteringStrategy, ClusteringConfig\n\nif TYPE_CHECKING:\n import numpy as np\n from codexlens.entities import SearchResult\n\n\nclass HDBSCANStrategy(BaseClusteringStrategy):\n \"\"\"HDBSCAN-based clustering strategy.\n\n Uses HDBSCAN algorithm to cluster search results based on embedding similarity.", + "source": null, + "symbol": null + } + ], + "stats": { + "dirs_searched": 17, + "files_matched": 50, + "time_ms": 6371.967315673828 + } + } +} diff --git a/codex-lens/_tmp_search2.json b/codex-lens/_tmp_search2.json new file mode 100644 index 00000000..b2cd26ce --- /dev/null +++ b/codex-lens/_tmp_search2.json @@ -0,0 +1,415 @@ +{ + "success": true, + "result": { + "query": "class Config", + "method": "cascade", + "count": 50, + "results": [ + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\hybrid_search\\data_structures.py", + "score": 0.06081658330145309, + "excerpt": " @classmethod\n def from_dict(cls, data: Dict[str, Any]) -> \"CallHierarchyItem\":\n return cls(\n name=data[\"name\"],\n kind=data[\"kind\"],\n file_path=data[\"file...", + "content": " @classmethod\n def from_dict(cls, data: Dict[str, Any]) -> \"CallHierarchyItem\":\n return cls(\n name=data[\"name\"],\n kind=data[\"kind\"],\n file_path=data[\"file_path\"],\n range=Range.from_dict(data[\"range\"]),\n detail=data.get(\"detail\"),\n )\n\n\n@dataclass\nclass CodeSymbolNode:\n\n id: str\n name: str\n kind: str\n file_path: str\n range: Range\n embedding: Optional[List[float]] = None\n raw_code: str 
= \"\"\n docstring: str = \"\"\n score: float = 0.0\n\n def __post_init__(self) -> None:\n if not self.id:\n raise ValueError(\"id cannot be empty\")\n if not self.name:\n raise ValueError(\"name cannot be empty\")\n if not self.kind:\n raise ValueError(\"kind cannot be empty\")\n if not self.file_path:\n raise ValueError(\"file_path cannot be empty\")\n\n def __hash__(self) -> int:\n return hash(self.id)\n\n def __eq__(self, other: object) -> bool:\n if not isinstance(other, CodeSymbolNode):\n return False\n return self.id == other.id\n\n def to_dict(self) -> Dict[str, Any]:\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", + "score": 0.056576452190618645, + "excerpt": "from rich.table import Table\n\nfrom codexlens.config import Config\nfrom codexlens.entities import IndexedFile, SearchResult, Symbol\nfrom codexlens.errors import CodexLensError, ConfigError, ParseError,...", + "content": "import os\nimport shutil\nimport sqlite3\nfrom pathlib import Path\nfrom typing import Annotated, Any, Dict, Iterable, List, Optional\n\nimport typer\nfrom rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn\nfrom rich.table import Table\n\nfrom codexlens.config import Config\nfrom codexlens.entities import IndexedFile, SearchResult, Symbol\nfrom codexlens.errors import CodexLensError, ConfigError, ParseError, StorageError, SearchError\nfrom codexlens.parsers.factory import ParserFactory\nfrom codexlens.storage.path_mapper import PathMapper\nfrom codexlens.storage.registry import RegistryStore, ProjectInfo\nfrom codexlens.storage.index_tree import IndexTreeBuilder\nfrom codexlens.storage.dir_index import DirIndexStore\nfrom codexlens.search.chain_search import ChainSearchEngine, SearchOptions\nfrom codexlens.watcher import WatcherManager, WatcherConfig\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\config.py", + "score": 0.05655744432847353, + "excerpt": "\"\"\"Configuration system for CodexLens.\"\"\"\n\nfrom __future__ import annotations", + "content": "\"\"\"Configuration system for CodexLens.\"\"\"\n\nfrom __future__ import annotations\n\nimport json\nimport logging\nimport os\nfrom dataclasses import dataclass, field\nfrom functools import cached_property\nfrom pathlib import Path\nfrom typing import Any, Dict, List, Optional", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", + "score": 0.049219375000264694, + "excerpt": "\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\nfrom dataclasses import dataclass, field\nfrom pathlib import Path\nfrom typing import List, Optional, Dict, Any, Literal, Tuple, TYPE_CH...", + "content": "\"\"\"Chain search engine for recursive multi-directory searching.\n\nProvides parallel search across directory hierarchies using indexed _index.db files.\nSupports depth-limited traversal, result aggregation, and symbol search.\n\"\"\"\n\nfrom __future__ import annotations\n\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\nfrom dataclasses import dataclass, field\nfrom pathlib import Path\nfrom typing import List, Optional, Dict, Any, Literal, Tuple, TYPE_CHECKING\nimport json\nimport logging\nimport os\nimport time\n\nfrom codexlens.entities import SearchResult, Symbol\n\nif TYPE_CHECKING:", + "source": null, + "symbol": null + }, + { + "path": 
"D:\\Claude_dms3\\codex-lens\\src\\codexlens\\indexing\\embedding.py", + "score": 0.047931429239828446, + "excerpt": " def __init__(\n self,\n model_name: Optional[str] = None,\n use_gpu: bool = True,\n expand_dim: bool = True,\n ) -> None:\n from codexlens.semantic import SEMANTIC_...", + "content": " def __init__(\n self,\n model_name: Optional[str] = None,\n use_gpu: bool = True,\n expand_dim: bool = True,\n ) -> None:\n from codexlens.semantic import SEMANTIC_AVAILABLE\n\n if not SEMANTIC_AVAILABLE:\n raise ImportError(\n \"Semantic search dependencies not available. \"\n \"Install with: pip install codexlens[semantic]\"\n )\n\n self._model_name = model_name or self.DEFAULT_MODEL\n self._use_gpu = use_gpu\n self._expand_dim = expand_dim\n self._model = None\n self._native_dim: Optional[int] = None\n\n \n self._expansion_matrix: Optional[np.ndarray] = None\n\n @property\n def model_name(self) -> str:\n return self._model_name\n\n @property\n def embedding_dim(self) -> int:\n if self._expand_dim:\n return self.TARGET_DIM\n \n if self._native_dim is not None:\n return self._native_dim\n \n model_dims = {\n \"BAAI/bge-large-en-v1.5\": 1024,\n \"BAAI/bge-base-en-v1.5\": 768,\n \"BAAI/bge-small-en-v1.5\": 384,\n \"intfloat/multilingual-e5-large\": 1024,\n }\n return model_dims.get(self._model_name, 1024)\n\n @property\n def max_tokens(self) -> int:\n return 512 \n\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py", + "score": 0.04283104206542711, + "excerpt": "import threading\nimport time\nfrom dataclasses import dataclass, field\nfrom enum import Enum\nfrom typing import Any, Dict, Iterable, List, Optional", + "content": "Provides intelligent load balancing across multiple LiteLLM embedding endpoints\nto maximize throughput while respecting rate limits.\n\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nimport random\nimport threading\nimport time\nfrom dataclasses import dataclass, field\nfrom enum import Enum\nfrom typing import Any, Dict, Iterable, List, Optional\n\nimport numpy as np\n\nfrom .base import BaseEmbedder\n\nlogger = logging.getLogger(__name__)\n\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py", + "score": 0.036886112765573215, + "excerpt": "- Direct subprocess spawning of language servers\n- JSON-RPC 2.0 communication over stdin/stdout\n- Multi-language support via configuration file (lsp-servers.json)\n- Process lifecycle management with a...", + "content": "\"\"\"Standalone Language Server Manager for direct LSP communication.\n\nThis module provides direct communication with language servers via JSON-RPC over stdio,\neliminating the need for VSCode Bridge. 
Similar to cclsp architecture.\n\nFeatures:\n- Direct subprocess spawning of language servers\n- JSON-RPC 2.0 communication over stdin/stdout\n- Multi-language support via configuration file (lsp-servers.json)\n- Process lifecycle management with auto-restart\n- Compatible interface with existing LspBridge\n\"\"\"\n\nfrom __future__ import annotations\n\nimport asyncio\nimport json\nimport logging\nimport os", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\api\\models.py", + "score": 0.03457410829143062, + "excerpt": " container: Containing class/module (if any)\n score: Match score for ranking\n return {k: v for k, v in asdict(self).items() if v is not None}\n\n\n# =================================...", + "content": " container: Containing class/module (if any)\n score: Match score for ranking\n return {k: v for k, v in asdict(self).items() if v is not None}\n\n\n# =============================================================================\n# Section 4.4: find_references dataclasses\n# =============================================================================\n\n@dataclass\nclass ReferenceResult:\n file_path: str\n line: int\n column: int\n context_line: str\n relationship: str # call | import | type_annotation | inheritance\n\n def to_dict(self) -> dict:\n return asdict(self)\n\n\n@dataclass\nclass GroupedReferences:\n definition: DefinitionResult\n references: List[ReferenceResult] = field(default_factory=list)\n\n def to_dict(self) -> dict:\n return {\n \"definition\": self.definition.to_dict(),\n \"references\": [r.to_dict() for r in self.references],\n }\n\n\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py", + "score": 0.03341093379138448, + "excerpt": "\n if TREE_SITTER_AVAILABLE:\n self._initialize_parser()\n\n def _initialize_parser(self) -> None:\n if TreeSitterParser is None or TreeSitterLanguage is None:\n retur...", + "content": "\n if TREE_SITTER_AVAILABLE:\n self._initialize_parser()\n\n def _initialize_parser(self) -> None:\n if TreeSitterParser is None or TreeSitterLanguage is None:\n return\n\n try:\n \n if self.language_id == \"python\":\n import tree_sitter_python\n self._language = TreeSitterLanguage(tree_sitter_python.language())\n elif self.language_id == \"javascript\":\n import tree_sitter_javascript\n self._language = TreeSitterLanguage(tree_sitter_javascript.language())\n elif self.language_id == \"typescript\":\n import tree_sitter_typescript\n \n if self.path is not None and self.path.suffix.lower() == \".tsx\":\n self._language = TreeSitterLanguage(tree_sitter_typescript.language_tsx())\n else:\n self._language = TreeSitterLanguage(tree_sitter_typescript.language_typescript())\n else:\n return\n\n \n self._parser = TreeSitterParser()\n if hasattr(self._parser, \"set_language\"):\n self._parser.set_language(self._language) \n else:\n self._parser.language = self._language \n\n except Exception:\n \n self._parser = None\n self._language = None\n\n def is_available(self) -> bool:\n return self._parser is not None and self._language is not None\n\n def _parse_tree(self, text: str) -> Optional[tuple[bytes, TreeSitterNode]]:\n if not self.is_available() or self._parser is None:\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\watcher\\incremental_indexer.py", + "score": 0.029568673189485736, + "excerpt": "\nimport logging\nfrom dataclasses import dataclass\nfrom pathlib 
import Path\nfrom typing import List, Optional", + "content": "\"\"\"Incremental indexer for processing file changes.\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import List, Optional\n\nfrom codexlens.config import Config\nfrom codexlens.parsers.factory import ParserFactory\nfrom codexlens.storage.dir_index import DirIndexStore\nfrom codexlens.storage.global_index import GlobalSymbolIndex\nfrom codexlens.storage.path_mapper import PathMapper\nfrom codexlens.storage.registry import RegistryStore\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py", + "score": 0.029334400167733504, + "excerpt": "\nfrom collections import defaultdict\nfrom dataclasses import dataclass\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Literal\n", + "content": "\nUse cases:\n- Prioritize commonly called methods/functions\n- Filter out one-off results that may be less relevant\n- Deduplicate results pointing to the same symbol from different locations\n\"\"\"\n\nfrom __future__ import annotations\n\nfrom collections import defaultdict\nfrom dataclasses import dataclass\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Literal\n\nfrom .base import BaseClusteringStrategy, ClusteringConfig\n\nif TYPE_CHECKING:\n import numpy as np\n from codexlens.entities import SearchResult\n\n\n@dataclass", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\association_tree\\data_structures.py", + "score": 0.027925539288870704, + "excerpt": "\n def __len__(self) -> int:\n return len(self.all_nodes)\n\n def __repr__(self) -> str:\n return (\n f\"CallTree(roots={len(self.roots)}, nodes={len(self.all_nodes)}, \"\n ...", + "content": "\n def __len__(self) -> int:\n return len(self.all_nodes)\n\n def __repr__(self) -> str:\n return (\n f\"CallTree(roots={len(self.roots)}, nodes={len(self.all_nodes)}, \"\n f\"depth={self.depth_reached})\"\n )\n\n\n@dataclass\nclass UniqueNode:\n\n file_path: str\n name: str\n kind: str\n range: Range\n min_depth: int = 0\n occurrences: int = 1\n paths: List[List[str]] = field(default_factory=list)\n context_nodes: List[str] = field(default_factory=list)\n score: float = 0.0\n\n @property\n def node_key(self) -> tuple[str, int, int]:\n return (\n self.file_path,\n self.range.start_line,\n self.range.end_line,\n )\n\n def add_path(self, path: List[str]) -> None:\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py", + "score": 0.024369821963687643, + "excerpt": "def _get_configured_index_root() -> Path:\n \"\"\"Get the index root from environment or config file.\n", + "content": "def _get_configured_index_root() -> Path:\n \"\"\"Get the index root from environment or config file.\n\n Priority order:\n 1. CODEXLENS_INDEX_DIR environment variable\n 2. index_dir from ~/.codexlens/config.json\n 3. 
Default: ~/.codexlens/indexes\n \"\"\"\n env_override = os.getenv(\"CODEXLENS_INDEX_DIR\")\n if env_override:\n return Path(env_override).expanduser().resolve()\n\n config_file = Path.home() / \".codexlens\" / \"config.json\"\n if config_file.exists():\n try:\n cfg = json.loads(config_file.read_text(encoding=\"utf-8\"))\n if \"index_dir\" in cfg:\n return Path(cfg[\"index_dir\"]).expanduser().resolve()\n except (json.JSONDecodeError, OSError):\n pass\n\n return Path.home() / \".codexlens\" / \"indexes\"", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", + "score": 0.023949795081967214, + "excerpt": "class VectorStore:\n \"\"\"SQLite-based vector storage with HNSW-accelerated similarity search.\n", + "content": "class VectorStore:\n \"\"\"SQLite-based vector storage with HNSW-accelerated similarity search.\n\n Performance optimizations:\n - HNSW index for O(log N) approximate nearest neighbor search\n - Embedding matrix cached in memory for batch similarity computation (fallback)\n - NumPy vectorized operations instead of Python loops (fallback)\n - Lazy content loading - only fetch full content for top-k results\n - Thread-safe cache invalidation\n - Bulk insert mode for efficient batch operations\n \"\"\"\n\n # Default embedding dimension (used when creating new index)\n DEFAULT_DIM = 768\n\n def __init__(self, db_path: str | Path) -> None:\n if not NUMPY_AVAILABLE:\n raise ImportError(\n \"Semantic search dependencies not available. \"\n \"Install with: pip install codexlens[semantic]\"\n )\n\n self.db_path = Path(db_path)\n self.db_path.parent.mkdir(parents=True, exist_ok=True)\n\n # Embedding cache for fast similarity search (fallback)\n self._cache_lock = threading.RLock()\n self._embedding_matrix: Optional[np.ndarray] = None\n self._embedding_norms: Optional[np.ndarray] = None\n self._chunk_ids: Optional[List[int]] = None\n self._cache_version: int = 0\n\n # ANN index for O(log N) search\n self._ann_index: Optional[ANNIndex] = None\n self._ann_dim: Optional[int] = None\n self._ann_write_lock = threading.Lock() # Protects ANN index modifications\n\n # Bulk insert mode tracking\n self._bulk_insert_mode: bool = False\n self._bulk_insert_ids: List[int] = []\n self._bulk_insert_embeddings: List[np.ndarray] = []\n\n self._init_schema()\n self._init_ann_index()\n\n def _init_schema(self) -> None:\n \"\"\"Initialize vector storage schema.\"\"\"\n with sqlite3.connect(self.db_path) as conn:\n # Enable memory mapping for faster reads\n conn.execute(\"PRAGMA mmap_size = 30000000000\") # 30GB limit\n conn.execute(\"\"\"\n CREATE TABLE IF NOT EXISTS semantic_chunks (\n id INTEGER PRIMARY KEY AUTOINCREMENT,\n file_path TEXT NOT NULL,\n content TEXT NOT NULL,\n embedding BLOB NOT NULL,\n metadata TEXT,\n category TEXT DEFAULT 'code',\n created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP\n )\n \"\"\")\n conn.execute(\"\"\"\n CREATE INDEX IF NOT EXISTS idx_chunks_file\n ON semantic_chunks(file_path)\n \"\"\")\n conn.execute(\"\"\"\n CREATE INDEX IF NOT EXISTS idx_chunks_category\n ON semantic_chunks(category)\n \"\"\")\n # Model configuration table - tracks which model generated the embeddings\n conn.execute(\"\"\"\n CREATE TABLE IF NOT EXISTS embeddings_config (\n id INTEGER PRIMARY KEY CHECK (id = 1),\n model_profile TEXT NOT NULL,\n model_name TEXT NOT NULL,\n embedding_dim INTEGER NOT NULL,\n backend TEXT NOT NULL DEFAULT 'fastembed',\n created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,\n updated_at TIMESTAMP DEFAULT 
CURRENT_TIMESTAMP\n )\n \"\"\")\n\n # Migration: Add backend column to existing tables\n self._migrate_backend_column(conn)\n # Migration: Add category column\n self._migrate_category_column(conn)\n\n conn.commit()\n\n def _migrate_backend_column(self, conn: sqlite3.Connection) -> None:\n \"\"\"Add backend column to existing embeddings_config table if not present.\n\n Args:\n conn: Active SQLite connection\n \"\"\"\n # Check if backend column exists\n cursor = conn.execute(\"PRAGMA table_info(embeddings_config)\")\n columns = [row[1] for row in cursor.fetchall()]\n\n if 'backend' not in columns:\n logger.info(\"Migrating embeddings_config table: adding backend column\")\n conn.execute(\"\"\"\n ALTER TABLE embeddings_config\n ADD COLUMN backend TEXT NOT NULL DEFAULT 'fastembed'\n \"\"\")\n\n def _migrate_category_column(self, conn: sqlite3.Connection) -> None:\n \"\"\"Add category column to existing semantic_chunks table if not present.\n\n Args:\n conn: Active SQLite connection\n \"\"\"\n # Check if category column exists\n cursor = conn.execute(\"PRAGMA table_info(semantic_chunks)\")\n columns = [row[1] for row in cursor.fetchall()]\n\n if 'category' not in columns:\n logger.info(\"Migrating semantic_chunks table: adding category column\")\n conn.execute(\"\"\"\n ALTER TABLE semantic_chunks\n ADD COLUMN category TEXT DEFAULT 'code'\n \"\"\")\n # Create index for fast category filtering\n conn.execute(\"\"\"\n CREATE INDEX IF NOT EXISTS idx_chunks_category\n ON semantic_chunks(category)\n \"\"\")\n\n def _init_ann_index(self) -> None:\n \"\"\"Initialize ANN index (lazy loading from existing data).\"\"\"\n if not HNSWLIB_AVAILABLE:\n logger.debug(\"hnswlib not available, using brute-force search\")\n return\n\n # Try to detect embedding dimension from existing data\n dim = self._detect_embedding_dim()\n if dim is None:\n # No data yet, will initialize on first add\n logger.debug(\"No embeddings found, ANN index will be created on first add\")\n return\n\n self._ann_dim = dim\n\n try:\n self._ann_index = ANNIndex(self.db_path, dim)\n if self._ann_index.load():\n logger.debug(\n \"Loaded ANN index with %d vectors\", self._ann_index.count()\n )\n else:\n # Index file doesn't exist, try to build from SQLite data\n logger.debug(\"ANN index file not found, rebuilding from SQLite\")\n self._rebuild_ann_index_internal()\n except Exception as e:\n logger.warning(\"Failed to initialize ANN index: %s\", e)\n self._ann_index = None\n\n def _detect_embedding_dim(self) -> Optional[int]:\n \"\"\"Detect embedding dimension from existing data.\"\"\"\n with sqlite3.connect(self.db_path) as conn:\n row = conn.execute(\n \"SELECT embedding FROM semantic_chunks LIMIT 1\"\n ).fetchone()\n if row and row[0]:\n # Embedding is stored as float32 blob\n blob = row[0]\n return len(blob) // np.dtype(np.float32).itemsize\n return None\n\n @property\n def dimension(self) -> Optional[int]:\n \"\"\"Return the dimension of embeddings in the store.\n\n Returns:\n Embedding dimension if available, None if store is empty.\n \"\"\"\n if self._ann_dim is not None:\n return self._ann_dim\n self._ann_dim = self._detect_embedding_dim()\n return self._ann_dim\n\n def _rebuild_ann_index_internal(self) -> int:\n \"\"\"Internal method to rebuild ANN index from SQLite data.\"\"\"\n if self._ann_index is None:\n return 0\n\n with sqlite3.connect(self.db_path) as conn:\n conn.execute(\"PRAGMA mmap_size = 30000000000\")\n rows = conn.execute(\n \"SELECT id, embedding FROM semantic_chunks\"\n ).fetchall()\n\n if not rows:\n return 0\n\n # 
Extract IDs and embeddings\n ids = [r[0] for r in rows]\n embeddings = np.vstack([\n np.frombuffer(r[1], dtype=np.float32) for r in rows\n ])\n\n # Add to ANN index\n self._ann_index.add_vectors(ids, embeddings)\n self._ann_index.save()\n\n logger.info(\"Rebuilt ANN index with %d vectors\", len(ids))\n return len(ids)\n\n def rebuild_ann_index(self) -> int:\n \"\"\"Rebuild HNSW index from all chunks in SQLite.\n\n Use this method to:\n - Migrate existing data to use ANN search\n - Repair corrupted index\n - Reclaim space after many deletions\n\n Returns:\n Number of vectors indexed.\n \"\"\"\n if not HNSWLIB_AVAILABLE:\n logger.warning(\"hnswlib not available, cannot rebuild ANN index\")\n return 0\n\n # Detect dimension\n dim = self._detect_embedding_dim()\n if dim is None:\n logger.warning(\"No embeddings found, cannot rebuild ANN index\")\n return 0\n\n self._ann_dim = dim\n\n # Create new index\n try:\n self._ann_index = ANNIndex(self.db_path, dim)\n return self._rebuild_ann_index_internal()\n except Exception as e:\n logger.error(\"Failed to rebuild ANN index: %s\", e)\n self._ann_index = None\n return 0\n\n def _invalidate_cache(self) -> None:\n \"\"\"Invalidate the embedding cache (thread-safe).\"\"\"\n with self._cache_lock:\n self._embedding_matrix = None\n self._embedding_norms = None\n self._chunk_ids = None\n self._cache_version += 1\n\n def _refresh_cache(self) -> bool:\n \"\"\"Load embeddings into numpy matrix for fast similarity search.\n\n Returns:\n True if cache was refreshed successfully, False if no data.\n \"\"\"\n with self._cache_lock:\n with sqlite3.connect(self.db_path) as conn:\n conn.execute(\"PRAGMA mmap_size = 30000000000\")\n rows = conn.execute(\n \"SELECT id, embedding FROM semantic_chunks\"\n ).fetchall()\n\n if not rows:\n self._embedding_matrix = None\n self._embedding_norms = None\n self._chunk_ids = None\n return False\n\n # Extract IDs and embeddings\n self._chunk_ids = [r[0] for r in rows]\n\n # Bulk convert binary blobs to numpy matrix\n embeddings = [\n np.frombuffer(r[1], dtype=np.float32) for r in rows\n ]\n self._embedding_matrix = np.vstack(embeddings)\n\n # Pre-compute norms for faster similarity calculation\n self._embedding_norms = np.linalg.norm(\n self._embedding_matrix, axis=1, keepdims=True\n )\n # Avoid division by zero\n self._embedding_norms = np.where(\n self._embedding_norms == 0, EPSILON, self._embedding_norms\n )\n\n return True\n\n def _ensure_ann_index(self, dim: int) -> bool:\n \"\"\"Ensure ANN index is initialized with correct dimension.\n\n This method is thread-safe and uses double-checked locking.\n\n Args:\n dim: Embedding dimension\n\n Returns:\n True if ANN index is ready, False otherwise\n \"\"\"\n if not HNSWLIB_AVAILABLE:\n return False\n\n # Fast path: index already initialized (no lock needed)\n if self._ann_index is not None:\n return True\n\n # Slow path: acquire lock for initialization\n with self._ann_write_lock:\n # Double-check after acquiring lock\n if self._ann_index is not None:\n return True\n\n try:\n self._ann_dim = dim\n self._ann_index = ANNIndex(self.db_path, dim)\n self._ann_index.load() # Try to load existing\n return True\n except Exception as e:\n logger.warning(\"Failed to initialize ANN index: %s\", e)\n self._ann_index = None\n return False\n\n def add_chunk(\n self, chunk: SemanticChunk, file_path: str, category: str = \"code\"\n ) -> int:\n \"\"\"Add a single chunk with its embedding.\n\n Args:\n chunk: SemanticChunk with embedding\n file_path: Path to the source file\n category: File category 
('code' or 'doc'), default 'code'\n\n Returns:\n The inserted chunk ID.\n \"\"\"\n if chunk.embedding is None:\n raise ValueError(\"Chunk must have embedding before adding to store\")\n\n embedding_arr = np.array(chunk.embedding, dtype=np.float32)\n embedding_blob = embedding_arr.tobytes()\n metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None\n\n with sqlite3.connect(self.db_path) as conn:\n cursor = conn.execute(\n \"\"\"\n INSERT INTO semantic_chunks (file_path, content, embedding, metadata, category)\n VALUES (?, ?, ?, ?, ?)\n \"\"\",\n (file_path, chunk.content, embedding_blob, metadata_json, category)\n )\n conn.commit()\n chunk_id = cursor.lastrowid or 0\n\n # Add to ANN index\n if self._ensure_ann_index(len(chunk.embedding)):\n with self._ann_write_lock:\n try:\n self._ann_index.add_vectors([chunk_id], embedding_arr.reshape(1, -1))\n self._ann_index.save()\n except Exception as e:\n logger.warning(\"Failed to add to ANN index: %s\", e)\n\n # Invalidate cache after modification\n self._invalidate_cache()\n return chunk_id\n\n def add_chunks(\n self, chunks: List[SemanticChunk], file_path: str, category: str = \"code\"\n ) -> List[int]:\n \"\"\"Add multiple chunks with embeddings (batch insert).\n\n Args:\n chunks: List of SemanticChunk objects with embeddings\n file_path: Path to the source file\n category: File category ('code' or 'doc'), default 'code'\n\n Returns:\n List of inserted chunk IDs.\n \"\"\"\n if not chunks:\n return []\n\n # Prepare batch data\n batch_data = []\n embeddings_list = []\n for chunk in chunks:\n if chunk.embedding is None:\n raise ValueError(\"All chunks must have embeddings\")\n embedding_arr = np.array(chunk.embedding, dtype=np.float32)\n embedding_blob = embedding_arr.tobytes()\n metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None\n batch_data.append((file_path, chunk.content, embedding_blob, metadata_json, category))\n embeddings_list.append(embedding_arr)\n\n # Batch insert to SQLite\n with sqlite3.connect(self.db_path) as conn:\n # Get starting ID before insert\n row = conn.execute(\"SELECT MAX(id) FROM semantic_chunks\").fetchone()\n start_id = (row[0] or 0) + 1\n\n conn.executemany(\n \"\"\"\n INSERT INTO semantic_chunks (file_path, content, embedding, metadata, category)\n VALUES (?, ?, ?, ?, ?)\n \"\"\",\n batch_data\n )\n conn.commit()\n # Calculate inserted IDs based on starting ID\n ids = list(range(start_id, start_id + len(chunks)))\n\n # Add to ANN index\n if embeddings_list and self._ensure_ann_index(len(embeddings_list[0])):\n with self._ann_write_lock:\n try:\n embeddings_matrix = np.vstack(embeddings_list)\n self._ann_index.add_vectors(ids, embeddings_matrix)\n self._ann_index.save()\n except Exception as e:\n logger.warning(\"Failed to add batch to ANN index: %s\", e)\n\n # Invalidate cache after modification\n self._invalidate_cache()\n return ids\n\n def add_chunks_batch(\n self,\n chunks_with_paths: List[Tuple[SemanticChunk, str]],\n update_ann: bool = True,\n auto_save_ann: bool = True,\n categories: Optional[List[str]] = None,\n ) -> List[int]:\n \"\"\"Batch insert chunks from multiple files in a single transaction.\n\n This method is optimized for bulk operations during index generation.\n\n Args:\n chunks_with_paths: List of (chunk, file_path) tuples\n update_ann: If True, update ANN index with new vectors (default: True)\n auto_save_ann: If True, save ANN index after update (default: True).\n Set to False for bulk inserts to reduce I/O overhead.\n categories: Optional list of categories per 
chunk. If None, defaults to 'code'.\n If provided, must match length of chunks_with_paths.\n\n Returns:\n List of inserted chunk IDs\n \"\"\"\n if not chunks_with_paths:\n return []\n\n batch_size = len(chunks_with_paths)\n\n # Validate categories if provided\n if categories is not None and len(categories) != batch_size:\n raise ValueError(\n f\"categories length ({len(categories)}) must match \"\n f\"chunks_with_paths length ({batch_size})\"\n )\n\n # Prepare batch data\n batch_data = []\n embeddings_list = []\n for i, (chunk, file_path) in enumerate(chunks_with_paths):\n if chunk.embedding is None:\n raise ValueError(\"All chunks must have embeddings\")\n # Optimize: avoid repeated np.array() if already numpy\n if isinstance(chunk.embedding, np.ndarray):\n embedding_arr = chunk.embedding.astype(np.float32)\n else:\n embedding_arr = np.array(chunk.embedding, dtype=np.float32)\n embedding_blob = embedding_arr.tobytes()\n metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None\n category = categories[i] if categories else \"code\"\n batch_data.append((file_path, chunk.content, embedding_blob, metadata_json, category))\n embeddings_list.append(embedding_arr)\n\n # Batch insert to SQLite in single transaction\n with sqlite3.connect(self.db_path) as conn:\n # Get starting ID before insert\n row = conn.execute(\"SELECT MAX(id) FROM semantic_chunks\").fetchone()\n start_id = (row[0] or 0) + 1\n\n _validate_chunk_id_range(start_id, batch_size)\n\n conn.executemany(\n \"\"\"\n INSERT INTO semantic_chunks (file_path, content, embedding, metadata, category)\n VALUES (?, ?, ?, ?, ?)\n \"\"\",\n batch_data\n )\n conn.commit()\n # Calculate inserted IDs based on starting ID\n ids = list(range(start_id, start_id + batch_size))\n\n # Handle ANN index updates\n if embeddings_list and update_ann and self._ensure_ann_index(len(embeddings_list[0])):\n with self._ann_write_lock:\n # In bulk insert mode, accumulate for later batch update\n if self._bulk_insert_mode:\n self._bulk_insert_ids.extend(ids)\n self._bulk_insert_embeddings.extend(embeddings_list)\n else:\n # Normal mode: update immediately\n try:\n embeddings_matrix = np.vstack(embeddings_list)\n self._ann_index.add_vectors(ids, embeddings_matrix)\n if auto_save_ann:\n self._ann_index.save()\n except Exception as e:\n logger.warning(\"Failed to add batch to ANN index: %s\", e)\n\n # Invalidate cache after modification\n self._invalidate_cache()\n return ids\n\n def add_chunks_batch_numpy(\n self,\n chunks_with_paths: List[Tuple[SemanticChunk, str]],\n embeddings_matrix: np.ndarray,\n update_ann: bool = True,\n auto_save_ann: bool = True,\n categories: Optional[List[str]] = None,\n ) -> List[int]:\n \"\"\"Batch insert chunks with pre-computed numpy embeddings matrix.\n\n This method accepts embeddings as a numpy matrix to avoid list->array conversions.\n Useful when embeddings are already in numpy format from batch encoding.\n\n Args:\n chunks_with_paths: List of (chunk, file_path) tuples (embeddings can be None)\n embeddings_matrix: Pre-computed embeddings as (N, D) numpy array\n update_ann: If True, update ANN index with new vectors (default: True)\n auto_save_ann: If True, save ANN index after update (default: True)\n categories: Optional list of categories per chunk. 
If None, defaults to 'code'.\n\n Returns:\n List of inserted chunk IDs\n \"\"\"\n if not chunks_with_paths:\n return []\n\n batch_size = len(chunks_with_paths)\n\n if len(chunks_with_paths) != embeddings_matrix.shape[0]:\n raise ValueError(\n f\"Mismatch: {len(chunks_with_paths)} chunks but \"\n f\"{embeddings_matrix.shape[0]} embeddings\"\n )\n\n # Validate categories if provided\n if categories is not None and len(categories) != batch_size:\n raise ValueError(\n f\"categories length ({len(categories)}) must match \"\n f\"chunks_with_paths length ({batch_size})\"\n )\n\n # Ensure float32 format\n embeddings_matrix = embeddings_matrix.astype(np.float32)\n\n # Prepare batch data\n batch_data = []\n for i, (chunk, file_path) in enumerate(chunks_with_paths):\n embedding_arr = embeddings_matrix[i]\n embedding_blob = embedding_arr.tobytes()\n metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None\n category = categories[i] if categories else \"code\"\n batch_data.append((file_path, chunk.content, embedding_blob, metadata_json, category))\n\n # Batch insert to SQLite in single transaction\n with sqlite3.connect(self.db_path) as conn:\n # Get starting ID before insert\n row = conn.execute(\"SELECT MAX(id) FROM semantic_chunks\").fetchone()\n start_id = (row[0] or 0) + 1\n\n _validate_chunk_id_range(start_id, batch_size)\n\n conn.executemany(\n \"\"\"\n INSERT INTO semantic_chunks (file_path, content, embedding, metadata, category)\n VALUES (?, ?, ?, ?, ?)\n \"\"\",\n batch_data\n )\n conn.commit()\n # Calculate inserted IDs based on starting ID\n ids = list(range(start_id, start_id + batch_size))\n\n # Handle ANN index updates\n if update_ann and self._ensure_ann_index(embeddings_matrix.shape[1]):\n with self._ann_write_lock:\n # In bulk insert mode, accumulate for later batch update\n if self._bulk_insert_mode:\n self._bulk_insert_ids.extend(ids)\n # Split matrix into individual arrays for accumulation\n self._bulk_insert_embeddings.extend([embeddings_matrix[i] for i in range(len(ids))])\n else:\n # Normal mode: update immediately\n try:\n self._ann_index.add_vectors(ids, embeddings_matrix)\n if auto_save_ann:\n self._ann_index.save()\n except Exception as e:\n logger.warning(\"Failed to add batch to ANN index: %s\", e)\n\n # Invalidate cache after modification\n self._invalidate_cache()\n return ids\n\n def begin_bulk_insert(self) -> None:\n \"\"\"Begin bulk insert mode - disable ANN auto-update for better performance.\n\n Usage:\n store.begin_bulk_insert()\n try:\n for batch in batches:\n store.add_chunks_batch(batch, auto_save_ann=False)\n finally:\n store.end_bulk_insert()\n\n Or use context manager:\n with store.bulk_insert():\n for batch in batches:\n store.add_chunks_batch(batch)\n \"\"\"\n with self._ann_write_lock:\n self._bulk_insert_mode = True\n self._bulk_insert_ids.clear()\n self._bulk_insert_embeddings.clear()\n logger.debug(\"Entered bulk insert mode\")\n\n def end_bulk_insert(self) -> None:\n \"\"\"End bulk insert mode and rebuild ANN index from accumulated data.\n\n This method should be called after all bulk inserts are complete to\n update the ANN index in a single batch operation.\n \"\"\"\n with self._ann_write_lock:\n if not self._bulk_insert_mode:\n logger.warning(\"end_bulk_insert called but not in bulk insert mode\")\n return\n\n self._bulk_insert_mode = False\n bulk_ids = list(self._bulk_insert_ids)\n bulk_embeddings = list(self._bulk_insert_embeddings)\n self._bulk_insert_ids.clear()\n self._bulk_insert_embeddings.clear()\n\n # Update ANN index with 
accumulated data.\n if bulk_ids and bulk_embeddings:\n if self._ensure_ann_index(len(bulk_embeddings[0])):\n with self._ann_write_lock:\n try:\n embeddings_matrix = np.vstack(bulk_embeddings)\n self._ann_index.add_vectors(bulk_ids, embeddings_matrix)\n self._ann_index.save()\n logger.info(\n \"Bulk insert complete: added %d vectors to ANN index\",\n len(bulk_ids),\n )\n except Exception as e:\n logger.error(\"Failed to update ANN index after bulk insert: %s\", e)\n\n logger.debug(\"Exited bulk insert mode\")\n\n class BulkInsertContext:\n \"\"\"Context manager for bulk insert operations.\"\"\"\n\n def __init__(self, store: \"VectorStore\") -> None:\n self.store = store\n\n def __enter__(self) -> \"VectorStore\":\n self.store.begin_bulk_insert()\n return self.store\n\n def __exit__(self, exc_type, exc_val, exc_tb) -> None:\n self.store.end_bulk_insert()\n\n def bulk_insert(self) -> \"VectorStore.BulkInsertContext\":\n \"\"\"Return a context manager for bulk insert operations.\n\n Usage:\n with store.bulk_insert():\n for batch in batches:\n store.add_chunks_batch(batch)\n \"\"\"\n return self.BulkInsertContext(self)\n\n def delete_file_chunks(self, file_path: str) -> int:\n \"\"\"Delete all chunks for a file.\n\n Returns:\n Number of deleted chunks.\n \"\"\"\n # Get chunk IDs before deletion (for ANN index)\n chunk_ids_to_delete = []\n if self._ann_index is not None:\n with sqlite3.connect(self.db_path) as conn:\n rows = conn.execute(\n \"SELECT id FROM semantic_chunks WHERE file_path = ?\",\n (file_path,)\n ).fetchall()\n chunk_ids_to_delete = [r[0] for r in rows]\n\n # Delete from SQLite\n with sqlite3.connect(self.db_path) as conn:\n cursor = conn.execute(\n \"DELETE FROM semantic_chunks WHERE file_path = ?\",\n (file_path,)\n )\n conn.commit()\n deleted = cursor.rowcount\n\n # Remove from ANN index\n if deleted > 0 and self._ann_index is not None and chunk_ids_to_delete:\n with self._ann_write_lock:\n try:\n self._ann_index.remove_vectors(chunk_ids_to_delete)\n self._ann_index.save()\n except Exception as e:\n logger.warning(\"Failed to remove from ANN index: %s\", e)\n\n if deleted > 0:\n self._invalidate_cache()\n return deleted\n\n def search_similar(\n self,\n query_embedding: List[float],\n top_k: int = 10,\n min_score: float = 0.0,\n return_full_content: bool = True,\n category: Optional[str] = None,\n ) -> List[SearchResult]:\n \"\"\"Find chunks most similar to query embedding.\n\n Uses HNSW index for O(log N) search when available, falls back to\n brute-force NumPy search otherwise.\n\n Args:\n query_embedding: Query vector.\n top_k: Maximum results to return.\n min_score: Minimum cosine similarity score in [0.0, 1.0].\n return_full_content: If True, return full code block content.\n category: Optional category filter ('code' or 'doc'). If None, returns all.\n\n Returns:\n List of SearchResult ordered by similarity (highest first).\n \"\"\"\n query_vec = np.array(query_embedding, dtype=np.float32)\n\n if not 0.0 <= min_score <= 1.0:\n raise ValueError(\n f\"Invalid min_score: {min_score}. 
Must be within [0.0, 1.0] for cosine similarity.\"\n )\n\n # Try HNSW search first (O(log N))\n if (\n HNSWLIB_AVAILABLE\n and self._ann_index is not None\n and self._ann_index.is_loaded\n and self._ann_index.count() > 0\n ):\n try:\n return self._search_with_ann(\n query_vec, top_k, min_score, return_full_content, category\n )\n except Exception as e:\n logger.warning(\"ANN search failed, falling back to brute-force: %s\", e)\n\n # Fallback to brute-force search (O(N))\n return self._search_brute_force(\n query_vec, top_k, min_score, return_full_content, category\n )\n\n def _search_with_ann(\n self,\n query_vec: np.ndarray,\n top_k: int,\n min_score: float,\n return_full_content: bool,\n category: Optional[str] = None,\n ) -> List[SearchResult]:\n \"\"\"Search using HNSW index (O(log N)).\n\n Args:\n query_vec: Query vector as numpy array\n top_k: Maximum results to return\n min_score: Minimum cosine similarity score in [0.0, 1.0]\n return_full_content: If True, return full code block content\n category: Optional category filter ('code' or 'doc')\n\n Returns:\n List of SearchResult ordered by similarity (highest first)\n \"\"\"\n # Limit top_k to available vectors to prevent hnswlib error\n ann_count = self._ann_index.count()\n # When category filtering, fetch more candidates to compensate for filtering\n fetch_k = top_k * 3 if category else top_k\n effective_top_k = min(fetch_k, ann_count) if ann_count > 0 else 0\n\n if effective_top_k == 0:\n return []\n\n # HNSW search returns (ids, distances)\n # For cosine space: distance = 1 - similarity\n ids, distances = self._ann_index.search(query_vec, effective_top_k)\n\n if ids is None or distances is None:\n logger.debug(\n \"ANN search returned null results (ids=%s, distances=%s)\",\n ids,\n distances,\n )\n return []\n\n if len(ids) == 0 or len(distances) == 0:\n logger.debug(\n \"ANN search returned empty results (ids=%s, distances=%s)\",\n ids,\n distances,\n )\n return []\n\n if len(ids) != len(distances):\n logger.warning(\n \"ANN search returned mismatched result lengths (%d ids, %d distances)\",\n len(ids),\n len(distances),\n )\n return []\n\n # Convert distances to similarity scores\n scores = [1.0 - d for d in distances]\n\n # Filter by min_score\n filtered = [\n (chunk_id, score)\n for chunk_id, score in zip(ids, scores)\n if score >= min_score\n ]\n\n if not filtered:\n return []\n\n top_ids = [f[0] for f in filtered]\n top_scores = [f[1] for f in filtered]\n\n # Fetch content from SQLite with category filtering\n results = self._fetch_results_by_ids(\n top_ids, top_scores, return_full_content, category\n )\n # Apply final limit after category filtering\n return results[:top_k]\n\n def _search_brute_force(\n self,\n query_vec: np.ndarray,\n top_k: int,\n min_score: float,\n return_full_content: bool,\n category: Optional[str] = None,\n ) -> List[SearchResult]:\n \"\"\"Brute-force search using NumPy (O(N) fallback).\n\n Args:\n query_vec: Query vector as numpy array\n top_k: Maximum results to return\n min_score: Minimum cosine similarity score in [0.0, 1.0]\n return_full_content: If True, return full code block content\n category: Optional category filter ('code' or 'doc')\n\n Returns:\n List of SearchResult ordered by similarity (highest first)\n \"\"\"\n logger.warning(\n \"Using brute-force vector search (hnswlib not available). \"\n \"This may cause high memory usage for large indexes. 
\"\n \"Install hnswlib for better performance: pip install hnswlib\"\n )\n\n with self._cache_lock:\n # Refresh cache if needed\n if self._embedding_matrix is None:\n if not self._refresh_cache():\n return [] # No data\n\n # Vectorized cosine similarity\n query_vec = query_vec.reshape(1, -1)\n query_norm = np.linalg.norm(query_vec)\n if query_norm == 0:\n return []\n\n # Compute all similarities at once: (N,) scores\n # similarity = (A @ B.T) / (||A|| * ||B||)\n dot_products = np.dot(self._embedding_matrix, query_vec.T).flatten()\n scores = dot_products / (self._embedding_norms.flatten() * query_norm)\n\n # Filter by min_score and get top-k indices\n valid_mask = scores >= min_score\n valid_indices = np.where(valid_mask)[0]\n\n if len(valid_indices) == 0:\n return []\n\n # When category filtering, fetch more candidates to compensate for filtering\n fetch_k = top_k * 3 if category else top_k\n\n # Sort by score descending and take top candidates\n valid_scores = scores[valid_indices]\n sorted_order = np.argsort(valid_scores)[::-1][:fetch_k]\n top_indices = valid_indices[sorted_order]\n top_scores = valid_scores[sorted_order]\n\n # Get chunk IDs for top results\n top_ids = [self._chunk_ids[i] for i in top_indices]\n\n # Fetch content only for top-k results (lazy loading) with category filtering\n results = self._fetch_results_by_ids(\n top_ids, top_scores.tolist(), return_full_content, category\n )\n # Apply final limit after category filtering\n return results[:top_k]\n\n def _fetch_results_by_ids(\n self,\n chunk_ids: List[int],\n scores: List[float],\n return_full_content: bool,\n category: Optional[str] = None,\n ) -> List[SearchResult]:\n \"\"\"Fetch full result data for specific chunk IDs.\n\n Args:\n chunk_ids: List of chunk IDs to fetch.\n scores: Corresponding similarity scores.\n return_full_content: Whether to include full content.\n category: Optional category filter ('code' or 'doc').\n\n Returns:\n List of SearchResult objects.\n \"\"\"\n if not chunk_ids:\n return []\n\n # Build parameterized query for IN clause\n placeholders = \",\".join(\"?\" * len(chunk_ids))\n _validate_sql_placeholders(placeholders, len(chunk_ids))\n\n # SQL injection prevention:\n # - Only a validated placeholders string (commas + '?') is interpolated into the query.\n # - User-provided values are passed separately via sqlite3 parameters.\n # - Category filter is added as a separate parameter\n if category:\n query = \"\"\"\n SELECT id, file_path, content, metadata\n FROM semantic_chunks\n WHERE id IN ({placeholders}) AND category = ?\n \"\"\".format(placeholders=placeholders)\n params = list(chunk_ids) + [category]\n else:\n query = \"\"\"\n SELECT id, file_path, content, metadata\n FROM semantic_chunks\n WHERE id IN ({placeholders})\n \"\"\".format(placeholders=placeholders)\n params = chunk_ids\n\n with sqlite3.connect(self.db_path) as conn:\n conn.execute(\"PRAGMA mmap_size = 30000000000\")\n rows = conn.execute(query, params).fetchall()\n\n # Build ID -> row mapping\n id_to_row = {r[0]: r for r in rows}\n\n results = []\n for chunk_id, score in zip(chunk_ids, scores):\n row = id_to_row.get(chunk_id)\n if not row:\n continue\n\n _, file_path, content, metadata_json = row\n metadata = json.loads(metadata_json) if metadata_json else {}\n\n # Build excerpt (short preview)\n excerpt = content[:200] + \"...\" if len(content) > 200 else content\n\n # Extract symbol information from metadata\n symbol_name = metadata.get(\"symbol_name\")\n symbol_kind = metadata.get(\"symbol_kind\")\n start_line = 
metadata.get(\"start_line\")\n end_line = metadata.get(\"end_line\")\n\n # Build Symbol object if we have symbol info\n symbol = None\n if symbol_name and symbol_kind and start_line and end_line:\n try:\n from codexlens.entities import Symbol\n symbol = Symbol(\n name=symbol_name,\n kind=symbol_kind,\n range=(start_line, end_line)\n )\n except Exception:\n pass\n\n results.append(SearchResult(\n path=file_path,\n score=score,\n excerpt=excerpt,\n content=content if return_full_content else None,\n symbol=symbol,\n metadata=metadata,\n start_line=start_line,\n end_line=end_line,\n symbol_name=symbol_name,\n symbol_kind=symbol_kind,\n ))\n\n return results\n\n def count_chunks(self) -> int:\n \"\"\"Count total chunks in store.\"\"\"\n with sqlite3.connect(self.db_path) as conn:\n row = conn.execute(\"SELECT COUNT(*) FROM semantic_chunks\").fetchone()\n return row[0] if row else 0\n\n def get_all_chunks(self) -> List[SemanticChunk]:\n \"\"\"Get all chunks from the store.\n\n Returns:\n List of SemanticChunk objects with id and content.\n \"\"\"\n with sqlite3.connect(self.db_path) as conn:\n conn.row_factory = sqlite3.Row\n rows = conn.execute(\n \"SELECT id, file_path, content, metadata FROM semantic_chunks\"\n ).fetchall()\n\n chunks = []\n for row in rows:\n chunks.append(SemanticChunk(\n id=row[\"id\"],\n content=row[\"content\"],\n file_path=row[\"file_path\"],\n metadata=json.loads(row[\"metadata\"]) if row[\"metadata\"] else None,\n ))\n return chunks\n\n def clear_cache(self) -> None:\n \"\"\"Manually clear the embedding cache.\"\"\"\n self._invalidate_cache()\n\n @property\n def ann_available(self) -> bool:\n \"\"\"Check if ANN index is available and ready.\"\"\"\n return (\n HNSWLIB_AVAILABLE\n and self._ann_index is not None\n and self._ann_index.is_loaded\n )\n\n @property\n def ann_count(self) -> int:\n \"\"\"Get number of vectors in ANN index.\"\"\"\n if self._ann_index is not None:\n return self._ann_index.count()\n return 0\n\n def get_model_config(self) -> Optional[Dict[str, Any]]:\n \"\"\"Get the model configuration used for embeddings in this store.\n\n Returns:\n Dictionary with model_profile, model_name, embedding_dim, backend, or None if not set.\n \"\"\"\n with sqlite3.connect(self.db_path) as conn:\n row = conn.execute(\n \"SELECT model_profile, model_name, embedding_dim, backend, created_at, updated_at \"\n \"FROM embeddings_config WHERE id = 1\"\n ).fetchone()\n if row:\n return {\n \"model_profile\": row[0],\n \"model_name\": row[1],\n \"embedding_dim\": row[2],\n \"backend\": row[3],\n \"created_at\": row[4],\n \"updated_at\": row[5],\n }\n return None\n\n def set_model_config(\n self, model_profile: str, model_name: str, embedding_dim: int, backend: str = 'fastembed'\n ) -> None:\n \"\"\"Set the model configuration for embeddings in this store.\n\n This should be called when generating new embeddings. 
If a different\n model was previously used, this will update the configuration.\n\n Args:\n model_profile: Model profile name (fast, code, minilm, etc.)\n model_name: Full model name (e.g., jinaai/jina-embeddings-v2-base-code)\n embedding_dim: Embedding dimension (e.g., 768)\n backend: Backend used for embeddings (fastembed or litellm, default: fastembed)\n \"\"\"\n with sqlite3.connect(self.db_path) as conn:\n conn.execute(\n \"\"\"\n INSERT INTO embeddings_config (id, model_profile, model_name, embedding_dim, backend)\n VALUES (1, ?, ?, ?, ?)\n ON CONFLICT(id) DO UPDATE SET\n model_profile = excluded.model_profile,\n model_name = excluded.model_name,\n embedding_dim = excluded.embedding_dim,\n backend = excluded.backend,\n updated_at = CURRENT_TIMESTAMP\n \"\"\",\n (model_profile, model_name, embedding_dim, backend)\n )\n conn.commit()\n\n def check_model_compatibility(\n self, model_profile: str, model_name: str, embedding_dim: int\n ) -> Tuple[bool, Optional[str]]:\n \"\"\"Check if the given model is compatible with existing embeddings.\n\n Args:\n model_profile: Model profile to check\n model_name: Model name to check\n embedding_dim: Embedding dimension to check\n\n Returns:\n Tuple of (is_compatible, warning_message).\n is_compatible is True if no existing config or configs match.\n warning_message is a user-friendly message if incompatible.\n \"\"\"\n existing = self.get_model_config()\n if existing is None:\n return True, None\n\n # Check dimension first (most critical)\n if existing[\"embedding_dim\"] != embedding_dim:\n return False, (\n f\"Dimension mismatch: existing embeddings use {existing['embedding_dim']}d \"\n f\"({existing['model_profile']}), but requested model uses {embedding_dim}d \"\n f\"({model_profile}). Use --force to regenerate all embeddings.\"\n )\n\n # Check model (different models with same dimension may have different semantic spaces)\n if existing[\"model_profile\"] != model_profile:\n return False, (\n f\"Model mismatch: existing embeddings use '{existing['model_profile']}' \"\n f\"({existing['model_name']}), but requested '{model_profile}' \"\n f\"({model_name}). Use --force to regenerate all embeddings.\"\n )\n\n return True, None\n\n def close(self) -> None:\n \"\"\"Close the vector store and release resources.\n\n This ensures SQLite connections are closed and ANN index is cleared,\n allowing temporary files to be deleted on Windows.\n \"\"\"\n with self._cache_lock:\n self._embedding_matrix = None\n self._embedding_norms = None\n self._chunk_ids = None\n\n with self._ann_write_lock:\n self._ann_index = None\n\n def __enter__(self) -> \"VectorStore\":\n \"\"\"Context manager entry.\"\"\"\n return self\n\n def __exit__(self, exc_type, exc_val, exc_tb) -> None:\n \"\"\"Context manager exit - close resources.\"\"\"\n self.close()", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py", + "score": 0.02356190140431283, + "excerpt": "\"\"\"Base classes for clustering strategies in the hybrid search pipeline.\n\nThis module defines the abstract base class for clustering strategies used", + "content": "\"\"\"Base classes for clustering strategies in the hybrid search pipeline.\n\nThis module defines the abstract base class for clustering strategies used\nin the staged hybrid search pipeline. 
Strategies cluster search results\nbased on their embeddings and select representative results from each cluster.\n\"\"\"\n\nfrom __future__ import annotations\n\nfrom abc import ABC, abstractmethod\nfrom dataclasses import dataclass, field", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py", + "score": 0.022717150737751757, + "excerpt": "\nimport re\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import Dict, List, Optional, Protocol", + "content": "\"\"\"Parser factory for CodexLens.\n\nPython and JavaScript/TypeScript parsing use Tree-Sitter grammars when\navailable. Regex fallbacks are retained to preserve the existing parser\ninterface and behavior in minimal environments.\n\"\"\"\n\nfrom __future__ import annotations\n\nimport re\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import Dict, List, Optional, Protocol\n\nfrom codexlens.config import Config\nfrom codexlens.entities import CodeRelationship, IndexedFile, RelationshipType, Symbol\nfrom codexlens.parsers.treesitter_parser import TreeSitterSymbolParser\n\n\nclass Parser(Protocol):\n def parse(self, text: str, path: Path) -> IndexedFile: ...", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py", + "score": 0.022282698690396483, + "excerpt": "\"\"\"codex-lens LSP Server implementation using pygls.\n\nThis module provides the main Language Server class and entry point.\n\"\"\"\n", + "content": "\"\"\"codex-lens LSP Server implementation using pygls.\n\nThis module provides the main Language Server class and entry point.\n\"\"\"\n\nfrom __future__ import annotations\n\nimport argparse\nimport logging\nimport sys\nfrom pathlib import Path\nfrom typing import Optional\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", + "score": 0.022258499170812605, + "excerpt": " logger.log(level, \"[TIMING] %s: %.2fms\", name, elapsed_ms)\n\nfrom codexlens.config import Config\nfrom codexlens.config import VECTORS_HNSW_NAME\nfrom codexlens.entities import SearchResult", + "content": " logger: Logger instance to use\n level: Logging level (default DEBUG)\n \"\"\"\n start = time.perf_counter()\n try:\n yield\n finally:\n elapsed_ms = (time.perf_counter() - start) * 1000\n logger.log(level, \"[TIMING] %s: %.2fms\", name, elapsed_ms)\n\nfrom codexlens.config import Config\nfrom codexlens.config import VECTORS_HNSW_NAME\nfrom codexlens.entities import SearchResult\nfrom codexlens.search.ranking import (\n DEFAULT_WEIGHTS,\n FTS_FALLBACK_WEIGHTS,\n QueryIntent,\n apply_symbol_boost,\n cross_encoder_rerank,\n detect_query_intent,\n filter_results_by_category,", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", + "score": 0.022204010428648113, + "excerpt": "import threading\nimport time\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import Any, Dict, List, Optional, Tuple", + "content": "\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nimport hashlib\nimport re\nimport sqlite3\nimport threading\nimport time\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import Any, Dict, List, Optional, Tuple\n\nfrom codexlens.config import Config\nfrom codexlens.entities import CodeRelationship, SearchResult, Symbol\nfrom codexlens.errors import StorageError\nfrom 
codexlens.storage.global_index import GlobalSymbolIndex\n\n\n@dataclass", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py", + "score": 0.022191896701700627, + "excerpt": "from typing import Callable, List, Optional\n\nfrom codexlens.config import Config\nfrom codexlens.storage.path_mapper import PathMapper\nfrom codexlens.storage.registry import RegistryStore", + "content": "from __future__ import annotations\n\nimport json\nimport logging\nimport signal\nimport threading\nimport time\nfrom pathlib import Path\nfrom typing import Callable, List, Optional\n\nfrom codexlens.config import Config\nfrom codexlens.storage.path_mapper import PathMapper\nfrom codexlens.storage.registry import RegistryStore\n\nfrom .events import FileEvent, IndexResult, PendingQueueStatus, WatcherConfig, WatcherStats\nfrom .file_watcher import FileWatcher\nfrom .incremental_indexer import IncrementalIndexer\n\nlogger = logging.getLogger(__name__)\n\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py", + "score": 0.021943278996721462, + "excerpt": "\nThis module provides the semantic_search() function for combining\nvector, structural, and keyword search with configurable fusion strategies.\n\"\"\"\n", + "content": "\"\"\"Semantic search API with RRF fusion.\n\nThis module provides the semantic_search() function for combining\nvector, structural, and keyword search with configurable fusion strategies.\n\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nfrom pathlib import Path\nfrom typing import List, Optional\n\nfrom .models import SemanticResult\nfrom .utils import resolve_project", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\watcher\\file_watcher.py", + "score": 0.021943278996721462, + "excerpt": "from watchdog.events import FileSystemEventHandler\n\nfrom .events import ChangeType, FileEvent, WatcherConfig, PendingQueueStatus\nfrom ..config import Config\n", + "content": "\nimport logging\nimport threading\nimport time\nfrom pathlib import Path\nfrom typing import Callable, Dict, List, Optional\n\nfrom watchdog.observers import Observer\nfrom watchdog.events import FileSystemEventHandler\n\nfrom .events import ChangeType, FileEvent, WatcherConfig, PendingQueueStatus\nfrom ..config import Config\n\nlogger = logging.getLogger(__name__)\n\n# Maximum queue size to prevent unbounded memory growth\n# When exceeded, forces immediate flush to avoid memory exhaustion\nMAX_QUEUE_SIZE = 50000\n\n\nclass _CodexLensHandler(FileSystemEventHandler):", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\api_reranker.py", + "score": 0.02150910700179165, + "excerpt": "\n \n env_api_base = _get_env_with_fallback(\"RERANKER_API_BASE\", self._workspace_root)\n self.api_base = (api_base or env_api_base or defaults[\"api_base\"]).strip().rstrip(\"/\")\n ...", + "content": "\n \n env_api_base = _get_env_with_fallback(\"RERANKER_API_BASE\", self._workspace_root)\n self.api_base = (api_base or env_api_base or defaults[\"api_base\"]).strip().rstrip(\"/\")\n self.endpoint = defaults[\"endpoint\"]\n\n \n env_model = _get_env_with_fallback(\"RERANKER_MODEL\", self._workspace_root)\n self.model_name = (model_name or env_model or defaults[\"default_model\"]).strip()\n if not self.model_name:\n raise ValueError(\"model_name cannot be blank\")\n\n \n resolved_key = api_key 
or _get_env_with_fallback(env_api_key, self._workspace_root) or \"\"\n resolved_key = resolved_key.strip()\n if not resolved_key:\n raise ValueError(\n f\"Missing API key for reranker provider '{self.provider}'. \"\n f\"Pass api_key=... or set ${env_api_key}.\"\n )\n self._api_key = resolved_key\n\n self.timeout_s = float(timeout) if timeout and float(timeout) > 0 else 30.0\n self.max_retries = int(max_retries) if max_retries and int(max_retries) >= 0 else 3\n self.backoff_base_s = float(backoff_base_s) if backoff_base_s and float(backoff_base_s) > 0 else 0.5\n self.backoff_max_s = float(backoff_max_s) if backoff_max_s and float(backoff_max_s) > 0 else 8.0\n\n headers = {\n \"Authorization\": f\"Bearer {self._api_key}\",\n \"Content-Type\": \"application/json\",\n }\n if self.provider == \"cohere\":\n headers.setdefault(\"Cohere-Version\", \"2022-12-06\")\n\n self._client = httpx.Client(\n base_url=self.api_base,\n headers=headers,\n timeout=self.timeout_s,\n )\n\n \n if max_input_tokens is not None:\n self._max_input_tokens = max_input_tokens\n else:\n \n model_lower = self.model_name.lower()\n if '8b' in model_lower or 'large' in model_lower:\n self._max_input_tokens = 32768\n else:\n self._max_input_tokens = 8192\n\n @property\n def max_input_tokens(self) -> int:\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", + "score": 0.02051605801605802, + "excerpt": " locked_config = get_locked_model_config()\n\n if locked_config is None:\n return {\n \"is_locked\": False,\n \"has_conflict\": False,\n \"locked_config\": None,\n ...", + "content": " locked_config = get_locked_model_config()\n\n if locked_config is None:\n return {\n \"is_locked\": False,\n \"has_conflict\": False,\n \"locked_config\": None,\n \"target_config\": {\"backend\": target_backend, \"model\": target_model},\n }\n\n has_conflict = (\n locked_config[\"backend\"] != target_backend or\n locked_config[\"model\"] != target_model\n )\n\n return {\n \"is_locked\": True,\n \"has_conflict\": has_conflict,\n \"locked_config\": locked_config,\n \"target_config\": {\"backend\": target_backend, \"model\": target_model},\n }", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\mcp\\provider.py", + "score": 0.020229904287875303, + "excerpt": "\nfrom __future__ import annotations\n\nimport logging\nfrom pathlib import Path\nfrom typing import Optional, List, TYPE_CHECKING\n\nfrom codexlens.mcp.schema import (\n MCPContext,\n SymbolInfo,\n Re...", + "content": "\nfrom __future__ import annotations\n\nimport logging\nfrom pathlib import Path\nfrom typing import Optional, List, TYPE_CHECKING\n\nfrom codexlens.mcp.schema import (\n MCPContext,\n SymbolInfo,\n ReferenceInfo,\n RelatedSymbol,\n)\n\nif TYPE_CHECKING:\n from codexlens.storage.global_index import GlobalSymbolIndex\n from codexlens.storage.registry import RegistryStore\n from codexlens.search.chain_search import ChainSearchEngine\n\nlogger = logging.getLogger(__name__)\n\n\nclass MCPProvider:\n\n def __init__(\n self,\n global_index: \"GlobalSymbolIndex\",\n search_engine: \"ChainSearchEngine\",\n registry: \"RegistryStore\",\n ) -> None:\n self.global_index = global_index\n self.search_engine = search_engine\n self.registry = registry\n\n def build_context(\n self,\n symbol_name: str,\n context_type: str = \"symbol_explanation\",\n include_references: bool = True,\n include_related: bool = True,\n max_references: int = 10,\n ) -> 
Optional[MCPContext]:\n \n symbols = self.global_index.search(symbol_name, prefix_mode=False, limit=1)\n\n if not symbols:\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\mcp\\hooks.py", + "score": 0.020007053720837744, + "excerpt": "\nfrom __future__ import annotations\n\nimport logging\nfrom pathlib import Path\nfrom typing import Any, Dict, Optional, Callable, TYPE_CHECKING\n\nfrom codexlens.mcp.schema import MCPContext\n\nif TYPE_CHECK...", + "content": "\nfrom __future__ import annotations\n\nimport logging\nfrom pathlib import Path\nfrom typing import Any, Dict, Optional, Callable, TYPE_CHECKING\n\nfrom codexlens.mcp.schema import MCPContext\n\nif TYPE_CHECKING:\n from codexlens.mcp.provider import MCPProvider\n\nlogger = logging.getLogger(__name__)\n\n\nclass HookManager:\n\n def __init__(self, mcp_provider: \"MCPProvider\") -> None:\n self.mcp_provider = mcp_provider\n self._pre_hooks: Dict[str, Callable] = {}\n self._post_hooks: Dict[str, Callable] = {}\n\n \n self._register_default_hooks()\n\n def _register_default_hooks(self) -> None:\n self._pre_hooks[\"explain\"] = self._pre_explain_hook\n self._pre_hooks[\"refactor\"] = self._pre_refactor_hook\n self._pre_hooks[\"document\"] = self._pre_document_hook\n\n def execute_pre_hook(\n self,\n action: str,\n params: Dict[str, Any],\n ) -> Optional[MCPContext]:\n hook = self._pre_hooks.get(action)\n\n if not hook:\n logger.debug(f\"No pre-hook for action: {action}\")\n return None\n\n try:\n return hook(params)\n except Exception as e:\n logger.error(f\"Pre-hook failed for {action}: {e}\")\n return None\n\n def execute_post_hook(\n self,\n action: str,\n result: Any,\n ) -> None:\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\association_tree\\deduplicator.py", + "score": 0.019921615989390927, + "excerpt": "\nfrom __future__ import annotations\n\nimport logging\nfrom typing import Dict, List, Optional\n\nfrom .data_structures import (\n CallTree,\n TreeNode,\n UniqueNode,\n)\n\nlogger = logging.getLogger(__...", + "content": "\nfrom __future__ import annotations\n\nimport logging\nfrom typing import Dict, List, Optional\n\nfrom .data_structures import (\n CallTree,\n TreeNode,\n UniqueNode,\n)\n\nlogger = logging.getLogger(__name__)\n\n\n\nKIND_WEIGHTS: Dict[str, float] = {\n \n \"function\": 1.0,\n \"method\": 1.0,\n \"12\": 1.0, \n \"6\": 1.0, \n \n \"class\": 0.8,\n \"5\": 0.8, \n \n \"interface\": 0.7,\n \"11\": 0.7, \n \"type\": 0.6,\n \n \"constructor\": 0.9,\n \"9\": 0.9, \n \n \"variable\": 0.4,\n \"13\": 0.4, \n \"constant\": 0.5,\n \"14\": 0.5, \n \n \"unknown\": 0.3,\n}\n\n\nclass ResultDeduplicator:\n\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\factory.py", + "score": 0.01962803701934137, + "excerpt": " if not ok:\n raise ImportError(err)\n\n from .api_reranker import APIReranker\n\n _ = device \n resolved_model_name = (model_name or \"\").strip() or None\n retu...", + "content": " if not ok:\n raise ImportError(err)\n\n from .api_reranker import APIReranker\n\n _ = device \n resolved_model_name = (model_name or \"\").strip() or None\n return APIReranker(model_name=resolved_model_name, **kwargs)\n\n raise ValueError(\n f\"Unknown backend: {backend}. 
Supported backends: 'fastembed', 'onnx', 'api', 'litellm', 'legacy'\"\n )\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", + "score": 0.015740967294674172, + "excerpt": "import time\nfrom concurrent.futures import ProcessPoolExecutor, as_completed\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import Dict, List, Optional, Set, Tuple", + "content": "\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nimport os\nimport re\nimport sqlite3\nimport time\nfrom concurrent.futures import ProcessPoolExecutor, as_completed\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import Dict, List, Optional, Set, Tuple\n\nfrom codexlens.config import Config\nfrom codexlens.parsers.factory import ParserFactory\nfrom codexlens.storage.dir_index import DirIndexStore\nfrom codexlens.storage.global_index import GlobalSymbolIndex\nfrom codexlens.storage.path_mapper import PathMapper\nfrom codexlens.storage.registry import ProjectInfo, RegistryStore\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", + "score": 0.01569458021070924, + "excerpt": "\nLightweight Mode:\n The ChunkConfig supports a `skip_token_count` option for performance optimization.\n When enabled, token counting uses a fast character-based estimation (char/4)\n instead o...", + "content": "\"\"\"Code chunking strategies for semantic search.\n\nThis module provides various chunking strategies for breaking down source code\ninto semantic chunks suitable for embedding and search.\n\nLightweight Mode:\n The ChunkConfig supports a `skip_token_count` option for performance optimization.\n When enabled, token counting uses a fast character-based estimation (char/4)\n instead of expensive tiktoken encoding.\n\n Use cases for lightweight mode:\n - Large-scale indexing where speed is critical\n - Scenarios where approximate token counts are acceptable\n - Memory-constrained environments\n - Initial prototyping and development\n\n Example:", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py", + "score": 0.015496521189120809, + "excerpt": "from typing import TYPE_CHECKING, List, Optional\n\nfrom .base import BaseClusteringStrategy, ClusteringConfig\n\nif TYPE_CHECKING:", + "content": "\"\"\"No-op clustering strategy for search results.\n\nNoOpStrategy returns all results ungrouped when clustering dependencies\nare not available or clustering is disabled.\n\"\"\"\n\nfrom __future__ import annotations\n\nfrom typing import TYPE_CHECKING, List, Optional\n\nfrom .base import BaseClusteringStrategy, ClusteringConfig\n\nif TYPE_CHECKING:\n import numpy as np\n from codexlens.entities import SearchResult\n\n\nclass NoOpStrategy(BaseClusteringStrategy):\n \"\"\"No-op clustering strategy that returns all results ungrouped.\n\n This strategy is used as a final fallback when no clustering dependencies", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py", + "score": 0.014896214896214899, + "excerpt": "from typing import TYPE_CHECKING, List, Optional\n\nfrom .base import BaseClusteringStrategy, ClusteringConfig\n\nif TYPE_CHECKING:", + "content": "\"\"\"DBSCAN-based clustering strategy for search results.\n\nDBSCAN (Density-Based Spatial Clustering of Applications with Noise)\nis the 
fallback clustering strategy when HDBSCAN is not available.\n\"\"\"\n\nfrom __future__ import annotations\n\nfrom typing import TYPE_CHECKING, List, Optional\n\nfrom .base import BaseClusteringStrategy, ClusteringConfig\n\nif TYPE_CHECKING:\n import numpy as np\n from codexlens.entities import SearchResult\n\n\nclass DBSCANStrategy(BaseClusteringStrategy):\n \"\"\"DBSCAN-based clustering strategy.\n\n Uses sklearn's DBSCAN algorithm as a fallback when HDBSCAN is not available.", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\mcp\\schema.py", + "score": 0.014112903225806453, + "excerpt": " definition: Optional[str] = None\n references: List[ReferenceInfo] = field(default_factory=list)\n related_symbols: List[RelatedSymbol] = field(default_factory=list)\n metadata: dict = field...", + "content": " definition: Optional[str] = None\n references: List[ReferenceInfo] = field(default_factory=list)\n related_symbols: List[RelatedSymbol] = field(default_factory=list)\n metadata: dict = field(default_factory=dict)\n\n def to_dict(self) -> dict:\n result = {\n \"version\": self.version,\n \"context_type\": self.context_type,\n \"metadata\": self.metadata,\n }\n\n if self.symbol:\n result[\"symbol\"] = self.symbol.to_dict()\n if self.definition:\n result[\"definition\"] = self.definition\n if self.references:\n result[\"references\"] = [r.to_dict() for r in self.references]\n if self.related_symbols:\n result[\"related_symbols\"] = [s.to_dict() for s in self.related_symbols]\n\n return result\n\n def to_json(self, indent: int = 2) -> str:\n return json.dumps(self.to_dict(), indent=indent)\n\n def to_prompt_injection(self) -> str:\n parts = [\"\"]\n\n if self.symbol:\n parts.append(f\"## Symbol: {self.symbol.name}\")\n parts.append(f\"Type: {self.symbol.kind}\")\n parts.append(f\"Location: {self.symbol.file_path}:{self.symbol.line_start}\")\n\n if self.definition:\n parts.append(\"\\n## Definition\")\n parts.append(f\"```\\n{self.definition}\\n```\")\n\n if self.references:\n parts.append(f\"\\n## References ({len(self.references)} found)\")\n for ref in self.references[:5]: \n parts.append(f\"- {ref.file_path}:{ref.line} ({ref.relationship_type})\")\n parts.append(f\" ```\\n {ref.context}\\n ```\")\n\n if self.related_symbols:\n parts.append(\"\\n## Related Symbols\")\n for sym in self.related_symbols[:10]: \n parts.append(f\"- {sym.name} ({sym.relationship})\")\n\n parts.append(\"\")\n return \"\\n\".join(parts)\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", + "score": 0.013999118165784833, + "excerpt": "\nclass LspBridge:\n \n DEFAULT_BRIDGE_URL = \"http://127.0.0.1:3457\"\n DEFAULT_TIMEOUT = 30.0 \n DEFAULT_CACHE_TTL = 300 \n DEFAULT_MAX_CACHE_SIZE = 1000 \n\n def __init__(\n self,\n...", + "content": "\nclass LspBridge:\n \n DEFAULT_BRIDGE_URL = \"http://127.0.0.1:3457\"\n DEFAULT_TIMEOUT = 30.0 \n DEFAULT_CACHE_TTL = 300 \n DEFAULT_MAX_CACHE_SIZE = 1000 \n\n def __init__(\n self,\n bridge_url: str = DEFAULT_BRIDGE_URL,\n timeout: float = DEFAULT_TIMEOUT,\n cache_ttl: int = DEFAULT_CACHE_TTL,\n max_cache_size: int = DEFAULT_MAX_CACHE_SIZE,\n use_vscode_bridge: bool = False,\n workspace_root: Optional[str] = None,\n config_file: Optional[str] = None,\n ):\n self.bridge_url = bridge_url\n self.timeout = timeout\n self.cache_ttl = cache_ttl\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\watcher\\events.py", + 
"score": 0.013999118165784833, + "excerpt": "\nfrom __future__ import annotations\n\nimport time\nfrom dataclasses import dataclass, field\nfrom enum import Enum\nfrom pathlib import Path\nfrom typing import List, Optional, Set\n\n\nclass ChangeType(Enum)...", + "content": "\nfrom __future__ import annotations\n\nimport time\nfrom dataclasses import dataclass, field\nfrom enum import Enum\nfrom pathlib import Path\nfrom typing import List, Optional, Set\n\n\nclass ChangeType(Enum):\n CREATED = \"created\"\n MODIFIED = \"modified\"\n DELETED = \"deleted\"\n MOVED = \"moved\"\n\n\n@dataclass\nclass FileEvent:\n path: Path\n change_type: ChangeType\n timestamp: float\n old_path: Optional[Path] = None \n\n\n@dataclass\nclass WatcherConfig:\n debounce_ms: int = 60000 \n ignored_patterns: Set[str] = field(default_factory=lambda: {\n \n \".git\", \".svn\", \".hg\",\n \n \".venv\", \"venv\", \"env\", \"__pycache__\", \".pytest_cache\", \".mypy_cache\", \".ruff_cache\",\n \n \"node_modules\", \"bower_components\", \".npm\", \".yarn\",\n \n \"dist\", \"build\", \"out\", \"target\", \"bin\", \"obj\", \"_build\", \"coverage\", \"htmlcov\",\n \n \".idea\", \".vscode\", \".vs\", \".eclipse\",\n \n \".codexlens\",\n \n \".cache\", \".parcel-cache\", \".turbo\", \".next\", \".nuxt\",\n \n \"logs\", \"tmp\", \"temp\",\n })\n languages: Optional[List[str]] = None \n\n\n@dataclass\nclass PendingQueueStatus:\n file_count: int = 0\n files: List[str] = field(default_factory=list) \n countdown_seconds: int = 0\n last_event_time: Optional[float] = None\n\n\n@dataclass\nclass IndexResult:\n files_indexed: int = 0\n files_removed: int = 0\n symbols_added: int = 0\n symbols_removed: int = 0\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", + "score": 0.013902465515368743, + "excerpt": "\nfrom __future__ import annotations\n\nimport platform\nimport sqlite3\nimport threading\nimport time\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import Any, Dict, List, Optional...", + "content": "\nfrom __future__ import annotations\n\nimport platform\nimport sqlite3\nimport threading\nimport time\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import Any, Dict, List, Optional\n\nfrom codexlens.errors import StorageError\n\n\n@dataclass\nclass ProjectInfo:\n\n id: int\n source_root: Path\n index_root: Path\n created_at: float\n last_indexed: float\n total_files: int\n total_dirs: int\n status: str\n\n\n@dataclass\nclass DirMapping:\n\n id: int\n project_id: int\n source_path: Path\n index_path: Path\n depth: int\n files_count: int\n last_updated: float\n\n\nclass RegistryStore:\n\n DEFAULT_DB_PATH = Path.home() / \".codexlens\" / \"registry.db\"\n\n def __init__(self, db_path: Path | None = None) -> None:\n self.db_path = (db_path or self.DEFAULT_DB_PATH).resolve()\n self._lock = threading.RLock()\n self._local = threading.local()\n self._pool_lock = threading.Lock()\n self._pool: Dict[int, sqlite3.Connection] = {}\n self._pool_generation = 0\n\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_001_normalize_keywords.py", + "score": 0.013678451178451179, + "excerpt": " PRIMARY KEY (file_id, keyword_id),\n FOREIGN KEY (file_id) REFERENCES files (id) ON DELETE CASCADE,\n FOREIGN KEY (keyword_id) REFERENCES keywords (id) ON DELETE CASCAD...", + "content": " PRIMARY KEY (file_id, keyword_id),\n FOREIGN KEY (file_id) 
REFERENCES files (id) ON DELETE CASCADE,\n FOREIGN KEY (keyword_id) REFERENCES keywords (id) ON DELETE CASCADE\n )\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py", + "score": 0.013661202185792351, + "excerpt": " project_root: Project root directory path\n symbol_name: Name of the symbol to find references for\n symbol_kind: Optional symbol kind filter (e.g., 'function', 'class')\n in...", + "content": "def find_references(\n project_root: str,\n symbol_name: str,\n symbol_kind: Optional[str] = None,\n include_definition: bool = True,\n group_by_definition: bool = True,\n limit: int = 100,\n) -> List[GroupedReferences]:\n \"\"\"Find all reference locations for a symbol.\n\n Multi-definition case returns grouped results to resolve ambiguity.\n\n This function wraps ChainSearchEngine.search_references() and groups\n the results by definition location. Each GroupedReferences contains\n a definition and all references that point to it.\n\n Args:\n project_root: Project root directory path\n symbol_name: Name of the symbol to find references for\n symbol_kind: Optional symbol kind filter (e.g., 'function', 'class')\n include_definition: Whether to include the definition location\n in the result (default True)\n group_by_definition: Whether to group references by definition.\n If False, returns a single group with all references.\n (default True)\n limit: Maximum number of references to return (default 100)\n\n Returns:\n List of GroupedReferences. Each group contains:\n - definition: The DefinitionResult for this symbol definition\n - references: List of ReferenceResult pointing to this definition\n\n Raises:\n ValueError: If project_root does not exist or is not a directory\n\n Examples:\n >>> refs = find_references(\"/path/to/project\", \"authenticate\")\n >>> for group in refs:\n ... print(f\"Definition: {group.definition.file_path}:{group.definition.line}\")\n ... for ref in group.references:\n ... 
print(f\" Reference: {ref.file_path}:{ref.line} ({ref.relationship})\")\n\n Note:\n Reference relationship types are normalized:\n - 'calls' -> 'call'\n - 'imports' -> 'import'\n - 'inherits' -> 'inheritance'\n \"\"\"\n # Validate and resolve project root\n project_path = resolve_project(project_root)\n\n # Import here to avoid circular imports\n from codexlens.config import Config\n from codexlens.storage.registry import RegistryStore\n from codexlens.storage.path_mapper import PathMapper\n from codexlens.storage.global_index import GlobalSymbolIndex\n from codexlens.search.chain_search import ChainSearchEngine\n from codexlens.search.chain_search import ReferenceResult as RawReferenceResult\n from codexlens.entities import Symbol\n\n # Initialize infrastructure\n config = Config()\n registry = RegistryStore()\n mapper = PathMapper(config.index_dir)\n\n # Create chain search engine\n engine = ChainSearchEngine(registry, mapper, config=config)\n\n try:\n # Step 1: Find definitions for the symbol\n definitions: List[DefinitionResult] = []\n\n if include_definition or group_by_definition:\n # Search for symbol definitions\n symbols = engine.search_symbols(\n name=symbol_name,\n source_path=project_path,\n kind=symbol_kind,\n )\n\n # Convert Symbol to DefinitionResult\n for sym in symbols:\n # Only include exact name matches for definitions\n if sym.name != symbol_name:\n continue\n\n # Optionally filter by kind\n if symbol_kind and sym.kind != symbol_kind:\n continue\n\n definitions.append(DefinitionResult(\n name=sym.name,\n kind=sym.kind,\n file_path=sym.file or \"\",\n line=sym.range[0] if sym.range else 1,\n end_line=sym.range[1] if sym.range else 1,\n signature=None, # Not available from Symbol\n container=None, # Not available from Symbol\n score=1.0,\n ))\n\n # Step 2: Get all references using ChainSearchEngine\n raw_references = engine.search_references(\n symbol_name=symbol_name,\n source_path=project_path,\n depth=-1,\n limit=limit,\n )\n\n # Step 3: Transform raw references to API ReferenceResult\n api_references: List[ReferenceResult] = []\n for raw_ref in raw_references:\n api_ref = _transform_to_reference_result(raw_ref)\n api_references.append(api_ref)\n\n # Step 4: Group references by definition\n if group_by_definition and definitions:\n return _group_references_by_definition(\n definitions=definitions,\n references=api_references,\n include_definition=include_definition,\n )\n else:\n # Return single group with placeholder definition or first definition\n if definitions:\n definition = definitions[0]\n else:\n # Create placeholder definition when no definition found\n definition = DefinitionResult(\n name=symbol_name,\n kind=symbol_kind or \"unknown\",\n file_path=\"\",\n line=0,\n end_line=0,\n signature=None,\n container=None,\n score=0.0,\n )\n\n return [GroupedReferences(\n definition=definition,\n references=api_references,\n )]\n\n finally:\n engine.close()", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\env_config.py", + "score": 0.01359062143375869, + "excerpt": " \n return default\n\n\ndef get_api_config(\n prefix: str,\n *,\n workspace_root: Path | None = None,\n defaults: Dict[str, Any] | None = None,\n) -> Dict[str, Any]:\n defaults = defaults o...", + "content": " \n return default\n\n\ndef get_api_config(\n prefix: str,\n *,\n workspace_root: Path | None = None,\n defaults: Dict[str, Any] | None = None,\n) -> Dict[str, Any]:\n defaults = defaults or {}\n \n config: Dict[str, Any] = {}\n \n \n field_mapping 
= {\n \"api_key\": f\"{prefix}_API_KEY\",\n \"api_base\": f\"{prefix}_API_BASE\",\n \"model\": f\"{prefix}_MODEL\",\n \"provider\": f\"{prefix}_PROVIDER\",\n \"timeout\": f\"{prefix}_TIMEOUT\",\n }\n \n for field, env_key in field_mapping.items():\n value = get_env(env_key, workspace_root=workspace_root)\n if value is not None:\n \n if field == \"timeout\":\n try:\n config[field] = float(value)\n except ValueError:\n pass\n else:\n config[field] = value\n elif field in defaults:\n config[field] = defaults[field]\n \n return config\n\n\ndef generate_env_example() -> str:\n lines = [\n \"# CodexLens Environment Configuration\",\n \"# Copy this file to .codexlens/.env and fill in your values\",\n \"\",\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\api\\__init__.py", + "score": 0.01359062143375869, + "excerpt": " \"FileContextResult\",\n \"DefinitionResult\",\n \"ReferenceResult\",\n \"GroupedReferences\",\n \"SymbolInfo\",\n \"HoverInfo\",\n \"SemanticResult\",\n \n \"resolve_project\",\n \"normalize_rel...", + "content": " \"FileContextResult\",\n \"DefinitionResult\",\n \"ReferenceResult\",\n \"GroupedReferences\",\n \"SymbolInfo\",\n \"HoverInfo\",\n \"SemanticResult\",\n \n \"resolve_project\",\n \"normalize_relationship_type\",\n \"rank_by_proximity\",\n \"rank_by_score\",\n \n \"find_definition\",\n \"workspace_symbols\",\n \"get_hover\",\n \"file_context\",\n \"find_references\",\n \"semantic_search\",\n]\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_005_cleanup_unused_fields.py", + "score": 0.013517665130568358, + "excerpt": "\nimport logging\nfrom sqlite3 import Connection\n\nlog = logging.getLogger(__name__)\n\n\ndef upgrade(db_conn: Connection):\n cursor = db_conn.cursor()\n\n \n log.info(\"Checking semantic_metadata.keywo...", + "content": "\nimport logging\nfrom sqlite3 import Connection\n\nlog = logging.getLogger(__name__)\n\n\ndef upgrade(db_conn: Connection):\n cursor = db_conn.cursor()\n\n \n log.info(\"Checking semantic_metadata.keywords column...\")\n\n cursor.execute(\n \"SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_metadata'\"\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py", + "score": 0.013495897553868569, + "excerpt": " >>> \n >>> from codexlens.search.clustering import FrequencyConfig\n >>> freq_config = FrequencyConfig(min_frequency=2, group_by=\"symbol\")\n >>> strategy = get_strategy(\"freq...", + "content": " >>> \n >>> from codexlens.search.clustering import FrequencyConfig\n >>> freq_config = FrequencyConfig(min_frequency=2, group_by=\"symbol\")\n >>> strategy = get_strategy(\"frequency\", freq_config)\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\onnx_reranker.py", + "score": 0.013480392156862746, + "excerpt": "\n from optimum.onnxruntime import ORTModelForSequenceClassification\n from transformers import AutoTokenizer\n\n if self.providers is None:\n from ..gpu_sup...", + "content": "\n from optimum.onnxruntime import ORTModelForSequenceClassification\n from transformers import AutoTokenizer\n\n if self.providers is None:\n from ..gpu_support import get_optimal_providers\n\n \n self.providers = get_optimal_providers(\n use_gpu=self.use_gpu, with_device_options=True\n )\n\n \n \n model_kwargs: dict[str, Any] = {}\n try:\n params = 
signature(ORTModelForSequenceClassification.from_pretrained).parameters\n if \"providers\" in params:\n model_kwargs[\"providers\"] = self.providers\n elif \"provider\" in params:\n provider_name = \"CPUExecutionProvider\"\n if self.providers:\n first = self.providers[0]\n provider_name = first[0] if isinstance(first, tuple) else str(first)\n model_kwargs[\"provider\"] = provider_name\n except Exception:\n model_kwargs = {}\n\n try:\n self._model = ORTModelForSequenceClassification.from_pretrained(\n self.model_name,\n **model_kwargs,\n )\n except TypeError:\n \n self._model = ORTModelForSequenceClassification.from_pretrained(self.model_name)\n\n self._tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_fast=True)\n\n \n input_names: set[str] | None = None\n for attr in (\"input_names\", \"model_input_names\"):\n names = getattr(self._model, attr, None)\n if isinstance(names, (list, tuple)) and names:\n input_names = {str(n) for n in names}\n break\n if input_names is None:\n try:\n session = getattr(self._model, \"model\", None)\n if session is not None and hasattr(session, \"get_inputs\"):\n input_names = {i.name for i in session.get_inputs()}\n except Exception:\n input_names = None\n self._model_input_names = input_names\n\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py", + "score": 0.013403880070546739, + "excerpt": " return True\n\n except Exception as e:\n raise StorageError(f\"Failed to load ANN index: {e}\")\n\n def count(self) -> int:\n with self._lock:\n r...", + "content": " return True\n\n except Exception as e:\n raise StorageError(f\"Failed to load ANN index: {e}\")\n\n def count(self) -> int:\n with self._lock:\n return self._current_count\n\n @property\n def capacity(self) -> int:\n with self._lock:\n return self._max_elements\n\n @property\n def usage_ratio(self) -> float:\n with self._lock:\n if self._max_elements == 0:\n return 0.0\n return self._current_count / self._max_elements\n\n @property\n def is_loaded(self) -> bool:\n with self._lock:\n return self._index is not None and self._current_count > 0\n\n\n\nclass BinaryANNIndex:\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\litellm_embedder.py", + "score": 0.01322751322751323, + "excerpt": "class LiteLLMEmbedderWrapper(BaseEmbedder):\n \"\"\"Wrapper for ccw-litellm LiteLLMEmbedder.\n", + "content": "class LiteLLMEmbedderWrapper(BaseEmbedder):\n \"\"\"Wrapper for ccw-litellm LiteLLMEmbedder.\n\n This wrapper adapts the ccw-litellm LiteLLMEmbedder to the CodexLens\n BaseEmbedder interface, enabling seamless integration with CodexLens\n semantic search functionality.\n\n Args:\n model: Model identifier for LiteLLM (default: \"default\")\n **kwargs: Additional arguments passed to LiteLLMEmbedder\n\n Raises:\n ImportError: If ccw-litellm package is not installed\n \"\"\"\n\n def __init__(self, model: str = \"default\", **kwargs) -> None:\n \"\"\"Initialize LiteLLM embedder wrapper.\n\n Args:\n model: Model identifier for LiteLLM (default: \"default\")\n **kwargs: Additional arguments passed to LiteLLMEmbedder\n\n Raises:\n ImportError: If ccw-litellm package is not installed\n \"\"\"\n try:\n from ccw_litellm import LiteLLMEmbedder\n self._embedder = LiteLLMEmbedder(model=model, **kwargs)\n except ImportError as e:\n raise ImportError(\n \"ccw-litellm not installed. 
Install with: pip install ccw-litellm\"\n ) from e\n\n @property\n def embedding_dim(self) -> int:\n \"\"\"Return embedding dimensions from LiteLLMEmbedder.\n\n Returns:\n int: Dimension of the embedding vectors.\n \"\"\"\n return self._embedder.dimensions\n\n @property\n def model_name(self) -> str:\n \"\"\"Return model name from LiteLLMEmbedder.\n\n Returns:\n str: Name or identifier of the underlying model.\n \"\"\"\n return self._embedder.model_name\n\n @property\n def max_tokens(self) -> int:\n \"\"\"Return maximum token limit for the embedding model.\n\n Returns:\n int: Maximum number of tokens that can be embedded at once.\n Reads from LiteLLM config's max_input_tokens property.\n \"\"\"\n # Get from LiteLLM embedder's max_input_tokens property (now exposed)\n if hasattr(self._embedder, 'max_input_tokens'):\n return self._embedder.max_input_tokens\n\n # Fallback: infer from model name\n model_name_lower = self.model_name.lower()\n\n # Large models (8B or \"large\" in name)\n if '8b' in model_name_lower or 'large' in model_name_lower:\n return 32768\n\n # OpenAI text-embedding-3-* models\n if 'text-embedding-3' in model_name_lower:\n return 8191\n\n # Default fallback\n return 8192\n\n def _sanitize_text(self, text: str) -> str:\n \"\"\"Sanitize text to work around ModelScope API routing bug.\n\n ModelScope incorrectly routes text starting with lowercase 'import'\n to an Ollama endpoint, causing failures. This adds a leading space\n to work around the issue without affecting embedding quality.\n\n Args:\n text: Text to sanitize.\n\n Returns:\n Sanitized text safe for embedding API.\n \"\"\"\n if text.startswith('import'):\n return ' ' + text\n return text\n\n def embed_to_numpy(self, texts: str | Iterable[str], **kwargs) -> np.ndarray:\n \"\"\"Embed texts to numpy array using LiteLLMEmbedder.\n\n Args:\n texts: Single text or iterable of texts to embed.\n **kwargs: Additional arguments (ignored for LiteLLM backend).\n Accepts batch_size for API compatibility with fastembed.\n\n Returns:\n numpy.ndarray: Array of shape (n_texts, embedding_dim) containing embeddings.\n \"\"\"\n if isinstance(texts, str):\n texts = [texts]\n else:\n texts = list(texts)\n\n # Sanitize texts to avoid ModelScope routing bug\n texts = [self._sanitize_text(t) for t in texts]\n\n # LiteLLM handles batching internally, ignore batch_size parameter\n return self._embedder.embed(texts)\n\n def embed_single(self, text: str) -> list[float]:\n \"\"\"Generate embedding for a single text.\n\n Args:\n text: Text to embed.\n\n Returns:\n list[float]: Embedding vector as a list of floats.\n \"\"\"\n # Sanitize text before embedding\n sanitized = self._sanitize_text(text)\n embedding = self._embedder.embed([sanitized])\n return embedding[0].tolist()", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\association_tree\\builder.py", + "score": 0.013083213083213086, + "excerpt": "\nfrom __future__ import annotations\n\nimport asyncio\nimport logging\nfrom pathlib import Path\nfrom typing import Dict, List, Optional, Set\n\nfrom codexlens.hybrid_search.data_structures import CallHierar...", + "content": "\nfrom __future__ import annotations\n\nimport asyncio\nimport logging\nfrom pathlib import Path\nfrom typing import Dict, List, Optional, Set\n\nfrom codexlens.hybrid_search.data_structures import CallHierarchyItem, Range\nfrom codexlens.lsp.standalone_manager import StandaloneLspManager\nfrom .data_structures import CallTree, TreeNode\n\nlogger = 
logging.getLogger(__name__)\n\n\nclass AssociationTreeBuilder:\n\n def __init__(\n self,\n lsp_manager: StandaloneLspManager,\n timeout: float = 5.0,\n ):\n self.lsp_manager = lsp_manager\n self.timeout = timeout\n self.visited: Set[str] = set()\n\n async def build_tree(\n self,\n seed_file_path: str,\n seed_line: int,\n seed_character: int = 1,\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py", + "score": 0.012885154061624651, + "excerpt": "import re\nimport sqlite3\nfrom pathlib import Path\nfrom typing import Any, Dict, List, Optional, Tuple\n\ntry:\n from codexlens.parsers.treesitter_parser import TreeSitterSymbolParser\nexcept Exception:...", + "content": "import re\nimport sqlite3\nfrom pathlib import Path\nfrom typing import Any, Dict, List, Optional, Tuple\n\ntry:\n from codexlens.parsers.treesitter_parser import TreeSitterSymbolParser\nexcept Exception: \n TreeSitterSymbolParser = None \n\n\nclass SymbolExtractor:\n\n \n PATTERNS = {\n 'python': {\n 'function': r'^(?:async\\s+)?def\\s+(\\w+)\\s*\\(',\n 'class': r'^class\\s+(\\w+)\\s*[:\\(]',\n 'import': r'^(?:from\\s+([\\w.]+)\\s+)?import\\s+([\\w.,\\s]+)',\n 'call': r'(? Tuple[str, Optional[str], Optional[int], Optional[int]]:\n return (result.path, result.symbol_name, result.start_line, result.end_line)\n\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py", + "score": 0.008680555555555556, + "excerpt": "from typing import TYPE_CHECKING, List, Optional\n\nfrom .base import BaseClusteringStrategy, ClusteringConfig\n\nif TYPE_CHECKING:", + "content": "\"\"\"HDBSCAN-based clustering strategy for search results.\n\nHDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise)\nis the primary clustering strategy for grouping similar search results.\n\"\"\"\n\nfrom __future__ import annotations\n\nfrom typing import TYPE_CHECKING, List, Optional\n\nfrom .base import BaseClusteringStrategy, ClusteringConfig\n\nif TYPE_CHECKING:\n import numpy as np\n from codexlens.entities import SearchResult\n\n\nclass HDBSCANStrategy(BaseClusteringStrategy):\n \"\"\"HDBSCAN-based clustering strategy.\n\n Uses HDBSCAN algorithm to cluster search results based on embedding similarity.", + "source": null, + "symbol": null + } + ], + "stats": { + "dirs_searched": 17, + "files_matched": 50, + "time_ms": 7219.313144683838 + } + } +} diff --git a/codex-lens/_tmp_verbose.json b/codex-lens/_tmp_verbose.json new file mode 100644 index 00000000..2593104d --- /dev/null +++ b/codex-lens/_tmp_verbose.json @@ -0,0 +1,95 @@ +{ + "success": true, + "result": { + "query": "class Config", + "method": "cascade", + "count": 10, + "results": [ + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\hybrid_search\\data_structures.py", + "score": 0.06081658330145309, + "excerpt": " @classmethod\n def from_dict(cls, data: Dict[str, Any]) -> \"CallHierarchyItem\":\n return cls(\n name=data[\"name\"],\n kind=data[\"kind\"],\n file_path=data[\"file...", + "content": " @classmethod\n def from_dict(cls, data: Dict[str, Any]) -> \"CallHierarchyItem\":\n return cls(\n name=data[\"name\"],\n kind=data[\"kind\"],\n file_path=data[\"file_path\"],\n range=Range.from_dict(data[\"range\"]),\n detail=data.get(\"detail\"),\n )\n\n\n@dataclass\nclass CodeSymbolNode:\n\n id: str\n name: str\n kind: str\n file_path: str\n range: Range\n embedding: Optional[List[float]] = None\n raw_code: str = 
\"\"\n docstring: str = \"\"\n score: float = 0.0\n\n def __post_init__(self) -> None:\n if not self.id:\n raise ValueError(\"id cannot be empty\")\n if not self.name:\n raise ValueError(\"name cannot be empty\")\n if not self.kind:\n raise ValueError(\"kind cannot be empty\")\n if not self.file_path:\n raise ValueError(\"file_path cannot be empty\")\n\n def __hash__(self) -> int:\n return hash(self.id)\n\n def __eq__(self, other: object) -> bool:\n if not isinstance(other, CodeSymbolNode):\n return False\n return self.id == other.id\n\n def to_dict(self) -> Dict[str, Any]:\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", + "score": 0.056576452190618645, + "excerpt": "from rich.table import Table\n\nfrom codexlens.config import Config\nfrom codexlens.entities import IndexedFile, SearchResult, Symbol\nfrom codexlens.errors import CodexLensError, ConfigError, ParseError,...", + "content": "import os\nimport shutil\nimport sqlite3\nfrom pathlib import Path\nfrom typing import Annotated, Any, Dict, Iterable, List, Optional\n\nimport typer\nfrom rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn\nfrom rich.table import Table\n\nfrom codexlens.config import Config\nfrom codexlens.entities import IndexedFile, SearchResult, Symbol\nfrom codexlens.errors import CodexLensError, ConfigError, ParseError, StorageError, SearchError\nfrom codexlens.parsers.factory import ParserFactory\nfrom codexlens.storage.path_mapper import PathMapper\nfrom codexlens.storage.registry import RegistryStore, ProjectInfo\nfrom codexlens.storage.index_tree import IndexTreeBuilder\nfrom codexlens.storage.dir_index import DirIndexStore\nfrom codexlens.search.chain_search import ChainSearchEngine, SearchOptions\nfrom codexlens.watcher import WatcherManager, WatcherConfig\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\config.py", + "score": 0.05655744432847353, + "excerpt": "\"\"\"Configuration system for CodexLens.\"\"\"\n\nfrom __future__ import annotations", + "content": "\"\"\"Configuration system for CodexLens.\"\"\"\n\nfrom __future__ import annotations\n\nimport json\nimport logging\nimport os\nfrom dataclasses import dataclass, field\nfrom functools import cached_property\nfrom pathlib import Path\nfrom typing import Any, Dict, List, Optional", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", + "score": 0.049219375000264694, + "excerpt": "\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\nfrom dataclasses import dataclass, field\nfrom pathlib import Path\nfrom typing import List, Optional, Dict, Any, Literal, Tuple, TYPE_CH...", + "content": "\"\"\"Chain search engine for recursive multi-directory searching.\n\nProvides parallel search across directory hierarchies using indexed _index.db files.\nSupports depth-limited traversal, result aggregation, and symbol search.\n\"\"\"\n\nfrom __future__ import annotations\n\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\nfrom dataclasses import dataclass, field\nfrom pathlib import Path\nfrom typing import List, Optional, Dict, Any, Literal, Tuple, TYPE_CHECKING\nimport json\nimport logging\nimport os\nimport time\n\nfrom codexlens.entities import SearchResult, Symbol\n\nif TYPE_CHECKING:", + "source": null, + "symbol": null + }, + { + "path": 
"D:\\Claude_dms3\\codex-lens\\src\\codexlens\\indexing\\embedding.py", + "score": 0.047931429239828446, + "excerpt": " def __init__(\n self,\n model_name: Optional[str] = None,\n use_gpu: bool = True,\n expand_dim: bool = True,\n ) -> None:\n from codexlens.semantic import SEMANTIC_...", + "content": " def __init__(\n self,\n model_name: Optional[str] = None,\n use_gpu: bool = True,\n expand_dim: bool = True,\n ) -> None:\n from codexlens.semantic import SEMANTIC_AVAILABLE\n\n if not SEMANTIC_AVAILABLE:\n raise ImportError(\n \"Semantic search dependencies not available. \"\n \"Install with: pip install codexlens[semantic]\"\n )\n\n self._model_name = model_name or self.DEFAULT_MODEL\n self._use_gpu = use_gpu\n self._expand_dim = expand_dim\n self._model = None\n self._native_dim: Optional[int] = None\n\n \n self._expansion_matrix: Optional[np.ndarray] = None\n\n @property\n def model_name(self) -> str:\n return self._model_name\n\n @property\n def embedding_dim(self) -> int:\n if self._expand_dim:\n return self.TARGET_DIM\n \n if self._native_dim is not None:\n return self._native_dim\n \n model_dims = {\n \"BAAI/bge-large-en-v1.5\": 1024,\n \"BAAI/bge-base-en-v1.5\": 768,\n \"BAAI/bge-small-en-v1.5\": 384,\n \"intfloat/multilingual-e5-large\": 1024,\n }\n return model_dims.get(self._model_name, 1024)\n\n @property\n def max_tokens(self) -> int:\n return 512 \n\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py", + "score": 0.04283104206542711, + "excerpt": "import threading\nimport time\nfrom dataclasses import dataclass, field\nfrom enum import Enum\nfrom typing import Any, Dict, Iterable, List, Optional", + "content": "Provides intelligent load balancing across multiple LiteLLM embedding endpoints\nto maximize throughput while respecting rate limits.\n\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nimport random\nimport threading\nimport time\nfrom dataclasses import dataclass, field\nfrom enum import Enum\nfrom typing import Any, Dict, Iterable, List, Optional\n\nimport numpy as np\n\nfrom .base import BaseEmbedder\n\nlogger = logging.getLogger(__name__)\n\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py", + "score": 0.036886112765573215, + "excerpt": "- Direct subprocess spawning of language servers\n- JSON-RPC 2.0 communication over stdin/stdout\n- Multi-language support via configuration file (lsp-servers.json)\n- Process lifecycle management with a...", + "content": "\"\"\"Standalone Language Server Manager for direct LSP communication.\n\nThis module provides direct communication with language servers via JSON-RPC over stdio,\neliminating the need for VSCode Bridge. 
Similar to cclsp architecture.\n\nFeatures:\n- Direct subprocess spawning of language servers\n- JSON-RPC 2.0 communication over stdin/stdout\n- Multi-language support via configuration file (lsp-servers.json)\n- Process lifecycle management with auto-restart\n- Compatible interface with existing LspBridge\n\"\"\"\n\nfrom __future__ import annotations\n\nimport asyncio\nimport json\nimport logging\nimport os", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\api\\models.py", + "score": 0.03448209080810879, + "excerpt": " container: Containing class/module (if any)\n score: Match score for ranking\n return {k: v for k, v in asdict(self).items() if v is not None}\n\n\n# =================================...", + "content": " container: Containing class/module (if any)\n score: Match score for ranking\n return {k: v for k, v in asdict(self).items() if v is not None}\n\n\n# =============================================================================\n# Section 4.4: find_references dataclasses\n# =============================================================================\n\n@dataclass\nclass ReferenceResult:\n file_path: str\n line: int\n column: int\n context_line: str\n relationship: str # call | import | type_annotation | inheritance\n\n def to_dict(self) -> dict:\n return asdict(self)\n\n\n@dataclass\nclass GroupedReferences:\n definition: DefinitionResult\n references: List[ReferenceResult] = field(default_factory=list)\n\n def to_dict(self) -> dict:\n return {\n \"definition\": self.definition.to_dict(),\n \"references\": [r.to_dict() for r in self.references],\n }\n\n\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py", + "score": 0.03341093379138448, + "excerpt": "\n if TREE_SITTER_AVAILABLE:\n self._initialize_parser()\n\n def _initialize_parser(self) -> None:\n if TreeSitterParser is None or TreeSitterLanguage is None:\n retur...", + "content": "\n if TREE_SITTER_AVAILABLE:\n self._initialize_parser()\n\n def _initialize_parser(self) -> None:\n if TreeSitterParser is None or TreeSitterLanguage is None:\n return\n\n try:\n \n if self.language_id == \"python\":\n import tree_sitter_python\n self._language = TreeSitterLanguage(tree_sitter_python.language())\n elif self.language_id == \"javascript\":\n import tree_sitter_javascript\n self._language = TreeSitterLanguage(tree_sitter_javascript.language())\n elif self.language_id == \"typescript\":\n import tree_sitter_typescript\n \n if self.path is not None and self.path.suffix.lower() == \".tsx\":\n self._language = TreeSitterLanguage(tree_sitter_typescript.language_tsx())\n else:\n self._language = TreeSitterLanguage(tree_sitter_typescript.language_typescript())\n else:\n return\n\n \n self._parser = TreeSitterParser()\n if hasattr(self._parser, \"set_language\"):\n self._parser.set_language(self._language) \n else:\n self._parser.language = self._language \n\n except Exception:\n \n self._parser = None\n self._language = None\n\n def is_available(self) -> bool:\n return self._parser is not None and self._language is not None\n\n def _parse_tree(self, text: str) -> Optional[tuple[bytes, TreeSitterNode]]:\n if not self.is_available() or self._parser is None:\n", + "source": null, + "symbol": null + }, + { + "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\watcher\\incremental_indexer.py", + "score": 0.029568673189485736, + "excerpt": "\nimport logging\nfrom dataclasses import dataclass\nfrom pathlib 
import Path\nfrom typing import List, Optional", + "content": "\"\"\"Incremental indexer for processing file changes.\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import List, Optional\n\nfrom codexlens.config import Config\nfrom codexlens.parsers.factory import ParserFactory\nfrom codexlens.storage.dir_index import DirIndexStore\nfrom codexlens.storage.global_index import GlobalSymbolIndex\nfrom codexlens.storage.path_mapper import PathMapper\nfrom codexlens.storage.registry import RegistryStore\n", + "source": null, + "symbol": null + } + ], + "stats": { + "dirs_searched": 17, + "files_matched": 10, + "time_ms": 6667.8361892700195 + } + } +} diff --git a/codex-lens/src/codexlens/cli/commands.py b/codex-lens/src/codexlens/cli/commands.py index 687043a5..50a4720d 100644 --- a/codex-lens/src/codexlens/cli/commands.py +++ b/codex-lens/src/codexlens/cli/commands.py @@ -455,6 +455,12 @@ def search( hidden=True, help="[Advanced] Cascade strategy for --method cascade." ), + staged_stage2_mode: Optional[str] = typer.Option( + None, + "--staged-stage2-mode", + hidden=True, + help="[Advanced] Stage 2 expansion mode for cascade strategy 'staged': precomputed | realtime.", + ), # Hidden deprecated parameter for backward compatibility mode: Optional[str] = typer.Option(None, "--mode", hidden=True, help="[DEPRECATED] Use --method instead."), json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), @@ -545,7 +551,7 @@ def search( # Validate cascade_strategy if provided (for advanced users) if internal_cascade_strategy is not None: - valid_strategies = ["binary", "hybrid", "binary_rerank", "dense_rerank"] + valid_strategies = ["binary", "hybrid", "binary_rerank", "dense_rerank", "staged"] if internal_cascade_strategy not in valid_strategies: if json_mode: print_json(success=False, error=f"Invalid cascade strategy: {internal_cascade_strategy}. Must be one of: {', '.join(valid_strategies)}") @@ -606,6 +612,18 @@ def search( engine = ChainSearchEngine(registry, mapper, config=config) + # Optional staged cascade overrides (only meaningful for cascade strategy 'staged') + if staged_stage2_mode is not None: + stage2 = staged_stage2_mode.strip().lower() + if stage2 not in {"precomputed", "realtime"}: + msg = "Invalid --staged-stage2-mode. Must be: precomputed | realtime." + if json_mode: + print_json(success=False, error=msg) + else: + console.print(f"[red]{msg}[/red]") + raise typer.Exit(code=1) + config.staged_stage2_mode = stage2 + # Map method to SearchOptions flags # fts: FTS-only search (optionally with fuzzy) # vector: Pure vector semantic search @@ -986,6 +1004,103 @@ def status( registry.close() +@app.command(name="lsp-status") +def lsp_status( + path: Path = typer.Option(Path("."), "--path", "-p", help="Workspace root for LSP probing."), + probe_file: Optional[Path] = typer.Option( + None, + "--probe-file", + help="Optional file path to probe (starts the matching language server and prints capabilities).", + ), + json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), + verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."), +) -> None: + """Show standalone LSP configuration and optionally probe a language server. + + This exercises the existing LSP server selection/startup path in StandaloneLspManager. 
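+
+    Example (assuming the CLI entry point is installed as `codexlens`; the flags
+    below are the options declared on this command):
+
+        codexlens lsp-status --path . --probe-file src/codexlens/config.py --json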
+ """ + _configure_logging(verbose, json_mode) + + import asyncio + import shutil + + from codexlens.lsp.standalone_manager import StandaloneLspManager + + workspace_root = path.expanduser().resolve() + probe_path = probe_file.expanduser().resolve() if probe_file is not None else None + + async def _run(): + manager = StandaloneLspManager(workspace_root=str(workspace_root)) + await manager.start() + + servers = [] + for language_id, cfg in sorted(manager._configs.items()): # type: ignore[attr-defined] + cmd0 = cfg.command[0] if cfg.command else None + servers.append( + { + "language_id": language_id, + "display_name": cfg.display_name, + "extensions": list(cfg.extensions), + "command": list(cfg.command), + "command_available": bool(shutil.which(cmd0)) if cmd0 else False, + } + ) + + probe = None + if probe_path is not None: + state = await manager._get_server(str(probe_path)) + if state is None: + probe = { + "file": str(probe_path), + "ok": False, + "error": "No language server configured/available for this file.", + } + else: + probe = { + "file": str(probe_path), + "ok": True, + "language_id": state.config.language_id, + "display_name": state.config.display_name, + "initialized": bool(state.initialized), + "capabilities": state.capabilities, + } + + await manager.stop() + return {"workspace_root": str(workspace_root), "servers": servers, "probe": probe} + + try: + payload = asyncio.run(_run()) + except Exception as exc: + if json_mode: + print_json(success=False, error=f"LSP status failed: {exc}") + else: + console.print(f"[red]LSP status failed:[/red] {exc}") + raise typer.Exit(code=1) + + if json_mode: + print_json(success=True, result=payload) + return + + console.print("[bold]CodexLens LSP Status[/bold]") + console.print(f" Workspace: {payload['workspace_root']}") + console.print("\n[bold]Configured Servers:[/bold]") + for s in payload["servers"]: + ok = "✓" if s["command_available"] else "✗" + console.print(f" {ok} {s['display_name']} ({s['language_id']}) -> {s['command'][0] if s['command'] else ''}") + console.print(f" Extensions: {', '.join(s['extensions'])}") + + if payload["probe"] is not None: + probe = payload["probe"] + console.print("\n[bold]Probe:[/bold]") + if not probe.get("ok"): + console.print(f" ✗ {probe.get('file')}") + console.print(f" {probe.get('error')}") + else: + console.print(f" ✓ {probe.get('file')}") + console.print(f" Server: {probe.get('display_name')} ({probe.get('language_id')})") + console.print(f" Initialized: {probe.get('initialized')}") + + @app.command() def projects( action: str = typer.Argument("list", help="Action: list, show, remove"), @@ -3962,4 +4077,3 @@ def index_migrate_deprecated( json_mode=json_mode, verbose=verbose, ) - diff --git a/codex-lens/src/codexlens/config.py b/codex-lens/src/codexlens/config.py index b5012f64..c46deb04 100644 --- a/codex-lens/src/codexlens/config.py +++ b/codex-lens/src/codexlens/config.py @@ -145,6 +145,11 @@ class Config: # Staged cascade search configuration (4-stage pipeline) staged_coarse_k: int = 200 # Number of coarse candidates from Stage 1 binary search staged_lsp_depth: int = 2 # LSP relationship expansion depth in Stage 2 + staged_stage2_mode: str = "precomputed" # "precomputed" (graph_neighbors) | "realtime" (LSP) + staged_realtime_lsp_timeout_s: float = 10.0 # Max time budget for realtime LSP expansion + staged_realtime_lsp_max_nodes: int = 100 # Node cap for realtime graph expansion + staged_realtime_lsp_warmup_s: float = 2.0 # Wait for server analysis after opening seed docs + 
staged_realtime_lsp_resolve_symbols: bool = False # If True, resolves symbol names via documentSymbol (slower) staged_clustering_strategy: str = "auto" # "auto", "hdbscan", "dbscan", "frequency", "noop" staged_clustering_min_size: int = 3 # Minimum cluster size for Stage 3 grouping enable_staged_rerank: bool = True # Enable optional cross-encoder reranking in Stage 4 diff --git a/codex-lens/src/codexlens/lsp/lsp_bridge.py b/codex-lens/src/codexlens/lsp/lsp_bridge.py index 4f25b055..d3e2523b 100644 --- a/codex-lens/src/codexlens/lsp/lsp_bridge.py +++ b/codex-lens/src/codexlens/lsp/lsp_bridge.py @@ -20,6 +20,7 @@ from collections import OrderedDict from dataclasses import dataclass, field from pathlib import Path from typing import Any, Dict, List, Optional, TYPE_CHECKING +from urllib.parse import unquote if TYPE_CHECKING: from codexlens.lsp.standalone_manager import StandaloneLspManager @@ -62,12 +63,14 @@ class Location: """ # Handle VSCode URI format (file:///path/to/file) uri = data.get("uri", data.get("file_path", "")) - if uri.startswith("file:///"): - # Windows: file:///C:/path -> C:/path - # Unix: file:///path -> /path - file_path = uri[8:] if uri[8:9].isalpha() and uri[9:10] == ":" else uri[7:] - elif uri.startswith("file://"): - file_path = uri[7:] + if uri.startswith("file://"): + # Strip scheme and decode percent-encoding (e.g. file:///d%3A/...). + # Keep behavior compatible with both Windows and Unix paths. + raw = unquote(uri[7:]) # keep leading slash for Unix paths + # Windows: file:///C:/... or file:///c%3A/... -> C:/... + if raw.startswith("/") and len(raw) > 2 and raw[2] == ":": + raw = raw[1:] + file_path = raw else: file_path = uri diff --git a/codex-lens/src/codexlens/lsp/lsp_graph_builder.py b/codex-lens/src/codexlens/lsp/lsp_graph_builder.py index b5f42a75..a0ed381e 100644 --- a/codex-lens/src/codexlens/lsp/lsp_graph_builder.py +++ b/codex-lens/src/codexlens/lsp/lsp_graph_builder.py @@ -28,6 +28,7 @@ class LspGraphBuilder: max_depth: int = 2, max_nodes: int = 100, max_concurrent: int = 10, + resolve_symbols: bool = True, ): """Initialize GraphBuilder. @@ -35,10 +36,12 @@ class LspGraphBuilder: max_depth: Maximum depth for BFS expansion from seeds. max_nodes: Maximum number of nodes in the graph. max_concurrent: Maximum concurrent LSP requests. + resolve_symbols: If False, skip documentSymbol lookups and create lightweight nodes. 
""" self.max_depth = max_depth self.max_nodes = max_nodes self.max_concurrent = max_concurrent + self.resolve_symbols = resolve_symbols # Cache for document symbols per file (avoids per-location hover queries) self._document_symbols_cache: Dict[str, List[Dict[str, Any]]] = {} @@ -276,9 +279,11 @@ class LspGraphBuilder: start_line = location.line # Try to find symbol info from cached document symbols (fast) - symbol_info = await self._get_symbol_at_location( - file_path, start_line, lsp_bridge - ) + symbol_info = None + if self.resolve_symbols: + symbol_info = await self._get_symbol_at_location( + file_path, start_line, lsp_bridge + ) if symbol_info: name = symbol_info.get("name", f"symbol_L{start_line}") diff --git a/codex-lens/src/codexlens/search/chain_search.py b/codex-lens/src/codexlens/search/chain_search.py index 5a06b93c..d609ed26 100644 --- a/codex-lens/src/codexlens/search/chain_search.py +++ b/codex-lens/src/codexlens/search/chain_search.py @@ -1094,15 +1094,15 @@ class ChainSearchEngine: metadata = chunk.get("metadata") symbol_name = None symbol_kind = None - start_line = None - end_line = None + start_line = chunk.get("start_line") + end_line = chunk.get("end_line") if metadata: try: meta_dict = json.loads(metadata) if isinstance(metadata, str) else metadata symbol_name = meta_dict.get("symbol_name") symbol_kind = meta_dict.get("symbol_kind") - start_line = meta_dict.get("start_line") - end_line = meta_dict.get("end_line") + start_line = meta_dict.get("start_line", start_line) + end_line = meta_dict.get("end_line", end_line) except Exception: pass @@ -1130,10 +1130,11 @@ class ChainSearchEngine: coarse_results: List[SearchResult], index_root: Optional[Path], ) -> List[SearchResult]: - """Stage 2: LSP-based graph expansion using GraphExpander. + """Stage 2: LSP/graph expansion for staged cascade. - Expands coarse results with related symbols (definitions, references, - callers, callees) using precomputed graph neighbors. 
+ Supports two modes via Config.staged_stage2_mode: + - "precomputed" (default): GraphExpander over per-dir `graph_neighbors` table + - "realtime": on-demand graph expansion via live LSP servers (LspBridge + LspGraphBuilder) Args: coarse_results: Results from Stage 1 binary search @@ -1146,44 +1147,14 @@ class ChainSearchEngine: return coarse_results try: - from codexlens.search.graph_expander import GraphExpander - - # Get expansion depth from config - depth = 2 + mode = "precomputed" if self._config is not None: - depth = getattr(self._config, "graph_expansion_depth", 2) + mode = (getattr(self._config, "staged_stage2_mode", "precomputed") or "precomputed").strip().lower() - expander = GraphExpander(self.mapper, config=self._config) + if mode in {"realtime", "live"}: + return self._stage2_realtime_lsp_expand(coarse_results, index_root=index_root) - # Expand top results (limit expansion to avoid explosion) - max_expand = min(10, len(coarse_results)) - max_related = 50 - - related_results = expander.expand( - coarse_results, - depth=depth, - max_expand=max_expand, - max_related=max_related, - ) - - if related_results: - self.logger.debug( - "Stage 2 expanded %d base results to %d related symbols", - len(coarse_results), len(related_results) - ) - - # Combine: original results + related results - # Keep original results first (higher relevance) - combined = list(coarse_results) - seen_keys = {(r.path, r.symbol_name, r.start_line) for r in coarse_results} - - for related in related_results: - key = (related.path, related.symbol_name, related.start_line) - if key not in seen_keys: - seen_keys.add(key) - combined.append(related) - - return combined + return self._stage2_precomputed_graph_expand(coarse_results, index_root=index_root) except ImportError as exc: self.logger.debug("GraphExpander not available: %s", exc) @@ -1192,6 +1163,238 @@ class ChainSearchEngine: self.logger.debug("Stage 2 LSP expansion failed: %s", exc) return coarse_results + def _stage2_precomputed_graph_expand( + self, + coarse_results: List[SearchResult], + *, + index_root: Path, + ) -> List[SearchResult]: + """Stage 2 (precomputed): expand using GraphExpander over `graph_neighbors`.""" + from codexlens.search.graph_expander import GraphExpander + + depth = 2 + if self._config is not None: + depth = getattr( + self._config, + "staged_lsp_depth", + getattr(self._config, "graph_expansion_depth", 2), + ) + try: + depth = int(depth) + except Exception: + depth = 2 + + expander = GraphExpander(self.mapper, config=self._config) + + max_expand = min(10, len(coarse_results)) + max_related = 50 + + related_results = expander.expand( + coarse_results, + depth=depth, + max_expand=max_expand, + max_related=max_related, + ) + + if related_results: + self.logger.debug( + "Stage 2 (precomputed) expanded %d base results to %d related symbols", + len(coarse_results), len(related_results) + ) + + return self._combine_stage2_results(coarse_results, related_results) + + def _stage2_realtime_lsp_expand( + self, + coarse_results: List[SearchResult], + *, + index_root: Path, + ) -> List[SearchResult]: + """Stage 2 (realtime): compute expansion graph via live LSP servers.""" + import asyncio + from concurrent.futures import ThreadPoolExecutor + + from codexlens.hybrid_search.data_structures import CodeSymbolNode, Range + from codexlens.lsp import LspBridge, LspGraphBuilder + + max_depth = 2 + timeout_s = 10.0 + max_nodes = 100 + warmup_s = 2.0 + resolve_symbols = False + if self._config is not None: + max_depth = int(getattr(self._config, 
"staged_lsp_depth", 2) or 2) + timeout_s = float(getattr(self._config, "staged_realtime_lsp_timeout_s", 10.0) or 10.0) + max_nodes = int(getattr(self._config, "staged_realtime_lsp_max_nodes", 100) or 100) + warmup_s = float(getattr(self._config, "staged_realtime_lsp_warmup_s", 2.0) or 0.0) + resolve_symbols = bool(getattr(self._config, "staged_realtime_lsp_resolve_symbols", False)) + + try: + source_root = self.mapper.index_to_source(index_root) + except Exception: + source_root = Path(coarse_results[0].path).resolve().parent + + workspace_root = self._find_lsp_workspace_root(source_root) + + max_expand = min(10, len(coarse_results)) + seed_nodes: List[CodeSymbolNode] = [] + seed_ids: set[str] = set() + + for seed in list(coarse_results)[:max_expand]: + if not seed.path: + continue + name = seed.symbol_name or Path(seed.path).stem + kind = seed.symbol_kind or "unknown" + start_line = int(seed.start_line or 1) + end_line = int(seed.end_line or start_line) + start_character = 1 + try: + if seed.symbol_name and start_line >= 1: + line_text = Path(seed.path).read_text(encoding="utf-8", errors="ignore").splitlines()[start_line - 1] + idx = line_text.find(seed.symbol_name) + if idx >= 0: + start_character = idx + 1 # 1-based for StandaloneLspManager + except Exception: + start_character = 1 + node_id = f"{seed.path}:{name}:{start_line}" + seed_ids.add(node_id) + seed_nodes.append( + CodeSymbolNode( + id=node_id, + name=name, + kind=kind, + file_path=seed.path, + range=Range( + start_line=start_line, + start_character=start_character, + end_line=end_line, + end_character=1, + ), + raw_code=seed.content or "", + docstring=seed.excerpt or "", + ) + ) + + if not seed_nodes: + return coarse_results + + async def expand_graph(): + async with LspBridge(workspace_root=str(workspace_root), timeout=timeout_s) as bridge: + # Warm up analysis: open seed docs and wait a bit so references/call hierarchy are populated. 
+ if warmup_s > 0: + for seed in seed_nodes[:3]: + try: + await bridge.get_document_symbols(seed.file_path) + except Exception: + continue + try: + await asyncio.sleep(min(warmup_s, max(0.0, timeout_s - 0.5))) + except Exception: + pass + builder = LspGraphBuilder( + max_depth=max_depth, + max_nodes=max_nodes, + resolve_symbols=resolve_symbols, + ) + return await builder.build_from_seeds(seed_nodes, bridge) + + def run_coro_blocking(): + return asyncio.run(asyncio.wait_for(expand_graph(), timeout=timeout_s)) + + try: + try: + asyncio.get_running_loop() + has_running_loop = True + except RuntimeError: + has_running_loop = False + + if has_running_loop: + with ThreadPoolExecutor(max_workers=1) as executor: + graph = executor.submit(run_coro_blocking).result(timeout=timeout_s + 1.0) + else: + graph = run_coro_blocking() + except Exception as exc: + self.logger.debug("Stage 2 (realtime) expansion failed: %s", exc) + return coarse_results + + related_results: List[SearchResult] = [] + for node_id, node in getattr(graph, "nodes", {}).items(): + if node_id in seed_ids or getattr(node, "id", "") in seed_ids: + continue + + try: + start_line = int(getattr(node.range, "start_line", 1) or 1) + end_line = int(getattr(node.range, "end_line", start_line) or start_line) + except Exception: + start_line, end_line = 1, 1 + + related_results.append( + SearchResult( + path=node.file_path, + score=0.5, + excerpt=None, + content=getattr(node, "raw_code", "") or None, + symbol_name=node.name, + symbol_kind=node.kind, + start_line=start_line, + end_line=end_line, + metadata={"stage2_mode": "realtime", "lsp_node_id": node_id}, + ) + ) + + if related_results: + self.logger.debug( + "Stage 2 (realtime) expanded %d base results to %d related symbols", + len(coarse_results), len(related_results) + ) + + return self._combine_stage2_results(coarse_results, related_results) + + def _combine_stage2_results( + self, + coarse_results: List[SearchResult], + related_results: List[SearchResult], + ) -> List[SearchResult]: + combined = list(coarse_results) + seen_keys = {(r.path, r.symbol_name, r.start_line) for r in coarse_results} + + for related in related_results: + key = (related.path, related.symbol_name, related.start_line) + if key not in seen_keys: + seen_keys.add(key) + combined.append(related) + + return combined + + def _find_lsp_workspace_root(self, start_path: Path) -> Path: + """Best-effort workspace root selection for LSP initialization. + + Many language servers (e.g. Pyright) use workspace-relative include/exclude + patterns, so using a deep subdir (like "src") as root can break reference + and call-hierarchy queries. + """ + start = Path(start_path).resolve() + if start.is_file(): + start = start.parent + + # Prefer an explicit LSP config file in the workspace. + for current in [start, *list(start.parents)]: + try: + if (current / "lsp-servers.json").is_file(): + return current + except OSError: + continue + + # Fallback heuristics for project root markers. 
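+        # A .git directory or pyproject.toml marks a likely repository/package root;
+        # when neither is found, the original start directory is returned as-is.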
+ for current in [start, *list(start.parents)]: + try: + if (current / ".git").exists() or (current / "pyproject.toml").is_file(): + return current + except OSError: + continue + + return start + def _stage3_cluster_prune( self, expanded_results: List[SearchResult], diff --git a/codex-lens/tests/test_stage1_binary_search_uses_chunk_lines.py b/codex-lens/tests/test_stage1_binary_search_uses_chunk_lines.py new file mode 100644 index 00000000..92d0cf6b --- /dev/null +++ b/codex-lens/tests/test_stage1_binary_search_uses_chunk_lines.py @@ -0,0 +1,65 @@ +from __future__ import annotations + +from pathlib import Path +from unittest.mock import MagicMock, patch + +from codexlens.config import VECTORS_META_DB_NAME, Config +from codexlens.search.chain_search import ChainSearchEngine, SearchStats +from codexlens.storage.path_mapper import PathMapper +from codexlens.storage.registry import RegistryStore + + +def test_stage1_binary_search_prefers_chunk_start_line(tmp_path: Path) -> None: + registry = RegistryStore(db_path=tmp_path / "registry.db") + registry.initialize() + mapper = PathMapper(index_root=tmp_path / "indexes") + engine = ChainSearchEngine(registry, mapper, config=Config(data_dir=tmp_path / "data")) + + try: + index_root = tmp_path / "fake_index_root" + index_root.mkdir(parents=True, exist_ok=True) + index_db = index_root / "_index.db" + index_db.write_text("", encoding="utf-8") + (index_root / VECTORS_META_DB_NAME).write_text("", encoding="utf-8") + + class _DummyBinarySearcher: + def search(self, query_dense, top_k: int): + _ = query_dense + _ = top_k + return [(123, 10)] + + class _DummyEmbedder: + def embed_to_numpy(self, texts): + _ = texts + return [[0.0]] + + dummy_meta_store = MagicMock() + dummy_meta_store.get_chunks_by_ids.return_value = [ + { + "chunk_id": 123, + "file_path": str(tmp_path / "a.py"), + "content": "def a():\n return 1\n", + "start_line": 12, + "end_line": 14, + "metadata": {}, + "category": "code", + } + ] + + with patch.object(engine, "_get_centralized_binary_searcher", return_value=_DummyBinarySearcher()): + with patch("codexlens.search.chain_search.VectorMetadataStore", return_value=dummy_meta_store): + with patch("codexlens.semantic.embedder.Embedder", return_value=_DummyEmbedder()): + coarse_results, returned_root = engine._stage1_binary_search( + "a", + [index_db], + coarse_k=1, + stats=SearchStats(), + ) + + assert returned_root == index_root + assert len(coarse_results) == 1 + assert coarse_results[0].start_line == 12 + assert coarse_results[0].end_line == 14 + finally: + engine.close() + diff --git a/codex-lens/tests/test_staged_cascade_lsp_depth.py b/codex-lens/tests/test_staged_cascade_lsp_depth.py new file mode 100644 index 00000000..b7437ec8 --- /dev/null +++ b/codex-lens/tests/test_staged_cascade_lsp_depth.py @@ -0,0 +1,168 @@ +"""Regression tests for staged cascade Stage 2 expansion depth. + +Staged cascade is documented as: + coarse (binary) → LSP/graph expansion → clustering → optional rerank + +This test ensures Stage 2 respects Config.staged_lsp_depth (not unrelated +graph_expansion_depth settings). 
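+
+The fixture indexes a three-file call chain a() -> b() -> c(); with
+staged_lsp_depth=1 only the direct neighbor "b" should surface in Stage 2.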
+""" + +from __future__ import annotations + +import tempfile +from pathlib import Path +from unittest.mock import patch + +import pytest + +from codexlens.config import Config +from codexlens.entities import CodeRelationship, RelationshipType, SearchResult, Symbol +from codexlens.search.chain_search import ChainSearchEngine +from codexlens.storage.dir_index import DirIndexStore +from codexlens.storage.index_tree import _compute_graph_neighbors +from codexlens.storage.path_mapper import PathMapper +from codexlens.storage.registry import RegistryStore + + +@pytest.fixture() +def temp_paths() -> Path: + tmpdir = tempfile.TemporaryDirectory(ignore_cleanup_errors=True) + root = Path(tmpdir.name) + yield root + try: + tmpdir.cleanup() + except (PermissionError, OSError): + pass + + +def _create_index_with_neighbors(root: Path) -> tuple[PathMapper, Path, Path, str]: + project_root = root / "project" + project_root.mkdir(parents=True, exist_ok=True) + + index_root = root / "indexes" + mapper = PathMapper(index_root=index_root) + index_db_path = mapper.source_to_index_db(project_root) + index_db_path.parent.mkdir(parents=True, exist_ok=True) + + # Use 3 files so staged_cascade_search's final "deduplicate by path" step + # doesn't collapse all expanded symbols into a single file result. + content_a = "\n".join(["def a():", " b()", ""]) + content_b = "\n".join(["def b():", " c()", ""]) + content_c = "\n".join(["def c():", " return 1", ""]) + + file_a = project_root / "a.py" + file_b = project_root / "b.py" + file_c = project_root / "c.py" + file_a.write_text(content_a, encoding="utf-8") + file_b.write_text(content_b, encoding="utf-8") + file_c.write_text(content_c, encoding="utf-8") + + symbols_a = [Symbol(name="a", kind="function", range=(1, 2), file=str(file_a))] + symbols_b = [Symbol(name="b", kind="function", range=(1, 2), file=str(file_b))] + symbols_c = [Symbol(name="c", kind="function", range=(1, 2), file=str(file_c))] + + relationships_a = [ + CodeRelationship( + source_symbol="a", + target_symbol="b", + relationship_type=RelationshipType.CALL, + source_file=str(file_a), + target_file=str(file_b), + source_line=2, + ) + ] + relationships_b = [ + CodeRelationship( + source_symbol="b", + target_symbol="c", + relationship_type=RelationshipType.CALL, + source_file=str(file_b), + target_file=str(file_c), + source_line=2, + ) + ] + + config = Config(data_dir=root / "data") + store = DirIndexStore(index_db_path, config=config) + store.initialize() + store.add_file( + name=file_a.name, + full_path=file_a, + content=content_a, + language="python", + symbols=symbols_a, + relationships=relationships_a, + ) + store.add_file( + name=file_b.name, + full_path=file_b, + content=content_b, + language="python", + symbols=symbols_b, + relationships=relationships_b, + ) + store.add_file( + name=file_c.name, + full_path=file_c, + content=content_c, + language="python", + symbols=symbols_c, + relationships=[], + ) + _compute_graph_neighbors(store) + store.close() + + return mapper, project_root, file_a, content_a + + +def test_staged_cascade_stage2_uses_staged_lsp_depth(temp_paths: Path) -> None: + mapper, project_root, file_path, content = _create_index_with_neighbors(temp_paths) + index_db_path = mapper.source_to_index_db(project_root) + + registry = RegistryStore(db_path=temp_paths / "registry.db") + registry.initialize() + + # Intentionally conflicting depths: staged_lsp_depth should win for staged cascade. 
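+    # With depth 1 only the direct callee "b" is reachable from seed "a"; if the
+    # unrelated graph_expansion_depth=2 leaked into Stage 2, "c" would appear too.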
+ config = Config( + data_dir=temp_paths / "data", + staged_lsp_depth=1, + graph_expansion_depth=2, + enable_staged_rerank=False, + staged_clustering_strategy="noop", + ) + + engine = ChainSearchEngine(registry, mapper, config=config) + try: + base = SearchResult( + path=str(file_path.resolve()), + score=1.0, + excerpt="", + content=content, + start_line=1, + end_line=2, + symbol_name="a", + symbol_kind="function", + ) + + with patch("codexlens.search.chain_search.NUMPY_AVAILABLE", True): + with patch.object(engine, "_find_start_index", return_value=index_db_path): + with patch.object(engine, "_collect_index_paths", return_value=[index_db_path]): + # Bypass binary vector infrastructure; Stage 1 output is sufficient for Stage 2 behavior. + with patch.object( + engine, + "_stage1_binary_search", + return_value=([base], index_db_path.parent), + ): + result = engine.staged_cascade_search( + query="test", + source_path=project_root, + k=3, + coarse_k=10, + ) + + symbol_names = {r.symbol_name for r in result.results if r.symbol_name} + assert "b" in symbol_names + # With staged_lsp_depth=1, Stage 2 should NOT include 2-hop neighbor "c". + assert "c" not in symbol_names + finally: + engine.close() diff --git a/codex-lens/tests/test_staged_cascade_realtime_lsp.py b/codex-lens/tests/test_staged_cascade_realtime_lsp.py new file mode 100644 index 00000000..83fb6860 --- /dev/null +++ b/codex-lens/tests/test_staged_cascade_realtime_lsp.py @@ -0,0 +1,98 @@ +"""Unit tests for staged cascade Stage 2 realtime LSP graph expansion. + +These tests mock out the live LSP components (LspBridge + LspGraphBuilder) +so they can run without external language servers installed. +""" + +from __future__ import annotations + +from pathlib import Path +from unittest.mock import AsyncMock, MagicMock, patch + +from codexlens.config import Config +from codexlens.entities import SearchResult +from codexlens.hybrid_search.data_structures import CodeAssociationGraph, CodeSymbolNode, Range +from codexlens.search.chain_search import ChainSearchEngine +from codexlens.storage.path_mapper import PathMapper +from codexlens.storage.registry import RegistryStore + + +class _DummyBridge: + def __init__(self, *args, **kwargs) -> None: + pass + + async def get_document_symbols(self, file_path: str): + _ = file_path + return [] + + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc, tb) -> None: + return None + + +def test_stage2_realtime_mode_expands_and_combines(tmp_path: Path) -> None: + registry = RegistryStore(db_path=tmp_path / "registry.db") + registry.initialize() + mapper = PathMapper(index_root=tmp_path / "indexes") + + config = Config( + data_dir=tmp_path / "data", + staged_stage2_mode="realtime", + staged_lsp_depth=1, + staged_realtime_lsp_timeout_s=1.0, + staged_realtime_lsp_max_nodes=10, + staged_realtime_lsp_warmup_s=0.0, + ) + + engine = ChainSearchEngine(registry, mapper, config=config) + try: + coarse = [ + SearchResult( + path=str(tmp_path / "a.py"), + score=1.0, + excerpt="def a(): pass", + content="def a():\n pass\n", + symbol_name="a", + symbol_kind="function", + start_line=1, + end_line=2, + ) + ] + + graph = CodeAssociationGraph() + seed_id = f"{coarse[0].path}:a:1" + graph.nodes[seed_id] = CodeSymbolNode( + id=seed_id, + name="a", + kind="function", + file_path=coarse[0].path, + range=Range(start_line=1, start_character=1, end_line=2, end_character=1), + ) + related_id = f"{str(tmp_path / 'b.py')}:b:1" + graph.nodes[related_id] = CodeSymbolNode( + id=related_id, + name="b", + 
kind="function", + file_path=str(tmp_path / "b.py"), + range=Range(start_line=1, start_character=1, end_line=1, end_character=1), + raw_code="def b():\n return 1\n", + ) + + dummy_builder = MagicMock() + dummy_builder.build_from_seeds = AsyncMock(return_value=graph) + + with patch("codexlens.lsp.LspBridge", _DummyBridge): + with patch("codexlens.lsp.LspGraphBuilder", return_value=dummy_builder) as mock_builder: + # Avoid needing a real index_to_source mapping + engine.mapper.index_to_source = MagicMock(return_value=tmp_path) + expanded = engine._stage2_lsp_expand(coarse, index_root=tmp_path / "fake_index_root") + + assert mock_builder.call_args is not None + assert mock_builder.call_args.kwargs.get("resolve_symbols") is False + names = {r.symbol_name for r in expanded if r.symbol_name} + assert "a" in names + assert "b" in names + finally: + engine.close() diff --git a/codex-lens/tests/unit/lsp/test_lsp_edge_cases.py b/codex-lens/tests/unit/lsp/test_lsp_edge_cases.py index b816008f..3f0cd4b0 100644 --- a/codex-lens/tests/unit/lsp/test_lsp_edge_cases.py +++ b/codex-lens/tests/unit/lsp/test_lsp_edge_cases.py @@ -760,6 +760,24 @@ class TestLocationParsing: assert loc.line == 1 assert loc.character == 1 + def test_location_from_file_uri_windows_percent_encoded_drive(self): + """Parse Location from percent-encoded Windows drive URIs (pyright-style).""" + from codexlens.lsp.lsp_bridge import Location + + data = { + "uri": "file:///d%3A/Claude_dms3/codex-lens/src/codexlens/api/semantic.py", + "range": { + "start": {"line": 18, "character": 3}, + "end": {"line": 18, "character": 10}, + }, + } + + loc = Location.from_lsp_response(data) + + assert loc.file_path == "d:/Claude_dms3/codex-lens/src/codexlens/api/semantic.py" + assert loc.line == 19 # 0-based -> 1-based + assert loc.character == 4 + def test_location_from_direct_fields(self): """Parse Location from direct field format.""" from codexlens.lsp.lsp_bridge import Location