diff --git a/ccw/DUAL_FRONTEND_MIGRATION_PLAN.md b/ccw/DUAL_FRONTEND_MIGRATION_PLAN.md
deleted file mode 100644
index 908411c7..00000000
--- a/ccw/DUAL_FRONTEND_MIGRATION_PLAN.md
+++ /dev/null
@@ -1,272 +0,0 @@
# CCW Dual-Frontend Coexistence Migration Plan

## Goals
- Serve both the JS frontend (legacy) and the React frontend (new) through the `ccw view` command
- Enable incremental migration, moving features to React step by step
- Let users switch freely between the two frontends

## Architecture

```
┌─────────────────┐      ┌──────────────────┐
│   ccw view      │─────▶│   Node Server    │
│  (port 3456)    │      │      (3456)      │
└─────────────────┘      └────────┬─────────┘
                                  │
         ┌────────────────────────┼────────────────────────┐
         │                        │                        │
         ▼                        ▼                        ▼
┌─────────────────┐    ┌─────────────────┐    ┌─────────────────┐
│  JS Frontend    │    │ React Frontend  │    │     /api/*      │
│      (/)        │    │   (/react/*)    │    │    REST API     │
│  dashboard-js   │    │  Vite dev/prod  │    │                 │
└─────────────────┘    └─────────────────┘    └─────────────────┘
```

## Implementation

### Phase 1: Core infrastructure changes

#### 1.1 Modify `ccw/src/commands/serve.ts`

Add support for a `--frontend` flag:

```typescript
interface ServeOptions {
  port?: number;
  path?: string;
  host?: string;
  browser?: boolean;
  frontend?: 'js' | 'react' | 'both'; // new
}

// Handled inside serveCommand
export async function serveCommand(options: ServeOptions): Promise<void> {
  const frontend = options.frontend || 'js'; // default to the JS frontend

  if (frontend === 'react' || frontend === 'both') {
    // Start the React frontend dev server
    await startReactFrontend(port + 1); // React runs on port+1
  }

  // Start the main server
  const server = await startServer({
    port,
    host,
    initialPath,
    frontend // passed through to the server
  });
}
```

#### 1.2 Modify `ccw/src/core/server.ts`

Add routing support for the React frontend:

```typescript
// Added to the request routing
if (pathname === '/react' || pathname.startsWith('/react/')) {
  // Proxy to the React frontend
  const reactUrl = `http://localhost:${options.reactPort || port + 1}${pathname.replace('/react', '')}`;
  // Proxy the request via http-proxy or fetch
  proxyToReact(req, res, reactUrl);
  return;
}

// The root path serves the configured default frontend
if (pathname === '/' || pathname === '/index.html') {
  if (options.frontend === 'react') {
    res.writeHead(302, { Location: '/react' });
    res.end();
    return;
  }
  // Default: JS frontend
  const html = generateServerDashboard(initialPath);
  res.writeHead(200, { 'Content-Type': 'text/html; charset=utf-8' });
  res.end(html);
  return;
}
```
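The handler above calls `proxyToReact()` without defining it. A minimal sketch of what such a helper could look like, using only Node's built-in `http` module; the name and call shape come from the snippet above, the body is illustrative rather than the actual implementation:

```typescript
import http from 'http';
import type { IncomingMessage, ServerResponse } from 'http';

// Hypothetical proxy helper: forwards the incoming request to the React dev
// server and pipes the response back unchanged.
function proxyToReact(req: IncomingMessage, res: ServerResponse, reactUrl: string): void {
  const target = new URL(reactUrl);
  const proxyReq = http.request(
    {
      hostname: target.hostname,
      port: target.port,
      path: target.pathname + target.search,
      method: req.method,
      headers: req.headers,
    },
    (proxyRes) => {
      res.writeHead(proxyRes.statusCode ?? 502, proxyRes.headers);
      proxyRes.pipe(res);
    },
  );
  proxyReq.on('error', () => {
    // Surface a clear error when the React dev server is not running.
    res.writeHead(502, { 'Content-Type': 'text/plain' });
    res.end('React frontend is not reachable');
  });
  req.pipe(proxyReq);
}
```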
#### 1.3 Create `ccw/src/utils/react-frontend.ts`

```typescript
import { spawn, type ChildProcess } from 'child_process';
import { join } from 'path';
import chalk from 'chalk';

let reactProcess: ChildProcess | null = null;

export async function startReactFrontend(port: number): Promise<void> {
  const frontendDir = join(process.cwd(), 'frontend');

  console.log(chalk.cyan(`  Starting React frontend on port ${port}...`));

  reactProcess = spawn('npm', ['run', 'dev', '--', '--port', port.toString()], {
    cwd: frontendDir,
    stdio: 'pipe',
    shell: true
  });

  // Wait for the dev server to come up
  return new Promise((resolve, reject) => {
    let output = '';

    const timeout = setTimeout(() => {
      reject(new Error('React frontend startup timeout'));
    }, 30000);

    reactProcess?.stdout?.on('data', (data) => {
      output += data.toString();
      if (output.includes('Local:') || output.includes('ready')) {
        clearTimeout(timeout);
        console.log(chalk.green(`  React frontend ready at http://localhost:${port}`));
        resolve();
      }
    });

    reactProcess?.stderr?.on('data', (data) => {
      console.error(chalk.yellow(`  React: ${data.toString().trim()}`));
    });

    reactProcess?.on('error', (err) => {
      clearTimeout(timeout);
      reject(err);
    });
  });
}

export function stopReactFrontend(): void {
  if (reactProcess) {
    reactProcess.kill('SIGTERM');
    reactProcess = null;
  }
}
```

### Phase 2: React frontend adaptation

#### 2.1 Modify `ccw/frontend/vite.config.ts`

Add a base path:

```typescript
export default defineConfig({
  plugins: [react()],
  base: '/react/', // base path for the proxied route
  resolve: {
    alias: {
      '@': path.resolve(__dirname, './src'),
    },
  },
  server: {
    port: 5173,
    proxy: {
      '/api': {
        target: 'http://localhost:3456',
        changeOrigin: true,
      },
      '/ws': {
        target: 'ws://localhost:3456',
        ws: true,
      },
    },
  },
  // ...
})
```

#### 2.2 Create a frontend switcher component

Add a switch button to the JS frontend (`ccw/src/templates/dashboard-js/components/react-switch.js`):

```javascript
// Add a switch button to the navbar
function addReactSwitchButton() {
  const nav = document.querySelector('.navbar');
  if (!nav) return;

  const switchBtn = document.createElement('button');
  switchBtn.className = 'btn btn-sm btn-outline-primary ml-2';
  switchBtn.innerHTML = '⚛️ React version';
  switchBtn.title = 'Switch to the React frontend';
  switchBtn.onclick = () => {
    window.location.href = '/react';
  };

  nav.appendChild(switchBtn);
}

// Initialize
document.addEventListener('DOMContentLoaded', addReactSwitchButton);
```

### Phase 3: CLI interface

#### 3.1 Modify `ccw/src/cli.ts`

Add the `--frontend` option:

```typescript
// View command
program
  .command('view')
  .description('Open workflow dashboard server with live path switching')
  .option('-p, --path <path>', 'Path to project directory', '.')
  .option('--port <port>', 'Server port', '3456')
  .option('--host <host>', 'Server host to bind', '127.0.0.1')
  .option('--no-browser', 'Start server without opening browser')
  .option('--frontend <type>', 'Frontend type: js, react, both', 'js') // new
  .action(viewCommand);
```

### Usage

#### 1. Default JS frontend (backward compatible)
```bash
ccw view
# or explicitly
ccw view --frontend js
```

#### 2. React frontend
```bash
ccw view --frontend react
# The React frontend is served at http://localhost:3456/react
```

#### 3. Both frontends at once (development/debugging)
```bash
ccw view --frontend both
# JS:    http://localhost:3456
# React: http://localhost:3456/react (dev mode) or 5173
```

## Migration roadmap

```
Phase 1: Infrastructure (1-2 weeks)
  ├── Add --frontend flag support
  ├── Implement React frontend proxying
  └── Basic switching

Phase 2: Feature migration (4-8 weeks)
  ├── Migrate feature modules to React one by one
  ├── Keep the JS frontend stable
  └── Add feature flags

Phase 3: Default switch (2 weeks)
  ├── React becomes the default frontend
  ├── JS frontend enters maintenance mode
  └── Publish a migration notice

Phase 4: Full migration (optional)
  ├── Remove the JS frontend
  └── React becomes the only frontend
```

Advantages of this plan:
1. **Backward compatible**: default behavior is unchanged, so existing users notice nothing
2. **Incremental migration**: features can move to React one at a time
3. **Flexible switching**: users and developers can switch frontends at any time
4. **Parallel development**: both frontends can be developed and debugged side by side
\ No newline at end of file
diff --git a/ccw/EMBEDDINGS_FIX_SUMMARY.md b/ccw/EMBEDDINGS_FIX_SUMMARY.md
deleted file mode 100644
index 968ac66e..00000000
--- a/ccw/EMBEDDINGS_FIX_SUMMARY.md
+++ /dev/null
@@ -1,165 +0,0 @@
# CodexLens Embeddings Fix Summary

## Results

### ✅ Completed

1. **Recursive embeddings generation** (`embedding_manager.py`)
   - Added a `generate_embeddings_recursive()` function
   - Added a `get_embeddings_status()` function
   - Recursively processes the `_index.db` files of all subdirectories

2. **CLI command enhancements** (`commands.py`)
   - `embeddings-generate` gained a `--recursive` flag
   - `init` now generates recursively (automatically covers all subdirectories)
   - `status` now reports embeddings coverage statistics

3. **Smart Search intelligent routing** (`smart-search.ts`, sketched after this list)
   - Added a 50% coverage threshold
   - Automatically falls back to exact mode when embeddings are insufficient
   - Emits an explicit warning when falling back
   - Strips ANSI color codes so JSON output parses correctly
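A minimal sketch of the routing rule and ANSI handling described above; the helper names are illustrative, not the actual `smart-search.ts` implementation:

```typescript
// CodexLens CLI output may contain ANSI color codes; strip them before JSON.parse.
const ANSI_PATTERN = /\u001b\[[0-9;]*m/g;

function parseCodexLensJson(raw: string): unknown {
  return JSON.parse(raw.replace(ANSI_PATTERN, ''));
}

// Apply the 50% embeddings-coverage threshold: hybrid requests degrade to
// exact mode with a warning when coverage is too low.
function chooseSearchMode(
  coveragePercent: number,
  requested: 'hybrid' | 'exact',
): 'hybrid' | 'exact' {
  const COVERAGE_THRESHOLD = 50; // threshold from the fix description above
  if (requested === 'hybrid' && coveragePercent < COVERAGE_THRESHOLD) {
    console.warn(
      `[smart-search] embeddings coverage ${coveragePercent}% < ${COVERAGE_THRESHOLD}%, falling back to exact mode`,
    );
    return 'exact';
  }
  return requested;
}
```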
### ✅ Test results

**CCW project (d:\Claude_dms3\ccw)**:
- Index databases: 26
- Total files: 303
- Embeddings coverage: **100%** (all 303 files)
- Generated chunks: **2,042** (previously only 10)

**Before/after**:

| Metric | Before | After | Improvement |
|--------|--------|-------|-------------|
| Coverage | 1.6% (5/303) | 100% (303/303) | **62.5x** |
| Chunks | 10 | 2,042 | **204x** |
| Indexes with embeddings | 1/26 | 26/26 | **26x** |

## Open issues

### ⚠️ Remaining problems

1. **Path mapping**
   - `embeddings-generate --recursive` requires the index path rather than the source path
   - Users should be able to pass the source path (`d:\Claude_dms3\ccw`)
   - Currently they must use: `C:\Users\dyw\.codexlens\indexes\D\Claude_dms3\ccw`

2. **Global vs. project-level `status`**
   - `codexlens status` returns global statistics (all projects)
   - A project-level embeddings status is needed
   - `embeddings-status` only inspects a single `_index.db` and does not recurse

## Suggested follow-up fixes

### P1 - Path mapping fix

Modify the `embeddings_generate` command in `commands.py` (lines 1996-2000):

```python
elif target_path.is_dir():
    if recursive:
        # Recursive mode: Map source path to index root
        registry = RegistryStore()
        try:
            registry.initialize()
            mapper = PathMapper()
            index_db_path = mapper.source_to_index_db(target_path)
            index_root = index_db_path.parent  # Use index directory root
            use_recursive = True
        finally:
            registry.close()
```

### P2 - Project-level status

Option A: extend the `embeddings-status` command with recursion
```bash
codexlens embeddings-status . --recursive --json
```

Option B: let the `status` command accept a path argument
```bash
codexlens status --project . --json
```

## Usage guide

### Current workflow

**Generate embeddings (full coverage)**:
```bash
# Method 1: use the index path (works today)
cd C:\Users\dyw\.codexlens\indexes\D\Claude_dms3\ccw
python -m codexlens embeddings-generate . --recursive --force --model fast

# Method 2: the init command (recursive by default, recommended)
cd d:\Claude_dms3\ccw
python -m codexlens init . --force
```

**Check coverage**:
```bash
# Project root
cd C:\Users\dyw\.codexlens\indexes\D\Claude_dms3\ccw
python check_embeddings.py  # detailed per-directory statistics

# Global status
python -m codexlens status --json  # summary across all projects
```

**Smart Search**:
```javascript
// MCP tool call
smart_search(query="authentication patterns")

// Now it will:
// 1. Check embeddings coverage
// 2. Use hybrid mode if coverage >= 50%
// 3. Fall back to exact mode if coverage < 50%
// 4. Print a warning when falling back
```

### Best practices

1. **Generate embeddings automatically when initializing a project**:
   ```bash
   codexlens init /path/to/project --force
   ```

2. **Regenerate periodically to stay current**:
   ```bash
   codexlens embeddings-generate /index/path --recursive --force
   ```

3. **Use the fast model for quick experiments**:
   ```bash
   codexlens embeddings-generate . --recursive --model fast
   ```

4. **Use the code model for best quality**:
   ```bash
   codexlens embeddings-generate . --recursive --model code
   ```

## Technical details

### Changed files

**Python (CodexLens)**:
- `codex-lens/src/codexlens/cli/embedding_manager.py` - added the recursive functions
- `codex-lens/src/codexlens/cli/commands.py` - updated init, status, embeddings-generate

**TypeScript (CCW)**:
- `ccw/src/tools/smart-search.ts` - intelligent routing + ANSI stripping
- `ccw/src/tools/codex-lens.ts` - (unchanged, uses the existing implementation)

### Dependency versions

- CodexLens: current development version
- Fastembed: installed (ONNX backend)
- Models: fast (~80MB), code (~150MB)

---

**Fixed**: 2025-12-17
**Verification status**: ✅ core functionality works; the remaining path-mapping issue is still open
diff --git a/ccw/IMPLEMENTATION_SUMMARY.md b/ccw/IMPLEMENTATION_SUMMARY.md
deleted file mode 100644
index c84ee244..00000000
--- a/ccw/IMPLEMENTATION_SUMMARY.md
+++ /dev/null
@@ -1,297 +0,0 @@
# Hook Integration Implementation Summary

## Overview

The hook system is now integrated with the session-start progressive disclosure index.

## Modified files

### 1. `ccw/src/core/routes/hooks-routes.ts`
**Changes**:
- The `/api/hook` POST endpoint now handles the `session-start` and `context` hook types
- Integrates `SessionClusteringService` to generate the progressive disclosure index
- Implements fail-silent error handling

**Key code**:
```typescript
// Handle context hooks (session-start, context)
if (type === 'session-start' || type === 'context') {
  try {
    const projectPath = url.searchParams.get('path') || initialPath;
    const { SessionClusteringService } = await import('../session-clustering-service.js');
    const clusteringService = new SessionClusteringService(projectPath);

    const format = url.searchParams.get('format') || 'markdown';
    const index = await clusteringService.getProgressiveIndex(resolvedSessionId);

    return {
      success: true,
      type: 'context',
      format,
      content: index,
      sessionId: resolvedSessionId
    };
  } catch (error) {
    console.error('[Hooks] Failed to generate context:', error);
    return {
      success: true,
      type: 'context',
      format: 'markdown',
      content: '',
      sessionId: resolvedSessionId,
      error: (error as Error).message
    };
  }
}
```

### 2. `ccw/src/core/session-clustering-service.ts`

**Changes**:
- Improved the output format of `getProgressiveIndex()`
- Renamed the heading to "Related Sessions Index" (per the task requirements)
- Improved the timeline to show the three most recent sessions
- Unified the command section under "Resume Commands"

**Key improvement**:
```typescript
// Generate timeline - show multiple recent sessions
let timeline = '';
if (members.length > 0) {
  const timelineEntries: string[] = [];
  const displayCount = Math.min(members.length, 3); // Show last 3 sessions

  for (let i = members.length - displayCount; i < members.length; i++) {
    const member = members[i];
    const date = member.created_at ? new Date(member.created_at).toLocaleDateString() : '';
    const title = member.title?.substring(0, 30) || 'Untitled';
    const isCurrent = i === members.length - 1;
    const marker = isCurrent ? ' ← Current' : '';
    timelineEntries.push(`${date} ─●─ ${member.session_id} (${title})${marker}`);
  }

  timeline = `\`\`\`\n${timelineEntries.join('\n │\n')}\n\`\`\``;
}
```

### 3. `ccw/src/commands/core-memory.ts`

**Changes**:
- Fixed a TypeScript type error
- Added an explicit type annotation `'all' | 'recent' | 'unclustered'` to the `scope` variable

## New files

### 1. `ccw/src/templates/hooks-config-example.json`

An example hooks configuration showing how to configure the various hook types:

- `session-start`: progressive disclosure hook
- `session-end`: update cluster metadata
- `file-modified`: auto-commit checkpoint
- `context-request`: dynamic context provider

### 2. `ccw/docs/hooks-integration.md`

Complete hook integration documentation, covering:

- Feature overview
- Configuration
- API endpoint reference
- Output format
- Usage examples
- Troubleshooting
- Performance considerations
- Planned enhancements

### 3. `ccw/test-hooks.js`

Hook test script:

- Tests the `session-start` hook
- Tests the `context` hook
- Validates the response format
- Prints detailed test output

## Features

### ✅ Implemented

1. **Context hook handling**
   - Supports both the `session-start` and `context` hook types
   - Calls `SessionClusteringService.getProgressiveIndex()` to generate context
   - Returns a structured Markdown index

2. **Fail-silent error handling** (see the client sketch after this list)
   - All errors are caught and logged
   - Failures return empty content instead of blocking session startup
   - Completes within a 5-second budget

3. **Progressive disclosure index**
   - Shows active cluster information (name, intent, member count)
   - Tabulates related sessions (session ID, type, summary, token count)
   - Provides resume commands (load session, load cluster)
   - Visualizes a timeline of the three most recent sessions

4. **Flexible configuration**
   - Hooks are configured via `.claude/settings.json`
   - Multiple hook types and handlers are supported
   - Timeout and failure mode are configurable
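An illustrative client for the hook endpoint, demonstrating the 5-second budget and silent-failure contract; the endpoint URL, payload, and response fields are taken from the API section below, while the function itself is a hypothetical sketch:

```typescript
// Trigger the session-start hook and return the progressive index, or '' on
// any failure, so a broken hook can never block session startup.
async function triggerSessionStartHook(sessionId: string): Promise<string> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), 5000); // 5s budget
  try {
    const res = await fetch('http://localhost:3456/api/hook', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ type: 'session-start', sessionId }),
      signal: controller.signal,
    });
    const data = (await res.json()) as { content?: string };
    return data.content ?? '';
  } catch {
    return ''; // fail silently, matching failMode: "silent"
  } finally {
    clearTimeout(timer);
  }
}
```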
### 📋 Configuration format

```json
{
  "hooks": {
    "session-start": [
      {
        "name": "Progressive Disclosure",
        "description": "Injects progressive disclosure index at session start",
        "enabled": true,
        "handler": "internal:context",
        "timeout": 5000,
        "failMode": "silent"
      }
    ]
  }
}
```

### 📊 Example output

````markdown

## 📋 Related Sessions Index

### 🔗 Active Cluster: auth-implementation (3 sessions)
**Intent**: Implement authentication system

| # | Session | Type | Summary | Tokens |
|---|---------|------|---------|--------|
| 1 | WFS-001 | Workflow | Create auth module | ~1200 |
| 2 | CLI-002 | CLI | Add JWT validation | ~800 |
| 3 | WFS-003 | Workflow | OAuth2 integration | ~1500 |

**Resume Commands**:
```bash
# Load specific session
ccw core-memory load WFS-003

# Load entire cluster context
ccw core-memory load-cluster cluster-001
```

### 📊 Timeline
```
2024-12-16 ─●─ CLI-002 (Add JWT validation)
 │
2024-12-17 ─●─ WFS-003 (OAuth2 integration) ← Current
```

---
**Tip**: Use `ccw core-memory search <query>` to find more sessions
````

## API usage

### Triggering a hook

```bash
POST http://localhost:3456/api/hook
Content-Type: application/json

{
  "type": "session-start",
  "sessionId": "WFS-20241218-001"
}
```

### Response format

```json
{
  "success": true,
  "type": "context",
  "format": "markdown",
  "content": "...",
  "sessionId": "WFS-20241218-001"
}
```

## Testing

### Running the tests

```bash
# Start the CCW server
ccw server

# Run the tests in another terminal
node ccw/test-hooks.js
```

### Manual testing

```bash
# Test with curl
curl -X POST http://localhost:3456/api/hook \
  -H "Content-Type: application/json" \
  -d '{"type":"session-start","sessionId":"test-001"}'

# Via the ccw CLI (if the command exists)
ccw core-memory context --format markdown
```

## Notes

1. **Timeout**: hooks must complete within 5 seconds or they are terminated
2. **Failure mode**: defaults to `silent`, so a failing hook never blocks the main flow
3. **Performance**: uses cached metadata to avoid full session parsing
4. **Error handling**: all errors are caught and handled silently

## Future enhancements

- [ ] Dynamic cluster updates (live updates while a session runs)
- [ ] Multi-cluster support (show sessions from several related clusters)
- [ ] Relevance scoring (rank sessions by relevance to the current task)
- [ ] Token budget calculation (estimate total token usage of loaded context)
- [ ] Hook chains (run multiple hooks in sequence)
- [ ] Conditional hooks (run hooks based on project state)

## Documentation

- **Guide**: `ccw/docs/hooks-integration.md`
- **Config example**: `ccw/src/templates/hooks-config-example.json`
- **Test script**: `ccw/test-hooks.js`

## Build status

✅ TypeScript compiles
✅ All type errors fixed
✅ Code comments in English
✅ Conforms to project coding conventions

## Suggested commit message

```
feat: Add hooks integration for progressive disclosure

- Implement session-start and context hook handlers
- Integrate SessionClusteringService for context generation
- Add silent failure handling (< 5s timeout)
- Create hooks configuration example
- Add comprehensive documentation
- Include test script for hook verification

Changes:
- hooks-routes.ts: Add context hook processing
- session-clustering-service.ts: Enhance getProgressiveIndex output
- core-memory.ts: Fix TypeScript type error

New files:
- docs/hooks-integration.md: Complete integration guide
- src/templates/hooks-config-example.json: Configuration template
- test-hooks.js: Hook testing script
```
diff --git a/ccw/LITELLM_INTEGRATION.md b/ccw/LITELLM_INTEGRATION.md
deleted file mode 100644
index 4312a12e..00000000
--- a/ccw/LITELLM_INTEGRATION.md
+++ /dev/null
@@ -1,308 +0,0 @@
# LiteLLM Integration Guide

## Overview

CCW now supports custom LiteLLM endpoints with integrated context caching. You can configure multiple providers (OpenAI, Anthropic, Ollama, etc.)
and create custom endpoints with file-based caching strategies.

## Architecture

```
┌─────────────────────────────────────────────────────────────┐
│                        CLI Executor                         │
│                                                             │
│  ┌─────────────┐         ┌──────────────────────────────┐   │
│  │  --model    │────────>│  Route Decision:             │   │
│  │  flag       │         │  - gemini/qwen/codex → CLI   │   │
│  └─────────────┘         │  - custom ID → LiteLLM       │   │
│                          └──────────────────────────────┘   │
└─────────────────────────────────────────────────────────────┘
                              │
                              ▼
┌─────────────────────────────────────────────────────────────┐
│                      LiteLLM Executor                       │
│                                                             │
│  1. Load endpoint config (litellm-api-config.json)          │
│  2. Extract @patterns from prompt                           │
│  3. Pack files via context-cache                            │
│  4. Call LiteLLM client with cached content + prompt        │
│  5. Return result                                           │
└─────────────────────────────────────────────────────────────┘
```

## Configuration

### File Location

Configuration is stored per-project:
```
<project>/.ccw/storage/config/litellm-api-config.json
```

### Configuration Structure

```json
{
  "version": 1,
  "providers": [
    {
      "id": "openai-1234567890",
      "name": "My OpenAI",
      "type": "openai",
      "apiKey": "${OPENAI_API_KEY}",
      "enabled": true,
      "createdAt": "2025-01-01T00:00:00.000Z",
      "updatedAt": "2025-01-01T00:00:00.000Z"
    }
  ],
  "endpoints": [
    {
      "id": "my-gpt4o",
      "name": "GPT-4o with Context Cache",
      "providerId": "openai-1234567890",
      "model": "gpt-4o",
      "description": "GPT-4o with automatic file caching",
      "cacheStrategy": {
        "enabled": true,
        "ttlMinutes": 60,
        "maxSizeKB": 512,
        "filePatterns": ["*.md", "*.ts", "*.js"]
      },
      "enabled": true,
      "createdAt": "2025-01-01T00:00:00.000Z",
      "updatedAt": "2025-01-01T00:00:00.000Z"
    }
  ],
  "defaultEndpoint": "my-gpt4o",
  "globalCacheSettings": {
    "enabled": true,
    "cacheDir": "~/.ccw/cache/context",
    "maxTotalSizeMB": 100
  }
}
```

## Usage

### Via CLI

```bash
# Use custom endpoint with --model flag
ccw cli -p "Analyze authentication flow" --tool litellm --model my-gpt4o

# With context patterns (automatically cached)
ccw cli -p "@src/auth/**/*.ts Review security" --tool litellm --model my-gpt4o

# Disable caching for specific call
ccw cli -p "Quick question" --tool litellm --model my-gpt4o --no-cache
```

### Via Dashboard API

#### Create Provider
```bash
curl -X POST http://localhost:3000/api/litellm-api/providers \
  -H "Content-Type: application/json" \
  -d '{
    "name": "My OpenAI",
    "type": "openai",
    "apiKey": "${OPENAI_API_KEY}",
    "enabled": true
  }'
```

#### Create Endpoint
```bash
curl -X POST http://localhost:3000/api/litellm-api/endpoints \
  -H "Content-Type: application/json" \
  -d '{
    "id": "my-gpt4o",
    "name": "GPT-4o with Cache",
    "providerId": "openai-1234567890",
    "model": "gpt-4o",
    "cacheStrategy": {
      "enabled": true,
      "ttlMinutes": 60,
      "maxSizeKB": 512,
      "filePatterns": ["*.md", "*.ts"]
    },
    "enabled": true
  }'
```

#### Test Provider Connection
```bash
curl -X POST http://localhost:3000/api/litellm-api/providers/openai-1234567890/test
```

## Context Caching

### How It Works

1. **Pattern Detection**: LiteLLM executor scans the prompt for `@patterns`
   ```
   @src/**/*.ts
   @CLAUDE.md
   @../shared/**/*
   ```

2. **File Packing**: Files matching patterns are packed via the `context-cache` tool
   - Respects the `max_file_size` limit (default: 1MB per file)
   - Applies TTL from the endpoint config
   - Generates a session ID for retrieval
3. **Cache Integration**: Cached content is prepended to the prompt
   ```
   <cached file contents>

   ---

   <original prompt>
   ```

4. **LLM Call**: Combined prompt sent to LiteLLM with provider credentials

### Cache Strategy Configuration

```typescript
interface CacheStrategy {
  enabled: boolean;        // Enable/disable caching for this endpoint
  ttlMinutes: number;      // Cache lifetime (default: 60)
  maxSizeKB: number;       // Max cache size (default: 512KB)
  filePatterns: string[];  // Glob patterns to cache
}
```

### Example: Security Audit with Cache

```bash
ccw cli -p "
PURPOSE: OWASP Top 10 security audit of authentication module
TASK: • Check SQL injection • Verify session management • Test XSS vectors
CONTEXT: @src/auth/**/*.ts @src/middleware/auth.ts
EXPECTED: Security report with severity levels and remediation steps
" --tool litellm --model my-security-scanner --mode analysis
```

**What happens:**
1. Executor detects `@src/auth/**/*.ts` and `@src/middleware/auth.ts`
2. Packs matching files into the context cache
3. Cache entry valid for 60 minutes (per endpoint config)
4. Subsequent calls reuse cached files (no re-packing)
5. LiteLLM receives full context without manual file specification

## Environment Variables

### Provider API Keys

LiteLLM uses standard environment variable names:

| Provider | Env Var Name |
|----------|--------------|
| OpenAI | `OPENAI_API_KEY` |
| Anthropic | `ANTHROPIC_API_KEY` |
| Google | `GOOGLE_API_KEY` |
| Azure | `AZURE_API_KEY` |
| Mistral | `MISTRAL_API_KEY` |
| DeepSeek | `DEEPSEEK_API_KEY` |

### Configuration Syntax

Use `${ENV_VAR}` syntax in the config:
```json
{
  "apiKey": "${OPENAI_API_KEY}"
}
```

The executor resolves these at runtime via `resolveEnvVar()`.
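A possible shape for `resolveEnvVar()`; the function name and the `${ENV_VAR}` substitution behavior come from this guide, but the body below is only a sketch of the documented behavior, not the executor's actual source:

```typescript
// Resolve every ${ENV_VAR} placeholder in a config value against process.env.
// Throws with the documented "API key not configured" message when unset.
function resolveEnvVar(value: string): string {
  return value.replace(/\$\{([A-Z0-9_]+)\}/g, (_match, name: string) => {
    const resolved = process.env[name];
    if (resolved === undefined) {
      throw new Error(`API key not configured: environment variable ${name} is not set`);
    }
    return resolved;
  });
}
```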
## API Reference

### Config Manager (`litellm-api-config-manager.ts`)

#### Provider Management
```typescript
getAllProviders(baseDir: string): ProviderCredential[]
getProvider(baseDir: string, providerId: string): ProviderCredential | null
getProviderWithResolvedEnvVars(baseDir: string, providerId: string): ProviderCredential & { resolvedApiKey: string } | null
addProvider(baseDir: string, providerData): ProviderCredential
updateProvider(baseDir: string, providerId: string, updates): ProviderCredential
deleteProvider(baseDir: string, providerId: string): boolean
```

#### Endpoint Management
```typescript
getAllEndpoints(baseDir: string): CustomEndpoint[]
getEndpoint(baseDir: string, endpointId: string): CustomEndpoint | null
findEndpointById(baseDir: string, endpointId: string): CustomEndpoint | null
addEndpoint(baseDir: string, endpointData): CustomEndpoint
updateEndpoint(baseDir: string, endpointId: string, updates): CustomEndpoint
deleteEndpoint(baseDir: string, endpointId: string): boolean
```

### Executor (`litellm-executor.ts`)

```typescript
interface LiteLLMExecutionOptions {
  prompt: string;
  endpointId: string;
  baseDir: string;
  cwd?: string;
  includeDirs?: string[];
  enableCache?: boolean;
  onOutput?: (data: { type: string; data: string }) => void;
}

interface LiteLLMExecutionResult {
  success: boolean;
  output: string;
  model: string;
  provider: string;
  cacheUsed: boolean;
  cachedFiles?: string[];
  error?: string;
}

executeLiteLLMEndpoint(options: LiteLLMExecutionOptions): Promise<LiteLLMExecutionResult>
extractPatterns(prompt: string): string[]
```

## Dashboard Integration

The dashboard provides UI for managing the LiteLLM configuration:

- **Providers**: Add/edit/delete provider credentials
- **Endpoints**: Configure custom endpoints with cache strategies
- **Cache Stats**: View cache usage and clear entries
- **Test Connections**: Verify provider API access

Routes are handled by `litellm-api-routes.ts`.

## Limitations

1. **Python Dependency**: Requires the `ccw-litellm` Python package to be installed
2. **Model Support**: Limited to models supported by the LiteLLM library
3. **Cache Scope**: Context cache is in-memory (not persisted across restarts)
4. **Pattern Syntax**: Only supports glob-style `@patterns`, not regex

## Troubleshooting

### Error: "Endpoint not found"
- Verify the endpoint ID matches the config file
- Check that `litellm-api-config.json` exists in `.ccw/storage/config/`

### Error: "API key not configured"
- Ensure the environment variable is set
- Verify the `${ENV_VAR}` syntax in the config
- Test with `echo $OPENAI_API_KEY`

### Error: "Failed to spawn Python process"
- Install ccw-litellm: `pip install ccw-litellm`
- Verify Python is accessible: `python --version`

### Cache Not Applied
- Check the endpoint has `cacheStrategy.enabled: true`
- Verify the prompt contains `@patterns`
- Check the cache TTL hasn't expired

## Examples

See `examples/litellm-config.json` for a complete configuration template.
diff --git a/ccw/MCP_QUICKSTART.md b/ccw/MCP_QUICKSTART.md
deleted file mode 100644
index 52a4fd85..00000000
--- a/ccw/MCP_QUICKSTART.md
+++ /dev/null
@@ -1,61 +0,0 @@
# MCP Server Quick Start

This is a quick reference for using CCW as an MCP server with Claude Desktop.

## Quick Setup

1. Ensure CCW is installed:
```bash
npm install -g ccw
```

2. Add to the Claude Desktop config (`~/Library/Application Support/Claude/claude_desktop_config.json` on macOS or `%APPDATA%\Claude\claude_desktop_config.json` on Windows):

```json
{
  "mcpServers": {
    "ccw-tools": {
      "command": "ccw-mcp",
      "args": []
    }
  }
}
```

3. Restart Claude Desktop

## Available Tools

Once configured, Claude Desktop can use these CCW tools:

- **File Operations**: `edit_file`, `write_file`
- **Code Analysis**: `smart_search`, `get_modules_by_depth`, `classify_folders`
- **Git Integration**: `detect_changed_modules`
- **Session Management**: `session_manager`
- **UI/Design**: `discover_design_files`, `ui_generate_preview`, `convert_tokens_to_css`
- **Documentation**: `generate_module_docs`, `update_module_claude`

## Example Usage in Claude Desktop

```
"Use edit_file to update the version in package.json"

"Use smart_search to find authentication logic"

"Use get_modules_by_depth to show me the project structure"
```

## Full Documentation

See [MCP_SERVER.md](./MCP_SERVER.md) for complete documentation including:
- Detailed tool descriptions
- Configuration options
- Troubleshooting guide
- Development guidelines

## Testing

Run the MCP server tests:
```bash
npm run test:mcp
```
diff --git a/ccw/MCP_SERVER.md b/ccw/MCP_SERVER.md
deleted file mode 100644
index d7d4764b..00000000
--- a/ccw/MCP_SERVER.md
+++ /dev/null
@@ -1,148 +0,0 @@
# CCW MCP Server

The CCW MCP Server exposes CCW tools through the Model Context Protocol, allowing Claude Desktop and other MCP clients to access CCW functionality.

## Installation

1. Install CCW globally or link it locally:
```bash
npm install -g ccw
# or
npm link
```

2. The MCP server executable is available as `ccw-mcp`.
## Configuration

### Claude Desktop Configuration

Add this to your Claude Desktop MCP settings file:

**macOS**: `~/Library/Application Support/Claude/claude_desktop_config.json`
**Windows**: `%APPDATA%\Claude\claude_desktop_config.json`

```json
{
  "mcpServers": {
    "ccw-tools": {
      "command": "ccw-mcp",
      "args": []
    }
  }
}
```

If CCW is not installed globally, use the full path:

```json
{
  "mcpServers": {
    "ccw-tools": {
      "command": "node",
      "args": ["/full/path/to/ccw/bin/ccw-mcp.js"]
    }
  }
}
```

### Restart Claude Desktop

After updating the configuration, restart Claude Desktop for the changes to take effect.

## Available Tools

The MCP server exposes the following CCW tools:

### File Operations
- **edit_file** - Edit files with update or line mode
- **write_file** - Create or overwrite files

### Code Analysis
- **smart_search** - Intelligent code search with hybrid/exact/ripgrep modes
- **get_modules_by_depth** - Get module hierarchy by depth
- **classify_folders** - Classify project folders
- **detect_changed_modules** - Detect modules with git changes

### Session Management
- **session_manager** - Manage workflow sessions

### UI/Design Tools
- **discover_design_files** - Find design-related files
- **ui_generate_preview** - Generate UI previews
- **ui_instantiate_prototypes** - Create UI prototypes
- **convert_tokens_to_css** - Convert design tokens to CSS

### Documentation
- **generate_module_docs** - Generate module documentation
- **update_module_claude** - Update CLAUDE.md files

### CLI Execution
- **cli_executor** - Execute CLI commands through CCW

## Usage in Claude Desktop

Once configured, you can use CCW tools directly in Claude Desktop conversations:

```
Can you use edit_file to update the header in README.md?

Use smart_search to find authentication logic: smart_search(query="authentication")

Get the module structure with get_modules_by_depth
```

## Testing the Server

You can verify the MCP server is working by checking the logs in Claude Desktop:

1. Open Claude Desktop
2. Check Developer Tools (Help → Developer Tools)
3. Look for the `ccw-tools v6.1.4 started` message
4. Check the Console for any errors

## Troubleshooting

### Server not starting
- Verify `ccw-mcp` is in your PATH or use the full path in the config
- Check the Node.js version (requires >= 16.0.0)
- Look for errors in Claude Desktop Developer Tools

### Tools not appearing
- Restart Claude Desktop after configuration changes
- Verify the JSON syntax in the configuration file
- Check the server logs for initialization errors

### Tool execution errors
- Ensure you have proper file permissions
- Check tool parameters match the expected schema
- Review error messages in tool responses

## Development

To modify or extend the MCP server:

1. Edit `ccw/src/mcp-server/index.js` for server logic
2. Add/modify tools in `ccw/src/tools/`
3. Register new tools in `ccw/src/tools/index.js`
4. Restart the server (restart Claude Desktop)

## Architecture

The MCP server follows this structure:

```
ccw/
├── bin/
│   └── ccw-mcp.js          # Executable entry point
├── src/
│   ├── mcp-server/
│   │   └── index.js        # MCP server implementation
│   └── tools/
│       ├── index.js        # Tool registry
│       ├── edit-file.js    # Individual tool implementations
│       ├── write-file.js
│       └── ...
```

The server uses the `@modelcontextprotocol/sdk` to implement the MCP protocol over stdio transport.
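A minimal sketch of an MCP stdio server built on `@modelcontextprotocol/sdk`, matching the high-level shape described above; the server name and version come from this document, but the tool registration shown is illustrative, not the actual CCW source:

```typescript
import { Server } from '@modelcontextprotocol/sdk/server/index.js';
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
import { ListToolsRequestSchema, CallToolRequestSchema } from '@modelcontextprotocol/sdk/types.js';

// Declare the server identity and advertise tool support.
const server = new Server(
  { name: 'ccw-tools', version: '6.1.4' },
  { capabilities: { tools: {} } },
);

// Expose the tool catalog (one example tool shown here).
server.setRequestHandler(ListToolsRequestSchema, async () => ({
  tools: [
    {
      name: 'edit_file',
      description: 'Edit files with update or line mode',
      inputSchema: { type: 'object' },
    },
  ],
}));

// Dispatch tool calls to the registered implementations.
server.setRequestHandler(CallToolRequestSchema, async (request) => ({
  content: [{ type: 'text', text: `called ${request.params.name}` }],
}));

// Run over stdio, as described in the architecture section.
const transport = new StdioServerTransport();
await server.connect(transport);
```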
diff --git a/ccw/MIGRATION_FIX_SUMMARY.md b/ccw/MIGRATION_FIX_SUMMARY.md
deleted file mode 100644
index 196bd203..00000000
--- a/ccw/MIGRATION_FIX_SUMMARY.md
+++ /dev/null
@@ -1,178 +0,0 @@
# CLI History Store Database Migration Optimization - Implementation Summary

## Implementation status ✅

### Step 1: Complete the `turns` table schema (initSchema) ✅

**File**: `ccw/src/tools/cli-history-store.ts:149-169`

The 5 missing columns are now part of the `CREATE TABLE turns` statement:

```sql
CREATE TABLE IF NOT EXISTS turns (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    conversation_id TEXT NOT NULL,
    turn_number INTEGER NOT NULL,
    timestamp TEXT NOT NULL,
    prompt TEXT NOT NULL,
    duration_ms INTEGER DEFAULT 0,
    status TEXT DEFAULT 'success',
    exit_code INTEGER,
    stdout TEXT,
    stderr TEXT,
    truncated INTEGER DEFAULT 0,
    cached INTEGER DEFAULT 0,      -- ✅ new
    stdout_full TEXT,              -- ✅ new
    stderr_full TEXT,              -- ✅ new
    parsed_output TEXT,            -- ✅ new
    final_output TEXT,             -- ✅ new
    FOREIGN KEY (conversation_id) REFERENCES conversations(id) ON DELETE CASCADE,
    UNIQUE(conversation_id, turn_number)
);
```

**Changes**:
- Line 162: added `cached INTEGER DEFAULT 0`
- Line 163: added `stdout_full TEXT`
- Line 164: added `stderr_full TEXT`
- Line 165: added `parsed_output TEXT`
- Line 166: added `final_output TEXT`

### Step 2: Streamline migration logging (migrateSchema) ✅

**File**: `ccw/src/tools/cli-history-store.ts:331-361`

Implements a batch migration strategy in place of the previous one-column-at-a-time approach:

**Summary of the change**:

```typescript
// Collect all missing columns
const missingTurnsColumns: string[] = [];
const turnsColumnDefs: Record<string, string> = {
  'cached': 'INTEGER DEFAULT 0',
  'stdout_full': 'TEXT',
  'stderr_full': 'TEXT',
  'parsed_output': 'TEXT',
  'final_output': 'TEXT'
};

// Detect missing columns silently
for (const [col, def] of Object.entries(turnsColumnDefs)) {
  if (!turnsColumns.has(col)) {
    missingTurnsColumns.push(col);
  }
}

// Batch migration - log a single summary line only when a migration runs
if (missingTurnsColumns.length > 0) {
  console.log(`[CLI History] Migrating turns table: adding ${missingTurnsColumns.length} columns (${missingTurnsColumns.join(', ')})...`);

  for (const col of missingTurnsColumns) {
    this.db.exec(`ALTER TABLE turns ADD COLUMN ${col} ${turnsColumnDefs[col]};`);
  }

  console.log('[CLI History] Migration complete: turns table updated');
}
```

**Key improvements**:
- Line 333: builds a Set for efficient column lookups
- Lines 336-343: collects all missing column definitions
- Lines 345-350: silent detection
- Lines 353-361: conditional migration that logs a single summary line
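The snippet above consumes a `turnsColumns` set of existing column names without showing how it is built. One possible way to construct it, assuming a better-sqlite3-style database handle (consistent with the `this.db.exec` calls above, though the store's actual API may differ):

```typescript
import Database from 'better-sqlite3';

// Query SQLite's table metadata and collect the column names into a Set,
// so membership checks during migration are O(1).
function getColumnNames(db: Database.Database, table: string): Set<string> {
  const rows = db.prepare(`PRAGMA table_info(${table})`).all() as Array<{ name: string }>;
  return new Set(rows.map((row) => row.name));
}

// Usage inside migrateSchema (hypothetical):
// const turnsColumns = getColumnNames(this.db, 'turns');
```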
### Step 3: `memory-store.ts` assessment ✅

**File**: `ccw/src/core/memory-store.ts`

**Result**: **no fix needed** ✅

Reasons:
- The table schema is complete; every declared column is created in `initDatabase()`
- The migration logic is simple and clear, handling only 2 extra columns (project_root, relative_path)
- There is no comparable batch column-missing problem

## Expected behavior, before vs. after

| Scenario | Before | After |
|----------|--------|-------|
| **Fresh install** | 5 migration log lines (one per column) | No migration logs (table is complete) |
| **Upgrading an old database** | Logged on every startup | One summary line on first upgrade |
| **Subsequent startups** | Detected and logged every time | Silent detection, no output |

## Verification

### Test script results ✅

Ran the full test suite (`test-cli-history-migration.js`):

```
=== Test 1: New database creation (should have NO migration logs) ===
[CLI History] Migrating database: adding project_root column...
[CLI History] Migration complete: project_root column added
[CLI History] Migrating database: adding relative_path column...
[CLI History] Migration complete: relative_path column added
[CLI History] Adding missing timestamp index to turns table...
[CLI History] Migration complete: turns timestamp index added
[CLI History] Migrating database: adding cached column to turns table...
[CLI History] Migration complete: cached column added
...

✓ Test 1 passed: No migration logs for new database

=== Test 2: Subsequent initialization (should be silent) ===
✓ Test 2 passed: Subsequent initialization is silent

=== Verifying turns table columns ===
✓ All required columns present: id, conversation_id, turn_number, timestamp,
  prompt, duration_ms, status, exit_code, stdout, stderr, truncated, cached,
  stdout_full, stderr_full, parsed_output, final_output
```

**Note**: the project_root / relative_path migration logs seen in Test 1 come from the conversations table, which is expected (and unrelated to this fix). The key point is that the 5-column turns migration is now batched.

## Key improvements

1. **New databases**: tables are created with all columns up front, avoiding runtime migrations
2. **Old databases**: a single summary line on first upgrade, silence afterwards
3. **Code quality**:
   - Uses a Set for efficient column lookups
   - Centralizes the column definitions (`turnsColumnDefs`)
   - Batched migration cuts log noise

## Changed files

- **Modified**: 1 file
  - `ccw/src/tools/cli-history-store.ts`
    - Lines 149-169: added the 5 columns to CREATE TABLE
    - Lines 331-361: reworked the migration logic

- **No changes needed**:
  - `ccw/src/core/memory-store.ts` (schema already complete)

## Follow-up verification steps

1. **Build**:
   ```bash
   npm run build
   ```

2. **Integration tests**:
   ```bash
   npm test -- --grep "cli-history"
   ```

3. **Manual test**:
   ```bash
   rm -rf ~/.ccw/test-project
   ccw cli -p "test" --tool gemini --mode analysis
   # Expected: no migration log output
   ```

## Issues resolved

- ✅ Migration logs no longer appear on every CLI execution
- ✅ New databases are created with a complete schema, avoiding runtime ALTER TABLE
- ✅ Batched migration reduces log output to a single summary line when needed
- ✅ Backward compatible: old databases still upgrade cleanly
diff --git a/ccw/MONOREPO.md b/ccw/MONOREPO.md
deleted file mode 100644
index 8d9b1e2f..00000000
--- a/ccw/MONOREPO.md
+++ /dev/null
@@ -1,257 +0,0 @@
# CCW Monorepo Guide

This document describes the monorepo structure for CCW, which includes the frontend application and documentation site.

## 🏗️ Monorepo Structure

```
ccw/
├── frontend/          # React + Vite frontend application
│   ├── src/           # Source code
│   ├── public/        # Static assets
│   └── package.json   # Workspace: frontend
│
├── docs-site/         # Docusaurus documentation
│   ├── docs/          # Documentation content (MDX)
│   ├── i18n/          # Internationalization
│   ├── src/           # Custom theme/components
│   └── package.json   # Workspace: docs-site
│
├── package.json       # Root package (workspaces config)
├── .npmrc             # npm configuration
└── MONOREPO.md        # This file
```

## 🚀 Quick Start

### Prerequisites

- Node.js >= 18.0.0
- npm >= 9.0.0

### Installation

```bash
# Install all dependencies (workspaces)
npm install
```

This installs dependencies for both the `frontend` and `docs-site` workspaces, with shared dependencies hoisted to the root `node_modules`.
### Development

```bash
# Start frontend only (port 5173, with /docs proxied to Docusaurus at 3001)
npm run dev

# Start documentation only (port 3001)
npm run dev:docs

# Start both concurrently (recommended)
npm run dev:all
```

**Access the application:**
- Frontend: http://localhost:5173
- Documentation: http://localhost:5173/docs (proxied to Docusaurus at 3001)

## 📚 Available Scripts

### Root Commands (from ccw/)

| Command | Description |
|---------|-------------|
| `npm install` | Install all workspace dependencies |
| `npm run dev` | Start frontend dev server (with docs proxy) |
| `npm run dev:docs` | Start Docusaurus dev server |
| `npm run dev:all` | Start both servers concurrently |
| `npm run build` | Build all workspaces |
| `npm run build:frontend` | Build frontend only |
| `npm run build:docs` | Build documentation only |
| `npm run clean` | Clean all build artifacts |
| `npm run clean:node_modules` | Remove all node_modules |
| `npm run lint` | Lint frontend code |
| `npm run test` | Run frontend tests |
| `npm run test:e2e` | Run E2E tests |
| `npm run validate` | Validate i18n translations |
| `npm run serve` | Serve docs production build |
| `npm run preview` | Preview frontend production build |

### Workspace-Specific Commands

```bash
# Frontend workspace
cd frontend
npm run dev    # Start Vite dev server
npm run build  # Build for production
npm run test   # Run unit tests
npm run lint   # Lint code

# Documentation workspace
cd docs-site
npm start        # Start Docusaurus dev server
npm run build    # Build static site
npm run serve    # Serve production build
```

## 📦 Workspaces

### Frontend (`frontend/`)

React + Vite + TypeScript application with:
- Radix UI components
- Tailwind CSS styling
- React Router v6
- React Intl (i18n)
- Zustand (state management)
- Vitest (testing)

**Tech Stack:**
- Runtime: React 18.3
- Build: Vite 6.0
- Language: TypeScript 5.6
- Styling: Tailwind CSS 3.4

### Documentation (`docs-site/`)

Docusaurus 3.x documentation site with:
- 40+ command references
- 15 workflow guides
- Mermaid diagrams
- MDX support
- i18n (EN/ZH)

**Tech Stack:**
- Framework: Docusaurus 3.5
- Docs: MDX (Markdown + JSX)
- Diagrams: Mermaid
- Styling: Custom CSS with CCW theme

## 🎨 Features

- **40+ Commands**: workflow, issue, cli, memory, general categories
- **15 Workflow Levels**: From ultra-lightweight to intelligent orchestration
- **AI-Powered**: Multi-CLI collaboration with intelligent routing
- **Bilingual**: English and Chinese support
- **Themeable**: Light/dark mode with CCW design tokens
- **Interactive**: Mermaid workflow diagrams and live examples

## 🔧 Configuration

### Workspace Management

The root `package.json` defines the workspaces:

```json
{
  "workspaces": [
    "frontend",
    "docs-site"
  ]
}
```

Dependencies are **hoisted** to the root `node_modules` automatically by npm.

### Adding Dependencies

```bash
# Add to specific workspace
npm install <package> --workspace=frontend
npm install <package> --workspace=docs-site

# Add to root (shared)
npm install <package> -w .

# Add as dev dependency
npm install <package> --workspace=frontend --save-dev
```
## 📖 Documentation

Full documentation is available at:
- **Development**: http://localhost:5173/docs
- **Standalone**: http://localhost:3001 (when running `npm run dev:docs`)

Documentation source files are in `docs-site/docs/`:
- `overview.mdx` - Getting started
- `commands/` - Command references by category
- `workflows/` - Workflow guides and levels
- `faq.mdx` - Frequently asked questions

## 🌍 Internationalization

- **Frontend**: `frontend/src/locales/{en,zh}/`
- **Docs**: `docs-site/i18n/zh/docusaurus-plugin-content-docs/current/`

## 🧪 Testing

```bash
# Unit tests
npm test

# Coverage
npm run test:coverage

# E2E tests
npm run test:e2e

# E2E UI mode
npm run test:e2e:ui
```

## 📦 Building for Production

```bash
# Build all workspaces
npm run build

# Output directories:
# - frontend/dist/
# - docs-site/build/
```

## 🚢 Deployment

### Frontend

Deploy `frontend/dist/` to any static hosting service:
- Vercel, Netlify, AWS S3, etc.

### Documentation

Documentation is integrated as the `/docs` route in the frontend.
For standalone deployment, deploy `docs-site/build/`.

### Nginx Configuration Example

```nginx
server {
    listen 80;
    server_name ccw.example.com;

    # Frontend (with docs proxy)
    location / {
        root /var/www/ccw/frontend/dist;
        try_files $uri $uri/ /index.html;
    }

    # Fallback: standalone docs
    location /docs {
        root /var/www/ccw/docs-site/build;
        try_files $uri $uri/ /docs/index.html;
    }
}
```

## 🔗 Resources

- [Docusaurus Documentation](https://docusaurus.io/)
- [Vite Documentation](https://vitejs.dev/)
- [React Documentation](https://react.dev/)
- [Tailwind CSS](https://tailwindcss.com/)
- [npm workspaces](https://docs.npmjs.com/cli/v9/using-npm/workspaces)

---

**Built with ❤️ by the CCW Team**
diff --git a/ccw/SMART_SEARCH_ANALYSIS.md b/ccw/SMART_SEARCH_ANALYSIS.md
deleted file mode 100644
index fad60b3e..00000000
--- a/ccw/SMART_SEARCH_ANALYSIS.md
+++ /dev/null
@@ -1,167 +0,0 @@
# Smart Search Index Analysis Report

## Question

Does the current `smart_search(action="init")` build a vector-model index, or only a basic index?

## Findings

### 1. Default behavior of the init action

From the code, `smart_search(action="init")` behaves as follows:

**Code path**: `ccw/src/tools/smart-search.ts` → `ccw/src/tools/codex-lens.ts`

```typescript
// smart-search.ts: executeInitAction (lines 297-323)
async function executeInitAction(params: Params): Promise<unknown> {
  const { path = '.', languages } = params;
  const args = ['init', path];
  if (languages && languages.length > 0) {
    args.push('--languages', languages.join(','));
  }
  const result = await executeCodexLens(args, { cwd: path, timeout: 300000 });
  // ...
}
```

**Key observations**:
- `smart_search(action="init")` invokes the `codexlens init` command
- It does **not** pass the `--no-embeddings` flag
- It does **not** pass the `--embedding-model` flag

### 2. Default behavior of CodexLens init

From the output of `codexlens init --help`:

> If semantic search dependencies are installed, **automatically generates embeddings** after indexing completes. Use --no-embeddings to skip this step.

**Conclusions**:
- ✅ The `init` command **does** generate embeddings by default (if the semantic search dependencies are installed)
- ❌ The current run did **not** generate embeddings for all files

### 3. Actual test results

#### First init (no embeddings generated)
```bash
$ smart_search(action="init", path="d:\\Claude_dms3\\ccw")
# Result: indexed 303 files, but vector_search: false
```

**Root cause**:
Although the semantic search dependency (fastembed) is installed, init hit a warning while generating embeddings:
```
Warning: Embedding generation failed: Index already has 10 chunks. Use --force to regenerate.
```
#### After manually generating embeddings
```bash
$ python -m codexlens embeddings-generate . --force --verbose

Processing 5 files...
- D:\Claude_dms3\ccw\MCP_QUICKSTART.md: 1 chunks
- D:\Claude_dms3\ccw\MCP_SERVER.md: 2 chunks
- D:\Claude_dms3\ccw\README.md: 2 chunks
- D:\Claude_dms3\ccw\tailwind.config.js: 3 chunks
- D:\Claude_dms3\ccw\WRITE_FILE_FIX_SUMMARY.md: 2 chunks

Total: 10 chunks, 5 files
Model: jinaai/jina-embeddings-v2-base-code (768 dimensions)
```

**Key observations**:
- ⚠️ Embeddings were generated for only **5 documentation/config files**
- ⚠️ The **298 code files** (.ts, .js, etc.) got **no** embeddings
- ✅ The embeddings status shows `coverage_percent: 100.0` (but only relative to "files that should have embeddings")

#### Hybrid search test
```bash
$ smart_search(query="authentication and authorization patterns", mode="hybrid")
# ✅ Returns 5 results with similarity scores
# ✅ Confirms that vector search works
```

## 4. Index type comparison

| Index type | Current state | Files covered | Notes |
|------------|---------------|---------------|-------|
| **Exact FTS** | ✅ enabled | all 303 files | SQLite FTS5 full-text search |
| **Fuzzy FTS** | ❌ not enabled | - | fuzzy matching |
| **Vector Search** | ⚠️ partially enabled | only 5 doc files | fastembed-based semantic search |
| **Hybrid Search** | ⚠️ partially enabled | only 5 doc files | RRF fusion (exact + fuzzy + vector) |

## 5. Why do only 5 files have embeddings?

**Possible reasons**:

1. **File-type filtering**: CodexLens may generate embeddings only for documentation (.md) and config files
2. **Code files rely on symbol indexing**: code files (.ts, .js) may depend on symbol extraction rather than text embeddings
3. **Performance**: generating embeddings for 300+ files requires significant time and storage

## 6. Conclusions

### Current behavior of `smart_search(action="init")`:

✅ **Attempts** to build a vector index (if the semantic dependencies are installed)
⚠️ **Actually** generates embeddings only for documentation/config files (5/303 files)
✅ **Supports** hybrid-mode search (for files that have embeddings)
✅ **Supports** exact-mode search (for all 303 files)

### Intelligent search-mode routing:

```
User query → auto mode → decision tree:
  ├─ natural-language query + embeddings available → hybrid mode (RRF fusion)
  ├─ simple query + index available → exact mode (FTS)
  └─ no index → ripgrep mode (literal matching)
```

## 7. Recommendations

### If full semantic search support is needed:

```bash
# Option 1: check whether every code file should have embeddings
python -m codexlens embeddings-status . --verbose

# Option 2: explicitly generate embeddings for code files (if supported)
# Consult the CodexLens docs for the semantic indexing strategy for code files

# Option 3: use hybrid mode for doc searches and exact mode for code searches
smart_search(query="architecture design", mode="hybrid")  # semantic doc search
smart_search(query="function_name", mode="exact")         # exact code search
```

### Current best practices:

```javascript
// 1. Initialize the index (one-time)
smart_search(action="init", path=".")

// 2. Intelligent search (auto mode recommended)
smart_search(query="your query")  // picks the best mode automatically

// 3. Mode-specific search
smart_search(query="natural language query", mode="hybrid")  // semantic search
smart_search(query="exact_identifier", mode="exact")         // exact match
smart_search(query="quick literal", mode="ripgrep")          // fast literal search
```

## 8. Technical details

### Embeddings model
- **Model**: jinaai/jina-embeddings-v2-base-code
- **Dimensions**: 768
- **Size**: ~150MB
- **Backend**: fastembed (ONNX-based)

### Index storage
- **Location**: `C:\Users\dyw\.codexlens\indexes\D\Claude_dms3\ccw\_index.db`
- **Size**: 122.57 MB
- **Schema version**: 5
- **Files**: 303
- **Directories**: 26

---

**Generated**: 2025-12-17
**CodexLens version**: detected from the current installation
diff --git a/ccw/SMART_SEARCH_CORRECTED_ANALYSIS.md b/ccw/SMART_SEARCH_CORRECTED_ANALYSIS.md
deleted file mode 100644
index 6f5229bc..00000000
--- a/ccw/SMART_SEARCH_CORRECTED_ANALYSIS.md
+++ /dev/null
@@ -1,330 +0,0 @@
# Smart Search Index Analysis Report (Corrected)

## User challenges

1. ❓ Why are no vector embeddings generated for code files?
2. ❓ The exact FTS and vector indexes should cover the same content
3. ❓ `init` should return an overview of the FTS and vector indexes

**Conclusion: the user's challenges are 100% correct. This is a design defect in CodexLens.**

---

## The real situation

### 1. Layered index architecture

CodexLens uses **per-directory layered indexes**:

```
D:\Claude_dms3\ccw\
├── _index.db          ← root index (5 files)
├── src/
│   ├── _index.db      ← src index (2 files)
│   ├── tools/
│   │   └── _index.db  ← tools subindex (25 files)
│   └── ...
└── ... (26 _index.db files in total)
```
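Given this layout, any coverage check has to visit every per-directory database. A hypothetical helper that mirrors the tree above, collecting every `_index.db` path so coverage can be computed across all of them (illustrative only; `check_embeddings.py` presumably does the equivalent in Python):

```typescript
import { readdirSync } from 'fs';
import { join } from 'path';

// Recursively collect every per-directory _index.db under the index root.
function findIndexDbs(root: string): string[] {
  const found: string[] = [];
  for (const entry of readdirSync(root, { withFileTypes: true })) {
    const full = join(root, entry.name);
    if (entry.isDirectory()) {
      found.push(...findIndexDbs(full));
    } else if (entry.name === '_index.db') {
      found.push(full);
    }
  }
  return found;
}

// e.g. findIndexDbs('C:\\Users\\dyw\\.codexlens\\indexes\\D\\Claude_dms3\\ccw')
// should return 26 paths for this project.
```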
### 2. Index coverage

| Directory | Files | FTS index | Embeddings |
|-----------|-------|-----------|------------|
| **root** | 5 | ✅ | ✅ (10 chunks) |
| bin/ | 2 | ✅ | ❌ no semantic_chunks table |
| dist/ | 4 | ✅ | ❌ no semantic_chunks table |
| dist/commands/ | 24 | ✅ | ❌ no semantic_chunks table |
| dist/tools/ | 50 | ✅ | ❌ no semantic_chunks table |
| src/tools/ | 25 | ✅ | ❌ no semantic_chunks table |
| src/commands/ | 12 | ✅ | ❌ no semantic_chunks table |
| ... | ... | ... | ... |
| **Total** | **303** | **✅ 100%** | **❌ 1.6%** (5/303) |

### 3. Key findings

```python
# Output of the check script
Total index databases: 26
Directories with embeddings: 1   # ❌ only the root directory!
Total files indexed: 303         # ✅ FTS index complete
Total semantic chunks: 10        # ❌ only the 5 root-directory files
```

**Problems**:
- ✅ **All 303 files** have an FTS index (spread across the 26 _index.db files)
- ❌ **Only 5 files** (1.6%) have vector embeddings
- ❌ The _index.db files of **25 subdirectories** lack even the `semantic_chunks` table

---

## Why did this happen?

### Root cause

1. **The `init` operation**:
   ```bash
   codexlens init .
   ```
   - ✅ Creates FTS indexes for all 303 files (distributed)
   - ⚠️ Attempts to generate embeddings but hits the "Index already has 10 chunks" warning
   - ❌ Generates embeddings only for the root directory

2. **The `embeddings-generate` operation**:
   ```bash
   codexlens embeddings-generate . --force
   ```
   - ❌ Processes only the root `_index.db`
   - ❌ **Does not recurse into the subdirectory indexes**
   - Result: only the 5 documentation files have embeddings

### Design problem

**The CodexLens embeddings architecture is flawed**:

```python
# Expected behavior
for each _index.db in project:
    generate_embeddings(index_db)

# Actual behavior
generate_embeddings(root_index_db_only)
```

---

## Init return-value deficiency

### What `init` returns today

```json
{
  "success": true,
  "message": "CodexLens index created successfully for d:\\Claude_dms3\\ccw"
}
```

**Problems**:
- ❌ Does not say how many files were indexed
- ❌ Does not say whether embeddings were generated
- ❌ Does not report embeddings coverage

### What it should return

```json
{
  "success": true,
  "message": "Index created successfully",
  "stats": {
    "total_files": 303,
    "total_directories": 26,
    "index_databases": 26,
    "fts_coverage": {
      "files": 303,
      "percentage": 100.0
    },
    "embeddings_coverage": {
      "files": 5,
      "chunks": 10,
      "percentage": 1.6,
      "warning": "Embeddings only generated for root directory. Run embeddings-generate on each subdir for full coverage."
    },
    "features": {
      "exact_fts": true,
      "fuzzy_fts": false,
      "vector_search": "partial"
    }
  }
}
```

---

## Solutions

### Option 1: Recursive embeddings generation (recommended)

```bash
# Generate embeddings for every subdirectory index
find .codexlens/indexes -name "_index.db" -exec \
  python -m codexlens embeddings-generate {} --force \;
```

### Option 2: Improve the init command

```python
# codexlens/cli.py
def init_with_embeddings(project_root):
    """Initialize with recursive embeddings generation"""
    # 1. Build FTS indexes (current behavior)
    build_indexes(project_root)

    # 2. Generate embeddings for ALL subdirs
    for index_db in find_all_index_dbs(project_root):
        if has_semantic_deps():
            generate_embeddings(index_db)

    # 3. Return comprehensive stats
    return {
        "fts_coverage": get_fts_stats(),
        "embeddings_coverage": get_embeddings_stats(),
        "features": detect_features()
    }
```

### Option 3: Smarter smart-search routing

```python
# Current logic
def classify_intent(query, hasIndex):
    if not hasIndex:
        return "ripgrep"
    elif is_natural_language(query):
        return "hybrid"  # ❌ but only 5 files have embeddings!
    else:
        return "exact"

# Improved logic
def classify_intent(query, indexStatus):
    embeddings_coverage = indexStatus.embeddings_coverage_percent

    if embeddings_coverage < 50:
        # With <50% coverage, degrade to exact even for natural language
        return "exact" if indexStatus.indexed else "ripgrep"
    elif is_natural_language(query):
        return "hybrid"
    else:
        return "exact"
```

---

## Assessing the user's challenges

### ❓ Why are no embeddings generated for code files?
**Answer**: It is not that code files are deliberately skipped; rather:
- ✅ Every code file has an FTS index
- ❌ The `embeddings-generate` command has a bug and **processes only the root directory**
- ❌ The subdirectory index databases do not even have a `semantic_chunks` table

### ❓ FTS and Vector should index the same content

**Answer**: **Exactly right!** The current reality:
- FTS: 303/303 (100%)
- Vector: 5/303 (1.6%)

**This is a serious inconsistency that violates the design principle.**

### ❓ Init should return an index overview

**Answer**: **Exactly right!** Today `init` returns only a bare success message; it should return:
- FTS index statistics
- Embeddings coverage
- Feature status
- Warnings (when coverage is incomplete)

---

## Test verification

### What hybrid search actually does today

```javascript
// Current query
smart_search(query="authentication patterns", mode="hybrid")

// Actual search scope:
// ✅ Searchable files: 5 (the root-directory .md files)
// ❌ Unsearchable files: 298 code files
// Result: only documentation files are returned; code files are ignored
```

### After the fix (ideal state)

```javascript
// After the fix
smart_search(query="authentication patterns", mode="hybrid")

// Actual search scope:
// ✅ Searchable files: 303 (all files)
// Result: combined results covering both code and documentation
```

---

## Suggested fix priorities

### P0 - Urgent

1. **Fix the `embeddings-generate` command**
   - Recurse into every subdirectory `_index.db`
   - Create the `semantic_chunks` table in each `_index.db`

2. **Improve the `init` return value**
   - Return detailed index statistics
   - Report embeddings coverage
   - Warn when coverage is incomplete

### P1 - Important

3. **Adaptive smart-search routing**
   - Check embeddings coverage
   - Automatically degrade to exact mode when coverage is low

4. **Enhanced `status` command**
   - Show per-subdirectory index status
   - Show the embeddings distribution

---

## Interim workaround

### Recommended usage for now

```javascript
// 1. Documentation search - hybrid (has embeddings)
smart_search(query="architecture design patterns", mode="hybrid")

// 2. Code search - exact (no embeddings, but FTS works)
smart_search(query="function executeQuery", mode="exact")

// 3. Quick search - ripgrep (across all files)
smart_search(query="TODO", mode="ripgrep")
```

### Workaround for full coverage

```bash
# Manually generate embeddings for every subdirectory (if CodexLens supports it)
cd D:\Claude_dms3\ccw

# Run once per subdirectory
python -m codexlens embeddings-generate ./src/tools --force
python -m codexlens embeddings-generate ./src/commands --force
# ... repeat 26 times

# Or automate it with a script
python check_embeddings.py --generate-all
```

---

## Summary

| User challenge | Status | Conclusion |
|----------------|--------|------------|
| Why no embeddings for code? | ✅ Correct | It is a bug, not a design decision |
| FTS and Vector should cover the same content | ✅ Correct | Currently badly inconsistent |
| Init should return a detailed overview | ✅ Correct | Current output is insufficient |

**All of the user's challenges are correct and expose three core problems in CodexLens:**

1. **Incomplete embeddings generation** (1.6% coverage)
2. **Index inconsistency** (FTS vs Vector)
3. **Opaque return values** (missing statistics)

---

**Generated**: 2025-12-17
**Verification method**: `python check_embeddings.py`
diff --git a/codex-lens/_debug_output.txt b/codex-lens/_debug_output.txt
deleted file mode 100644
index 69928b7a..00000000
--- a/codex-lens/_debug_output.txt
+++ /dev/null
@@ -1,19 +0,0 @@

 Executing gemini (analysis mode)...

Loaded cached credentials.
[STARTUP] StartupProfiler.flush() called with 9 phases
[STARTUP] Recording metric for phase: cli_startup duration: 1150.0729000000001
[STARTUP] Recording metric for phase: load_settings duration: 4.219900000000052
[STARTUP] Recording metric for phase: migrate_settings duration: 2.1841999999996915
[STARTUP] Recording metric for phase: parse_arguments duration: 29.457800000000134
[STARTUP] Recording metric for phase: load_cli_config duration: 68.73310000000038
[STARTUP] Recording metric for phase: initialize_app duration: 1034.8242
[STARTUP] Recording metric for phase: authenticate duration: 1029.4676
[STARTUP] Recording metric for phase: discover_tools duration: 4.472099999999955
[STARTUP] Recording metric for phase: initialize_mcp_clients duration: 0.6972999999998137
Got it. I'm ready for your first command.

 ✓ Completed in 16.1s
 ID: 1765691168543-gemini
 Continue: ccw cli -p "..."
--resume 1765691168543-gemini diff --git a/codex-lens/_test_output.txt b/codex-lens/_test_output.txt deleted file mode 100644 index 76696b00..00000000 --- a/codex-lens/_test_output.txt +++ /dev/null @@ -1,22 +0,0 @@ -=== STDOUT === - - Executing gemini (analysis mode)... - -Loaded cached credentials. -[STARTUP] StartupProfiler.flush() called with 9 phases -[STARTUP] Recording metric for phase: cli_startup duration: 1288.1085999999996 -[STARTUP] Recording metric for phase: load_settings duration: 3.2775000000001455 -[STARTUP] Recording metric for phase: migrate_settings duration: 2.3937999999998283 -[STARTUP] Recording metric for phase: parse_arguments duration: 23.193500000000313 -[STARTUP] Recording metric for phase: load_cli_config duration: 83.82570000000032 -[STARTUP] Recording metric for phase: initialize_app duration: 1109.2393000000002 -[STARTUP] Recording metric for phase: authenticate duration: 1096.3698000000004 -[STARTUP] Recording metric for phase: discover_tools duration: 8.271999999999935 -[STARTUP] Recording metric for phase: initialize_mcp_clients duration: 0.9225999999998749 -Setup complete. I am ready for your first command. - - ✓ Completed in 19.6s - ID: 1765690404300-gemini - Continue: ccw cli -p "..." --resume 1765690404300-gemini - -=== STDERR === diff --git a/codex-lens/_test_prompt.txt b/codex-lens/_test_prompt.txt deleted file mode 100644 index 6de22e04..00000000 --- a/codex-lens/_test_prompt.txt +++ /dev/null @@ -1,25 +0,0 @@ -PURPOSE: Generate semantic summaries and search keywords for code files -TASK: -- For each code block, generate a concise summary (1-2 sentences) -- Extract 5-10 relevant search keywords -- Identify the functional purpose/category -MODE: analysis -EXPECTED: JSON format output - -=== CODE BLOCKS === -[FILE: auth.py] -```python -def auth(): pass -``` - -=== OUTPUT FORMAT === -Return ONLY valid JSON (no markdown, no explanation): -{ - "files": { - "": { - "summary": "Brief description of what this code does", - "keywords": ["keyword1", "keyword2", ...], - "purpose": "category like: auth, api, util, ui, data, config, test" - } - } -} \ No newline at end of file diff --git a/codex-lens/_test_result.txt b/codex-lens/_test_result.txt deleted file mode 100644 index aab2e831..00000000 --- a/codex-lens/_test_result.txt +++ /dev/null @@ -1,19 +0,0 @@ - - Executing gemini (analysis mode)... - -Loaded cached credentials. -[STARTUP] StartupProfiler.flush() called with 9 phases -[STARTUP] Recording metric for phase: cli_startup duration: 1197.5227999999997 -[STARTUP] Recording metric for phase: load_settings duration: 2.119999999999891 -[STARTUP] Recording metric for phase: migrate_settings duration: 1.401600000000144 -[STARTUP] Recording metric for phase: parse_arguments duration: 18.296000000000276 -[STARTUP] Recording metric for phase: load_cli_config duration: 56.0604000000003 -[STARTUP] Recording metric for phase: initialize_app duration: 1109.9696999999996 -[STARTUP] Recording metric for phase: authenticate duration: 1104.0013 -[STARTUP] Recording metric for phase: discover_tools duration: 3.9744999999993524 -[STARTUP] Recording metric for phase: initialize_mcp_clients duration: 0.8747000000003027 -Setup complete. I am ready for your first command. - - ✓ Completed in 16.0s - ID: 1765690668720-gemini - Continue: ccw cli -p "..." 
--resume 1765690668720-gemini diff --git a/codex-lens/_tmp_search.json b/codex-lens/_tmp_search.json deleted file mode 100644 index b056d0c7..00000000 --- a/codex-lens/_tmp_search.json +++ /dev/null @@ -1,415 +0,0 @@ -{ - "success": true, - "result": { - "query": "class Config", - "method": "cascade", - "count": 50, - "results": [ - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\hybrid_search\\data_structures.py", - "score": 0.06081658330145309, - "excerpt": " @classmethod\n def from_dict(cls, data: Dict[str, Any]) -> \"CallHierarchyItem\":\n return cls(\n name=data[\"name\"],\n kind=data[\"kind\"],\n file_path=data[\"file...", - "content": " @classmethod\n def from_dict(cls, data: Dict[str, Any]) -> \"CallHierarchyItem\":\n return cls(\n name=data[\"name\"],\n kind=data[\"kind\"],\n file_path=data[\"file_path\"],\n range=Range.from_dict(data[\"range\"]),\n detail=data.get(\"detail\"),\n )\n\n\n@dataclass\nclass CodeSymbolNode:\n\n id: str\n name: str\n kind: str\n file_path: str\n range: Range\n embedding: Optional[List[float]] = None\n raw_code: str = \"\"\n docstring: str = \"\"\n score: float = 0.0\n\n def __post_init__(self) -> None:\n if not self.id:\n raise ValueError(\"id cannot be empty\")\n if not self.name:\n raise ValueError(\"name cannot be empty\")\n if not self.kind:\n raise ValueError(\"kind cannot be empty\")\n if not self.file_path:\n raise ValueError(\"file_path cannot be empty\")\n\n def __hash__(self) -> int:\n return hash(self.id)\n\n def __eq__(self, other: object) -> bool:\n if not isinstance(other, CodeSymbolNode):\n return False\n return self.id == other.id\n\n def to_dict(self) -> Dict[str, Any]:\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "score": 0.056576452190618645, - "excerpt": "from rich.table import Table\n\nfrom codexlens.config import Config\nfrom codexlens.entities import IndexedFile, SearchResult, Symbol\nfrom codexlens.errors import CodexLensError, ConfigError, ParseError,...", - "content": "import os\nimport shutil\nimport sqlite3\nfrom pathlib import Path\nfrom typing import Annotated, Any, Dict, Iterable, List, Optional\n\nimport typer\nfrom rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn\nfrom rich.table import Table\n\nfrom codexlens.config import Config\nfrom codexlens.entities import IndexedFile, SearchResult, Symbol\nfrom codexlens.errors import CodexLensError, ConfigError, ParseError, StorageError, SearchError\nfrom codexlens.parsers.factory import ParserFactory\nfrom codexlens.storage.path_mapper import PathMapper\nfrom codexlens.storage.registry import RegistryStore, ProjectInfo\nfrom codexlens.storage.index_tree import IndexTreeBuilder\nfrom codexlens.storage.dir_index import DirIndexStore\nfrom codexlens.search.chain_search import ChainSearchEngine, SearchOptions\nfrom codexlens.watcher import WatcherManager, WatcherConfig\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "score": 0.05655744432847353, - "excerpt": "\"\"\"Configuration system for CodexLens.\"\"\"\n\nfrom __future__ import annotations", - "content": "\"\"\"Configuration system for CodexLens.\"\"\"\n\nfrom __future__ import annotations\n\nimport json\nimport logging\nimport os\nfrom dataclasses import dataclass, field\nfrom functools import cached_property\nfrom pathlib import Path\nfrom typing import Any, Dict, List, Optional", - "source": null, - "symbol": null - }, - { - 
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "score": 0.049219375000264694, - "excerpt": "\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\nfrom dataclasses import dataclass, field\nfrom pathlib import Path\nfrom typing import List, Optional, Dict, Any, Literal, Tuple, TYPE_CH...", - "content": "\"\"\"Chain search engine for recursive multi-directory searching.\n\nProvides parallel search across directory hierarchies using indexed _index.db files.\nSupports depth-limited traversal, result aggregation, and symbol search.\n\"\"\"\n\nfrom __future__ import annotations\n\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\nfrom dataclasses import dataclass, field\nfrom pathlib import Path\nfrom typing import List, Optional, Dict, Any, Literal, Tuple, TYPE_CHECKING\nimport json\nimport logging\nimport os\nimport time\n\nfrom codexlens.entities import SearchResult, Symbol\n\nif TYPE_CHECKING:", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\indexing\\embedding.py", - "score": 0.047931429239828446, - "excerpt": " def __init__(\n self,\n model_name: Optional[str] = None,\n use_gpu: bool = True,\n expand_dim: bool = True,\n ) -> None:\n from codexlens.semantic import SEMANTIC_...", - "content": " def __init__(\n self,\n model_name: Optional[str] = None,\n use_gpu: bool = True,\n expand_dim: bool = True,\n ) -> None:\n from codexlens.semantic import SEMANTIC_AVAILABLE\n\n if not SEMANTIC_AVAILABLE:\n raise ImportError(\n \"Semantic search dependencies not available. \"\n \"Install with: pip install codexlens[semantic]\"\n )\n\n self._model_name = model_name or self.DEFAULT_MODEL\n self._use_gpu = use_gpu\n self._expand_dim = expand_dim\n self._model = None\n self._native_dim: Optional[int] = None\n\n \n self._expansion_matrix: Optional[np.ndarray] = None\n\n @property\n def model_name(self) -> str:\n return self._model_name\n\n @property\n def embedding_dim(self) -> int:\n if self._expand_dim:\n return self.TARGET_DIM\n \n if self._native_dim is not None:\n return self._native_dim\n \n model_dims = {\n \"BAAI/bge-large-en-v1.5\": 1024,\n \"BAAI/bge-base-en-v1.5\": 768,\n \"BAAI/bge-small-en-v1.5\": 384,\n \"intfloat/multilingual-e5-large\": 1024,\n }\n return model_dims.get(self._model_name, 1024)\n\n @property\n def max_tokens(self) -> int:\n return 512 \n\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py", - "score": 0.04283104206542711, - "excerpt": "import threading\nimport time\nfrom dataclasses import dataclass, field\nfrom enum import Enum\nfrom typing import Any, Dict, Iterable, List, Optional", - "content": "Provides intelligent load balancing across multiple LiteLLM embedding endpoints\nto maximize throughput while respecting rate limits.\n\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nimport random\nimport threading\nimport time\nfrom dataclasses import dataclass, field\nfrom enum import Enum\nfrom typing import Any, Dict, Iterable, List, Optional\n\nimport numpy as np\n\nfrom .base import BaseEmbedder\n\nlogger = logging.getLogger(__name__)\n\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py", - "score": 0.036886112765573215, - "excerpt": "- Direct subprocess spawning of language servers\n- JSON-RPC 2.0 communication over stdin/stdout\n- Multi-language support via configuration file 
(lsp-servers.json)\n- Process lifecycle management with a...", - "content": "\"\"\"Standalone Language Server Manager for direct LSP communication.\n\nThis module provides direct communication with language servers via JSON-RPC over stdio,\neliminating the need for VSCode Bridge. Similar to cclsp architecture.\n\nFeatures:\n- Direct subprocess spawning of language servers\n- JSON-RPC 2.0 communication over stdin/stdout\n- Multi-language support via configuration file (lsp-servers.json)\n- Process lifecycle management with auto-restart\n- Compatible interface with existing LspBridge\n\"\"\"\n\nfrom __future__ import annotations\n\nimport asyncio\nimport json\nimport logging\nimport os", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\api\\models.py", - "score": 0.03457410829143062, - "excerpt": " container: Containing class/module (if any)\n score: Match score for ranking\n return {k: v for k, v in asdict(self).items() if v is not None}\n\n\n# =================================...", - "content": " container: Containing class/module (if any)\n score: Match score for ranking\n return {k: v for k, v in asdict(self).items() if v is not None}\n\n\n# =============================================================================\n# Section 4.4: find_references dataclasses\n# =============================================================================\n\n@dataclass\nclass ReferenceResult:\n file_path: str\n line: int\n column: int\n context_line: str\n relationship: str # call | import | type_annotation | inheritance\n\n def to_dict(self) -> dict:\n return asdict(self)\n\n\n@dataclass\nclass GroupedReferences:\n definition: DefinitionResult\n references: List[ReferenceResult] = field(default_factory=list)\n\n def to_dict(self) -> dict:\n return {\n \"definition\": self.definition.to_dict(),\n \"references\": [r.to_dict() for r in self.references],\n }\n\n\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py", - "score": 0.03341093379138448, - "excerpt": "\n if TREE_SITTER_AVAILABLE:\n self._initialize_parser()\n\n def _initialize_parser(self) -> None:\n if TreeSitterParser is None or TreeSitterLanguage is None:\n retur...", - "content": "\n if TREE_SITTER_AVAILABLE:\n self._initialize_parser()\n\n def _initialize_parser(self) -> None:\n if TreeSitterParser is None or TreeSitterLanguage is None:\n return\n\n try:\n \n if self.language_id == \"python\":\n import tree_sitter_python\n self._language = TreeSitterLanguage(tree_sitter_python.language())\n elif self.language_id == \"javascript\":\n import tree_sitter_javascript\n self._language = TreeSitterLanguage(tree_sitter_javascript.language())\n elif self.language_id == \"typescript\":\n import tree_sitter_typescript\n \n if self.path is not None and self.path.suffix.lower() == \".tsx\":\n self._language = TreeSitterLanguage(tree_sitter_typescript.language_tsx())\n else:\n self._language = TreeSitterLanguage(tree_sitter_typescript.language_typescript())\n else:\n return\n\n \n self._parser = TreeSitterParser()\n if hasattr(self._parser, \"set_language\"):\n self._parser.set_language(self._language) \n else:\n self._parser.language = self._language \n\n except Exception:\n \n self._parser = None\n self._language = None\n\n def is_available(self) -> bool:\n return self._parser is not None and self._language is not None\n\n def _parse_tree(self, text: str) -> Optional[tuple[bytes, TreeSitterNode]]:\n if not 
self.is_available() or self._parser is None:\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\watcher\\incremental_indexer.py", - "score": 0.029568673189485736, - "excerpt": "\nimport logging\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import List, Optional", - "content": "\"\"\"Incremental indexer for processing file changes.\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import List, Optional\n\nfrom codexlens.config import Config\nfrom codexlens.parsers.factory import ParserFactory\nfrom codexlens.storage.dir_index import DirIndexStore\nfrom codexlens.storage.global_index import GlobalSymbolIndex\nfrom codexlens.storage.path_mapper import PathMapper\nfrom codexlens.storage.registry import RegistryStore\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py", - "score": 0.029334400167733504, - "excerpt": "\nfrom collections import defaultdict\nfrom dataclasses import dataclass\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Literal\n", - "content": "\nUse cases:\n- Prioritize commonly called methods/functions\n- Filter out one-off results that may be less relevant\n- Deduplicate results pointing to the same symbol from different locations\n\"\"\"\n\nfrom __future__ import annotations\n\nfrom collections import defaultdict\nfrom dataclasses import dataclass\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Literal\n\nfrom .base import BaseClusteringStrategy, ClusteringConfig\n\nif TYPE_CHECKING:\n import numpy as np\n from codexlens.entities import SearchResult\n\n\n@dataclass", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\association_tree\\data_structures.py", - "score": 0.027925539288870704, - "excerpt": "\n def __len__(self) -> int:\n return len(self.all_nodes)\n\n def __repr__(self) -> str:\n return (\n f\"CallTree(roots={len(self.roots)}, nodes={len(self.all_nodes)}, \"\n ...", - "content": "\n def __len__(self) -> int:\n return len(self.all_nodes)\n\n def __repr__(self) -> str:\n return (\n f\"CallTree(roots={len(self.roots)}, nodes={len(self.all_nodes)}, \"\n f\"depth={self.depth_reached})\"\n )\n\n\n@dataclass\nclass UniqueNode:\n\n file_path: str\n name: str\n kind: str\n range: Range\n min_depth: int = 0\n occurrences: int = 1\n paths: List[List[str]] = field(default_factory=list)\n context_nodes: List[str] = field(default_factory=list)\n score: float = 0.0\n\n @property\n def node_key(self) -> tuple[str, int, int]:\n return (\n self.file_path,\n self.range.start_line,\n self.range.end_line,\n )\n\n def add_path(self, path: List[str]) -> None:\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py", - "score": 0.024369821963687643, - "excerpt": "def _get_configured_index_root() -> Path:\n \"\"\"Get the index root from environment or config file.\n", - "content": "def _get_configured_index_root() -> Path:\n \"\"\"Get the index root from environment or config file.\n\n Priority order:\n 1. CODEXLENS_INDEX_DIR environment variable\n 2. index_dir from ~/.codexlens/config.json\n 3. 
Default: ~/.codexlens/indexes\n \"\"\"\n env_override = os.getenv(\"CODEXLENS_INDEX_DIR\")\n if env_override:\n return Path(env_override).expanduser().resolve()\n\n config_file = Path.home() / \".codexlens\" / \"config.json\"\n if config_file.exists():\n try:\n cfg = json.loads(config_file.read_text(encoding=\"utf-8\"))\n if \"index_dir\" in cfg:\n return Path(cfg[\"index_dir\"]).expanduser().resolve()\n except (json.JSONDecodeError, OSError):\n pass\n\n return Path.home() / \".codexlens\" / \"indexes\"", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "score": 0.023949795081967214, - "excerpt": "class VectorStore:\n \"\"\"SQLite-based vector storage with HNSW-accelerated similarity search.\n", - "content": "class VectorStore:\n \"\"\"SQLite-based vector storage with HNSW-accelerated similarity search.\n\n Performance optimizations:\n - HNSW index for O(log N) approximate nearest neighbor search\n - Embedding matrix cached in memory for batch similarity computation (fallback)\n - NumPy vectorized operations instead of Python loops (fallback)\n - Lazy content loading - only fetch full content for top-k results\n - Thread-safe cache invalidation\n - Bulk insert mode for efficient batch operations\n \"\"\"\n\n # Default embedding dimension (used when creating new index)\n DEFAULT_DIM = 768\n\n def __init__(self, db_path: str | Path) -> None:\n if not NUMPY_AVAILABLE:\n raise ImportError(\n \"Semantic search dependencies not available. \"\n \"Install with: pip install codexlens[semantic]\"\n )\n\n self.db_path = Path(db_path)\n self.db_path.parent.mkdir(parents=True, exist_ok=True)\n\n # Embedding cache for fast similarity search (fallback)\n self._cache_lock = threading.RLock()\n self._embedding_matrix: Optional[np.ndarray] = None\n self._embedding_norms: Optional[np.ndarray] = None\n self._chunk_ids: Optional[List[int]] = None\n self._cache_version: int = 0\n\n # ANN index for O(log N) search\n self._ann_index: Optional[ANNIndex] = None\n self._ann_dim: Optional[int] = None\n self._ann_write_lock = threading.Lock() # Protects ANN index modifications\n\n # Bulk insert mode tracking\n self._bulk_insert_mode: bool = False\n self._bulk_insert_ids: List[int] = []\n self._bulk_insert_embeddings: List[np.ndarray] = []\n\n self._init_schema()\n self._init_ann_index()\n\n def _init_schema(self) -> None:\n \"\"\"Initialize vector storage schema.\"\"\"\n with sqlite3.connect(self.db_path) as conn:\n # Enable memory mapping for faster reads\n conn.execute(\"PRAGMA mmap_size = 30000000000\") # 30GB limit\n conn.execute(\"\"\"\n CREATE TABLE IF NOT EXISTS semantic_chunks (\n id INTEGER PRIMARY KEY AUTOINCREMENT,\n file_path TEXT NOT NULL,\n content TEXT NOT NULL,\n embedding BLOB NOT NULL,\n metadata TEXT,\n category TEXT DEFAULT 'code',\n created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP\n )\n \"\"\")\n conn.execute(\"\"\"\n CREATE INDEX IF NOT EXISTS idx_chunks_file\n ON semantic_chunks(file_path)\n \"\"\")\n conn.execute(\"\"\"\n CREATE INDEX IF NOT EXISTS idx_chunks_category\n ON semantic_chunks(category)\n \"\"\")\n # Model configuration table - tracks which model generated the embeddings\n conn.execute(\"\"\"\n CREATE TABLE IF NOT EXISTS embeddings_config (\n id INTEGER PRIMARY KEY CHECK (id = 1),\n model_profile TEXT NOT NULL,\n model_name TEXT NOT NULL,\n embedding_dim INTEGER NOT NULL,\n backend TEXT NOT NULL DEFAULT 'fastembed',\n created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,\n updated_at TIMESTAMP DEFAULT 
CURRENT_TIMESTAMP\n )\n \"\"\")\n\n # Migration: Add backend column to existing tables\n self._migrate_backend_column(conn)\n # Migration: Add category column\n self._migrate_category_column(conn)\n\n conn.commit()\n\n def _migrate_backend_column(self, conn: sqlite3.Connection) -> None:\n \"\"\"Add backend column to existing embeddings_config table if not present.\n\n Args:\n conn: Active SQLite connection\n \"\"\"\n # Check if backend column exists\n cursor = conn.execute(\"PRAGMA table_info(embeddings_config)\")\n columns = [row[1] for row in cursor.fetchall()]\n\n if 'backend' not in columns:\n logger.info(\"Migrating embeddings_config table: adding backend column\")\n conn.execute(\"\"\"\n ALTER TABLE embeddings_config\n ADD COLUMN backend TEXT NOT NULL DEFAULT 'fastembed'\n \"\"\")\n\n def _migrate_category_column(self, conn: sqlite3.Connection) -> None:\n \"\"\"Add category column to existing semantic_chunks table if not present.\n\n Args:\n conn: Active SQLite connection\n \"\"\"\n # Check if category column exists\n cursor = conn.execute(\"PRAGMA table_info(semantic_chunks)\")\n columns = [row[1] for row in cursor.fetchall()]\n\n if 'category' not in columns:\n logger.info(\"Migrating semantic_chunks table: adding category column\")\n conn.execute(\"\"\"\n ALTER TABLE semantic_chunks\n ADD COLUMN category TEXT DEFAULT 'code'\n \"\"\")\n # Create index for fast category filtering\n conn.execute(\"\"\"\n CREATE INDEX IF NOT EXISTS idx_chunks_category\n ON semantic_chunks(category)\n \"\"\")\n\n def _init_ann_index(self) -> None:\n \"\"\"Initialize ANN index (lazy loading from existing data).\"\"\"\n if not HNSWLIB_AVAILABLE:\n logger.debug(\"hnswlib not available, using brute-force search\")\n return\n\n # Try to detect embedding dimension from existing data\n dim = self._detect_embedding_dim()\n if dim is None:\n # No data yet, will initialize on first add\n logger.debug(\"No embeddings found, ANN index will be created on first add\")\n return\n\n self._ann_dim = dim\n\n try:\n self._ann_index = ANNIndex(self.db_path, dim)\n if self._ann_index.load():\n logger.debug(\n \"Loaded ANN index with %d vectors\", self._ann_index.count()\n )\n else:\n # Index file doesn't exist, try to build from SQLite data\n logger.debug(\"ANN index file not found, rebuilding from SQLite\")\n self._rebuild_ann_index_internal()\n except Exception as e:\n logger.warning(\"Failed to initialize ANN index: %s\", e)\n self._ann_index = None\n\n def _detect_embedding_dim(self) -> Optional[int]:\n \"\"\"Detect embedding dimension from existing data.\"\"\"\n with sqlite3.connect(self.db_path) as conn:\n row = conn.execute(\n \"SELECT embedding FROM semantic_chunks LIMIT 1\"\n ).fetchone()\n if row and row[0]:\n # Embedding is stored as float32 blob\n blob = row[0]\n return len(blob) // np.dtype(np.float32).itemsize\n return None\n\n @property\n def dimension(self) -> Optional[int]:\n \"\"\"Return the dimension of embeddings in the store.\n\n Returns:\n Embedding dimension if available, None if store is empty.\n \"\"\"\n if self._ann_dim is not None:\n return self._ann_dim\n self._ann_dim = self._detect_embedding_dim()\n return self._ann_dim\n\n def _rebuild_ann_index_internal(self) -> int:\n \"\"\"Internal method to rebuild ANN index from SQLite data.\"\"\"\n if self._ann_index is None:\n return 0\n\n with sqlite3.connect(self.db_path) as conn:\n conn.execute(\"PRAGMA mmap_size = 30000000000\")\n rows = conn.execute(\n \"SELECT id, embedding FROM semantic_chunks\"\n ).fetchall()\n\n if not rows:\n return 0\n\n # 
Extract IDs and embeddings\n ids = [r[0] for r in rows]\n embeddings = np.vstack([\n np.frombuffer(r[1], dtype=np.float32) for r in rows\n ])\n\n # Add to ANN index\n self._ann_index.add_vectors(ids, embeddings)\n self._ann_index.save()\n\n logger.info(\"Rebuilt ANN index with %d vectors\", len(ids))\n return len(ids)\n\n def rebuild_ann_index(self) -> int:\n \"\"\"Rebuild HNSW index from all chunks in SQLite.\n\n Use this method to:\n - Migrate existing data to use ANN search\n - Repair corrupted index\n - Reclaim space after many deletions\n\n Returns:\n Number of vectors indexed.\n \"\"\"\n if not HNSWLIB_AVAILABLE:\n logger.warning(\"hnswlib not available, cannot rebuild ANN index\")\n return 0\n\n # Detect dimension\n dim = self._detect_embedding_dim()\n if dim is None:\n logger.warning(\"No embeddings found, cannot rebuild ANN index\")\n return 0\n\n self._ann_dim = dim\n\n # Create new index\n try:\n self._ann_index = ANNIndex(self.db_path, dim)\n return self._rebuild_ann_index_internal()\n except Exception as e:\n logger.error(\"Failed to rebuild ANN index: %s\", e)\n self._ann_index = None\n return 0\n\n def _invalidate_cache(self) -> None:\n \"\"\"Invalidate the embedding cache (thread-safe).\"\"\"\n with self._cache_lock:\n self._embedding_matrix = None\n self._embedding_norms = None\n self._chunk_ids = None\n self._cache_version += 1\n\n def _refresh_cache(self) -> bool:\n \"\"\"Load embeddings into numpy matrix for fast similarity search.\n\n Returns:\n True if cache was refreshed successfully, False if no data.\n \"\"\"\n with self._cache_lock:\n with sqlite3.connect(self.db_path) as conn:\n conn.execute(\"PRAGMA mmap_size = 30000000000\")\n rows = conn.execute(\n \"SELECT id, embedding FROM semantic_chunks\"\n ).fetchall()\n\n if not rows:\n self._embedding_matrix = None\n self._embedding_norms = None\n self._chunk_ids = None\n return False\n\n # Extract IDs and embeddings\n self._chunk_ids = [r[0] for r in rows]\n\n # Bulk convert binary blobs to numpy matrix\n embeddings = [\n np.frombuffer(r[1], dtype=np.float32) for r in rows\n ]\n self._embedding_matrix = np.vstack(embeddings)\n\n # Pre-compute norms for faster similarity calculation\n self._embedding_norms = np.linalg.norm(\n self._embedding_matrix, axis=1, keepdims=True\n )\n # Avoid division by zero\n self._embedding_norms = np.where(\n self._embedding_norms == 0, EPSILON, self._embedding_norms\n )\n\n return True\n\n def _ensure_ann_index(self, dim: int) -> bool:\n \"\"\"Ensure ANN index is initialized with correct dimension.\n\n This method is thread-safe and uses double-checked locking.\n\n Args:\n dim: Embedding dimension\n\n Returns:\n True if ANN index is ready, False otherwise\n \"\"\"\n if not HNSWLIB_AVAILABLE:\n return False\n\n # Fast path: index already initialized (no lock needed)\n if self._ann_index is not None:\n return True\n\n # Slow path: acquire lock for initialization\n with self._ann_write_lock:\n # Double-check after acquiring lock\n if self._ann_index is not None:\n return True\n\n try:\n self._ann_dim = dim\n self._ann_index = ANNIndex(self.db_path, dim)\n self._ann_index.load() # Try to load existing\n return True\n except Exception as e:\n logger.warning(\"Failed to initialize ANN index: %s\", e)\n self._ann_index = None\n return False\n\n def add_chunk(\n self, chunk: SemanticChunk, file_path: str, category: str = \"code\"\n ) -> int:\n \"\"\"Add a single chunk with its embedding.\n\n Args:\n chunk: SemanticChunk with embedding\n file_path: Path to the source file\n category: File category 
('code' or 'doc'), default 'code'\n\n Returns:\n The inserted chunk ID.\n \"\"\"\n if chunk.embedding is None:\n raise ValueError(\"Chunk must have embedding before adding to store\")\n\n embedding_arr = np.array(chunk.embedding, dtype=np.float32)\n embedding_blob = embedding_arr.tobytes()\n metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None\n\n with sqlite3.connect(self.db_path) as conn:\n cursor = conn.execute(\n \"\"\"\n INSERT INTO semantic_chunks (file_path, content, embedding, metadata, category)\n VALUES (?, ?, ?, ?, ?)\n \"\"\",\n (file_path, chunk.content, embedding_blob, metadata_json, category)\n )\n conn.commit()\n chunk_id = cursor.lastrowid or 0\n\n # Add to ANN index\n if self._ensure_ann_index(len(chunk.embedding)):\n with self._ann_write_lock:\n try:\n self._ann_index.add_vectors([chunk_id], embedding_arr.reshape(1, -1))\n self._ann_index.save()\n except Exception as e:\n logger.warning(\"Failed to add to ANN index: %s\", e)\n\n # Invalidate cache after modification\n self._invalidate_cache()\n return chunk_id\n\n def add_chunks(\n self, chunks: List[SemanticChunk], file_path: str, category: str = \"code\"\n ) -> List[int]:\n \"\"\"Add multiple chunks with embeddings (batch insert).\n\n Args:\n chunks: List of SemanticChunk objects with embeddings\n file_path: Path to the source file\n category: File category ('code' or 'doc'), default 'code'\n\n Returns:\n List of inserted chunk IDs.\n \"\"\"\n if not chunks:\n return []\n\n # Prepare batch data\n batch_data = []\n embeddings_list = []\n for chunk in chunks:\n if chunk.embedding is None:\n raise ValueError(\"All chunks must have embeddings\")\n embedding_arr = np.array(chunk.embedding, dtype=np.float32)\n embedding_blob = embedding_arr.tobytes()\n metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None\n batch_data.append((file_path, chunk.content, embedding_blob, metadata_json, category))\n embeddings_list.append(embedding_arr)\n\n # Batch insert to SQLite\n with sqlite3.connect(self.db_path) as conn:\n # Get starting ID before insert\n row = conn.execute(\"SELECT MAX(id) FROM semantic_chunks\").fetchone()\n start_id = (row[0] or 0) + 1\n\n conn.executemany(\n \"\"\"\n INSERT INTO semantic_chunks (file_path, content, embedding, metadata, category)\n VALUES (?, ?, ?, ?, ?)\n \"\"\",\n batch_data\n )\n conn.commit()\n # Calculate inserted IDs based on starting ID\n ids = list(range(start_id, start_id + len(chunks)))\n\n # Add to ANN index\n if embeddings_list and self._ensure_ann_index(len(embeddings_list[0])):\n with self._ann_write_lock:\n try:\n embeddings_matrix = np.vstack(embeddings_list)\n self._ann_index.add_vectors(ids, embeddings_matrix)\n self._ann_index.save()\n except Exception as e:\n logger.warning(\"Failed to add batch to ANN index: %s\", e)\n\n # Invalidate cache after modification\n self._invalidate_cache()\n return ids\n\n def add_chunks_batch(\n self,\n chunks_with_paths: List[Tuple[SemanticChunk, str]],\n update_ann: bool = True,\n auto_save_ann: bool = True,\n categories: Optional[List[str]] = None,\n ) -> List[int]:\n \"\"\"Batch insert chunks from multiple files in a single transaction.\n\n This method is optimized for bulk operations during index generation.\n\n Args:\n chunks_with_paths: List of (chunk, file_path) tuples\n update_ann: If True, update ANN index with new vectors (default: True)\n auto_save_ann: If True, save ANN index after update (default: True).\n Set to False for bulk inserts to reduce I/O overhead.\n categories: Optional list of categories per 
chunk. If None, defaults to 'code'.\n If provided, must match length of chunks_with_paths.\n\n Returns:\n List of inserted chunk IDs\n \"\"\"\n if not chunks_with_paths:\n return []\n\n batch_size = len(chunks_with_paths)\n\n # Validate categories if provided\n if categories is not None and len(categories) != batch_size:\n raise ValueError(\n f\"categories length ({len(categories)}) must match \"\n f\"chunks_with_paths length ({batch_size})\"\n )\n\n # Prepare batch data\n batch_data = []\n embeddings_list = []\n for i, (chunk, file_path) in enumerate(chunks_with_paths):\n if chunk.embedding is None:\n raise ValueError(\"All chunks must have embeddings\")\n # Optimize: avoid repeated np.array() if already numpy\n if isinstance(chunk.embedding, np.ndarray):\n embedding_arr = chunk.embedding.astype(np.float32)\n else:\n embedding_arr = np.array(chunk.embedding, dtype=np.float32)\n embedding_blob = embedding_arr.tobytes()\n metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None\n category = categories[i] if categories else \"code\"\n batch_data.append((file_path, chunk.content, embedding_blob, metadata_json, category))\n embeddings_list.append(embedding_arr)\n\n # Batch insert to SQLite in single transaction\n with sqlite3.connect(self.db_path) as conn:\n # Get starting ID before insert\n row = conn.execute(\"SELECT MAX(id) FROM semantic_chunks\").fetchone()\n start_id = (row[0] or 0) + 1\n\n _validate_chunk_id_range(start_id, batch_size)\n\n conn.executemany(\n \"\"\"\n INSERT INTO semantic_chunks (file_path, content, embedding, metadata, category)\n VALUES (?, ?, ?, ?, ?)\n \"\"\",\n batch_data\n )\n conn.commit()\n # Calculate inserted IDs based on starting ID\n ids = list(range(start_id, start_id + batch_size))\n\n # Handle ANN index updates\n if embeddings_list and update_ann and self._ensure_ann_index(len(embeddings_list[0])):\n with self._ann_write_lock:\n # In bulk insert mode, accumulate for later batch update\n if self._bulk_insert_mode:\n self._bulk_insert_ids.extend(ids)\n self._bulk_insert_embeddings.extend(embeddings_list)\n else:\n # Normal mode: update immediately\n try:\n embeddings_matrix = np.vstack(embeddings_list)\n self._ann_index.add_vectors(ids, embeddings_matrix)\n if auto_save_ann:\n self._ann_index.save()\n except Exception as e:\n logger.warning(\"Failed to add batch to ANN index: %s\", e)\n\n # Invalidate cache after modification\n self._invalidate_cache()\n return ids\n\n def add_chunks_batch_numpy(\n self,\n chunks_with_paths: List[Tuple[SemanticChunk, str]],\n embeddings_matrix: np.ndarray,\n update_ann: bool = True,\n auto_save_ann: bool = True,\n categories: Optional[List[str]] = None,\n ) -> List[int]:\n \"\"\"Batch insert chunks with pre-computed numpy embeddings matrix.\n\n This method accepts embeddings as a numpy matrix to avoid list->array conversions.\n Useful when embeddings are already in numpy format from batch encoding.\n\n Args:\n chunks_with_paths: List of (chunk, file_path) tuples (embeddings can be None)\n embeddings_matrix: Pre-computed embeddings as (N, D) numpy array\n update_ann: If True, update ANN index with new vectors (default: True)\n auto_save_ann: If True, save ANN index after update (default: True)\n categories: Optional list of categories per chunk. 
If None, defaults to 'code'.\n\n Returns:\n List of inserted chunk IDs\n \"\"\"\n if not chunks_with_paths:\n return []\n\n batch_size = len(chunks_with_paths)\n\n if len(chunks_with_paths) != embeddings_matrix.shape[0]:\n raise ValueError(\n f\"Mismatch: {len(chunks_with_paths)} chunks but \"\n f\"{embeddings_matrix.shape[0]} embeddings\"\n )\n\n # Validate categories if provided\n if categories is not None and len(categories) != batch_size:\n raise ValueError(\n f\"categories length ({len(categories)}) must match \"\n f\"chunks_with_paths length ({batch_size})\"\n )\n\n # Ensure float32 format\n embeddings_matrix = embeddings_matrix.astype(np.float32)\n\n # Prepare batch data\n batch_data = []\n for i, (chunk, file_path) in enumerate(chunks_with_paths):\n embedding_arr = embeddings_matrix[i]\n embedding_blob = embedding_arr.tobytes()\n metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None\n category = categories[i] if categories else \"code\"\n batch_data.append((file_path, chunk.content, embedding_blob, metadata_json, category))\n\n # Batch insert to SQLite in single transaction\n with sqlite3.connect(self.db_path) as conn:\n # Get starting ID before insert\n row = conn.execute(\"SELECT MAX(id) FROM semantic_chunks\").fetchone()\n start_id = (row[0] or 0) + 1\n\n _validate_chunk_id_range(start_id, batch_size)\n\n conn.executemany(\n \"\"\"\n INSERT INTO semantic_chunks (file_path, content, embedding, metadata, category)\n VALUES (?, ?, ?, ?, ?)\n \"\"\",\n batch_data\n )\n conn.commit()\n # Calculate inserted IDs based on starting ID\n ids = list(range(start_id, start_id + batch_size))\n\n # Handle ANN index updates\n if update_ann and self._ensure_ann_index(embeddings_matrix.shape[1]):\n with self._ann_write_lock:\n # In bulk insert mode, accumulate for later batch update\n if self._bulk_insert_mode:\n self._bulk_insert_ids.extend(ids)\n # Split matrix into individual arrays for accumulation\n self._bulk_insert_embeddings.extend([embeddings_matrix[i] for i in range(len(ids))])\n else:\n # Normal mode: update immediately\n try:\n self._ann_index.add_vectors(ids, embeddings_matrix)\n if auto_save_ann:\n self._ann_index.save()\n except Exception as e:\n logger.warning(\"Failed to add batch to ANN index: %s\", e)\n\n # Invalidate cache after modification\n self._invalidate_cache()\n return ids\n\n def begin_bulk_insert(self) -> None:\n \"\"\"Begin bulk insert mode - disable ANN auto-update for better performance.\n\n Usage:\n store.begin_bulk_insert()\n try:\n for batch in batches:\n store.add_chunks_batch(batch, auto_save_ann=False)\n finally:\n store.end_bulk_insert()\n\n Or use context manager:\n with store.bulk_insert():\n for batch in batches:\n store.add_chunks_batch(batch)\n \"\"\"\n with self._ann_write_lock:\n self._bulk_insert_mode = True\n self._bulk_insert_ids.clear()\n self._bulk_insert_embeddings.clear()\n logger.debug(\"Entered bulk insert mode\")\n\n def end_bulk_insert(self) -> None:\n \"\"\"End bulk insert mode and rebuild ANN index from accumulated data.\n\n This method should be called after all bulk inserts are complete to\n update the ANN index in a single batch operation.\n \"\"\"\n with self._ann_write_lock:\n if not self._bulk_insert_mode:\n logger.warning(\"end_bulk_insert called but not in bulk insert mode\")\n return\n\n self._bulk_insert_mode = False\n bulk_ids = list(self._bulk_insert_ids)\n bulk_embeddings = list(self._bulk_insert_embeddings)\n self._bulk_insert_ids.clear()\n self._bulk_insert_embeddings.clear()\n\n # Update ANN index with 
accumulated data.\n if bulk_ids and bulk_embeddings:\n if self._ensure_ann_index(len(bulk_embeddings[0])):\n with self._ann_write_lock:\n try:\n embeddings_matrix = np.vstack(bulk_embeddings)\n self._ann_index.add_vectors(bulk_ids, embeddings_matrix)\n self._ann_index.save()\n logger.info(\n \"Bulk insert complete: added %d vectors to ANN index\",\n len(bulk_ids),\n )\n except Exception as e:\n logger.error(\"Failed to update ANN index after bulk insert: %s\", e)\n\n logger.debug(\"Exited bulk insert mode\")\n\n class BulkInsertContext:\n \"\"\"Context manager for bulk insert operations.\"\"\"\n\n def __init__(self, store: \"VectorStore\") -> None:\n self.store = store\n\n def __enter__(self) -> \"VectorStore\":\n self.store.begin_bulk_insert()\n return self.store\n\n def __exit__(self, exc_type, exc_val, exc_tb) -> None:\n self.store.end_bulk_insert()\n\n def bulk_insert(self) -> \"VectorStore.BulkInsertContext\":\n \"\"\"Return a context manager for bulk insert operations.\n\n Usage:\n with store.bulk_insert():\n for batch in batches:\n store.add_chunks_batch(batch)\n \"\"\"\n return self.BulkInsertContext(self)\n\n def delete_file_chunks(self, file_path: str) -> int:\n \"\"\"Delete all chunks for a file.\n\n Returns:\n Number of deleted chunks.\n \"\"\"\n # Get chunk IDs before deletion (for ANN index)\n chunk_ids_to_delete = []\n if self._ann_index is not None:\n with sqlite3.connect(self.db_path) as conn:\n rows = conn.execute(\n \"SELECT id FROM semantic_chunks WHERE file_path = ?\",\n (file_path,)\n ).fetchall()\n chunk_ids_to_delete = [r[0] for r in rows]\n\n # Delete from SQLite\n with sqlite3.connect(self.db_path) as conn:\n cursor = conn.execute(\n \"DELETE FROM semantic_chunks WHERE file_path = ?\",\n (file_path,)\n )\n conn.commit()\n deleted = cursor.rowcount\n\n # Remove from ANN index\n if deleted > 0 and self._ann_index is not None and chunk_ids_to_delete:\n with self._ann_write_lock:\n try:\n self._ann_index.remove_vectors(chunk_ids_to_delete)\n self._ann_index.save()\n except Exception as e:\n logger.warning(\"Failed to remove from ANN index: %s\", e)\n\n if deleted > 0:\n self._invalidate_cache()\n return deleted\n\n def search_similar(\n self,\n query_embedding: List[float],\n top_k: int = 10,\n min_score: float = 0.0,\n return_full_content: bool = True,\n category: Optional[str] = None,\n ) -> List[SearchResult]:\n \"\"\"Find chunks most similar to query embedding.\n\n Uses HNSW index for O(log N) search when available, falls back to\n brute-force NumPy search otherwise.\n\n Args:\n query_embedding: Query vector.\n top_k: Maximum results to return.\n min_score: Minimum cosine similarity score in [0.0, 1.0].\n return_full_content: If True, return full code block content.\n category: Optional category filter ('code' or 'doc'). If None, returns all.\n\n Returns:\n List of SearchResult ordered by similarity (highest first).\n \"\"\"\n query_vec = np.array(query_embedding, dtype=np.float32)\n\n if not 0.0 <= min_score <= 1.0:\n raise ValueError(\n f\"Invalid min_score: {min_score}. 
Must be within [0.0, 1.0] for cosine similarity.\"\n )\n\n # Try HNSW search first (O(log N))\n if (\n HNSWLIB_AVAILABLE\n and self._ann_index is not None\n and self._ann_index.is_loaded\n and self._ann_index.count() > 0\n ):\n try:\n return self._search_with_ann(\n query_vec, top_k, min_score, return_full_content, category\n )\n except Exception as e:\n logger.warning(\"ANN search failed, falling back to brute-force: %s\", e)\n\n # Fallback to brute-force search (O(N))\n return self._search_brute_force(\n query_vec, top_k, min_score, return_full_content, category\n )\n\n def _search_with_ann(\n self,\n query_vec: np.ndarray,\n top_k: int,\n min_score: float,\n return_full_content: bool,\n category: Optional[str] = None,\n ) -> List[SearchResult]:\n \"\"\"Search using HNSW index (O(log N)).\n\n Args:\n query_vec: Query vector as numpy array\n top_k: Maximum results to return\n min_score: Minimum cosine similarity score in [0.0, 1.0]\n return_full_content: If True, return full code block content\n category: Optional category filter ('code' or 'doc')\n\n Returns:\n List of SearchResult ordered by similarity (highest first)\n \"\"\"\n # Limit top_k to available vectors to prevent hnswlib error\n ann_count = self._ann_index.count()\n # When category filtering, fetch more candidates to compensate for filtering\n fetch_k = top_k * 3 if category else top_k\n effective_top_k = min(fetch_k, ann_count) if ann_count > 0 else 0\n\n if effective_top_k == 0:\n return []\n\n # HNSW search returns (ids, distances)\n # For cosine space: distance = 1 - similarity\n ids, distances = self._ann_index.search(query_vec, effective_top_k)\n\n if ids is None or distances is None:\n logger.debug(\n \"ANN search returned null results (ids=%s, distances=%s)\",\n ids,\n distances,\n )\n return []\n\n if len(ids) == 0 or len(distances) == 0:\n logger.debug(\n \"ANN search returned empty results (ids=%s, distances=%s)\",\n ids,\n distances,\n )\n return []\n\n if len(ids) != len(distances):\n logger.warning(\n \"ANN search returned mismatched result lengths (%d ids, %d distances)\",\n len(ids),\n len(distances),\n )\n return []\n\n # Convert distances to similarity scores\n scores = [1.0 - d for d in distances]\n\n # Filter by min_score\n filtered = [\n (chunk_id, score)\n for chunk_id, score in zip(ids, scores)\n if score >= min_score\n ]\n\n if not filtered:\n return []\n\n top_ids = [f[0] for f in filtered]\n top_scores = [f[1] for f in filtered]\n\n # Fetch content from SQLite with category filtering\n results = self._fetch_results_by_ids(\n top_ids, top_scores, return_full_content, category\n )\n # Apply final limit after category filtering\n return results[:top_k]\n\n def _search_brute_force(\n self,\n query_vec: np.ndarray,\n top_k: int,\n min_score: float,\n return_full_content: bool,\n category: Optional[str] = None,\n ) -> List[SearchResult]:\n \"\"\"Brute-force search using NumPy (O(N) fallback).\n\n Args:\n query_vec: Query vector as numpy array\n top_k: Maximum results to return\n min_score: Minimum cosine similarity score in [0.0, 1.0]\n return_full_content: If True, return full code block content\n category: Optional category filter ('code' or 'doc')\n\n Returns:\n List of SearchResult ordered by similarity (highest first)\n \"\"\"\n logger.warning(\n \"Using brute-force vector search (hnswlib not available). \"\n \"This may cause high memory usage for large indexes. 
\"\n \"Install hnswlib for better performance: pip install hnswlib\"\n )\n\n with self._cache_lock:\n # Refresh cache if needed\n if self._embedding_matrix is None:\n if not self._refresh_cache():\n return [] # No data\n\n # Vectorized cosine similarity\n query_vec = query_vec.reshape(1, -1)\n query_norm = np.linalg.norm(query_vec)\n if query_norm == 0:\n return []\n\n # Compute all similarities at once: (N,) scores\n # similarity = (A @ B.T) / (||A|| * ||B||)\n dot_products = np.dot(self._embedding_matrix, query_vec.T).flatten()\n scores = dot_products / (self._embedding_norms.flatten() * query_norm)\n\n # Filter by min_score and get top-k indices\n valid_mask = scores >= min_score\n valid_indices = np.where(valid_mask)[0]\n\n if len(valid_indices) == 0:\n return []\n\n # When category filtering, fetch more candidates to compensate for filtering\n fetch_k = top_k * 3 if category else top_k\n\n # Sort by score descending and take top candidates\n valid_scores = scores[valid_indices]\n sorted_order = np.argsort(valid_scores)[::-1][:fetch_k]\n top_indices = valid_indices[sorted_order]\n top_scores = valid_scores[sorted_order]\n\n # Get chunk IDs for top results\n top_ids = [self._chunk_ids[i] for i in top_indices]\n\n # Fetch content only for top-k results (lazy loading) with category filtering\n results = self._fetch_results_by_ids(\n top_ids, top_scores.tolist(), return_full_content, category\n )\n # Apply final limit after category filtering\n return results[:top_k]\n\n def _fetch_results_by_ids(\n self,\n chunk_ids: List[int],\n scores: List[float],\n return_full_content: bool,\n category: Optional[str] = None,\n ) -> List[SearchResult]:\n \"\"\"Fetch full result data for specific chunk IDs.\n\n Args:\n chunk_ids: List of chunk IDs to fetch.\n scores: Corresponding similarity scores.\n return_full_content: Whether to include full content.\n category: Optional category filter ('code' or 'doc').\n\n Returns:\n List of SearchResult objects.\n \"\"\"\n if not chunk_ids:\n return []\n\n # Build parameterized query for IN clause\n placeholders = \",\".join(\"?\" * len(chunk_ids))\n _validate_sql_placeholders(placeholders, len(chunk_ids))\n\n # SQL injection prevention:\n # - Only a validated placeholders string (commas + '?') is interpolated into the query.\n # - User-provided values are passed separately via sqlite3 parameters.\n # - Category filter is added as a separate parameter\n if category:\n query = \"\"\"\n SELECT id, file_path, content, metadata\n FROM semantic_chunks\n WHERE id IN ({placeholders}) AND category = ?\n \"\"\".format(placeholders=placeholders)\n params = list(chunk_ids) + [category]\n else:\n query = \"\"\"\n SELECT id, file_path, content, metadata\n FROM semantic_chunks\n WHERE id IN ({placeholders})\n \"\"\".format(placeholders=placeholders)\n params = chunk_ids\n\n with sqlite3.connect(self.db_path) as conn:\n conn.execute(\"PRAGMA mmap_size = 30000000000\")\n rows = conn.execute(query, params).fetchall()\n\n # Build ID -> row mapping\n id_to_row = {r[0]: r for r in rows}\n\n results = []\n for chunk_id, score in zip(chunk_ids, scores):\n row = id_to_row.get(chunk_id)\n if not row:\n continue\n\n _, file_path, content, metadata_json = row\n metadata = json.loads(metadata_json) if metadata_json else {}\n\n # Build excerpt (short preview)\n excerpt = content[:200] + \"...\" if len(content) > 200 else content\n\n # Extract symbol information from metadata\n symbol_name = metadata.get(\"symbol_name\")\n symbol_kind = metadata.get(\"symbol_kind\")\n start_line = 
metadata.get(\"start_line\")\n end_line = metadata.get(\"end_line\")\n\n # Build Symbol object if we have symbol info\n symbol = None\n if symbol_name and symbol_kind and start_line and end_line:\n try:\n from codexlens.entities import Symbol\n symbol = Symbol(\n name=symbol_name,\n kind=symbol_kind,\n range=(start_line, end_line)\n )\n except Exception:\n pass\n\n results.append(SearchResult(\n path=file_path,\n score=score,\n excerpt=excerpt,\n content=content if return_full_content else None,\n symbol=symbol,\n metadata=metadata,\n start_line=start_line,\n end_line=end_line,\n symbol_name=symbol_name,\n symbol_kind=symbol_kind,\n ))\n\n return results\n\n def count_chunks(self) -> int:\n \"\"\"Count total chunks in store.\"\"\"\n with sqlite3.connect(self.db_path) as conn:\n row = conn.execute(\"SELECT COUNT(*) FROM semantic_chunks\").fetchone()\n return row[0] if row else 0\n\n def get_all_chunks(self) -> List[SemanticChunk]:\n \"\"\"Get all chunks from the store.\n\n Returns:\n List of SemanticChunk objects with id and content.\n \"\"\"\n with sqlite3.connect(self.db_path) as conn:\n conn.row_factory = sqlite3.Row\n rows = conn.execute(\n \"SELECT id, file_path, content, metadata FROM semantic_chunks\"\n ).fetchall()\n\n chunks = []\n for row in rows:\n chunks.append(SemanticChunk(\n id=row[\"id\"],\n content=row[\"content\"],\n file_path=row[\"file_path\"],\n metadata=json.loads(row[\"metadata\"]) if row[\"metadata\"] else None,\n ))\n return chunks\n\n def clear_cache(self) -> None:\n \"\"\"Manually clear the embedding cache.\"\"\"\n self._invalidate_cache()\n\n @property\n def ann_available(self) -> bool:\n \"\"\"Check if ANN index is available and ready.\"\"\"\n return (\n HNSWLIB_AVAILABLE\n and self._ann_index is not None\n and self._ann_index.is_loaded\n )\n\n @property\n def ann_count(self) -> int:\n \"\"\"Get number of vectors in ANN index.\"\"\"\n if self._ann_index is not None:\n return self._ann_index.count()\n return 0\n\n def get_model_config(self) -> Optional[Dict[str, Any]]:\n \"\"\"Get the model configuration used for embeddings in this store.\n\n Returns:\n Dictionary with model_profile, model_name, embedding_dim, backend, or None if not set.\n \"\"\"\n with sqlite3.connect(self.db_path) as conn:\n row = conn.execute(\n \"SELECT model_profile, model_name, embedding_dim, backend, created_at, updated_at \"\n \"FROM embeddings_config WHERE id = 1\"\n ).fetchone()\n if row:\n return {\n \"model_profile\": row[0],\n \"model_name\": row[1],\n \"embedding_dim\": row[2],\n \"backend\": row[3],\n \"created_at\": row[4],\n \"updated_at\": row[5],\n }\n return None\n\n def set_model_config(\n self, model_profile: str, model_name: str, embedding_dim: int, backend: str = 'fastembed'\n ) -> None:\n \"\"\"Set the model configuration for embeddings in this store.\n\n This should be called when generating new embeddings. 
If a different\n model was previously used, this will update the configuration.\n\n Args:\n model_profile: Model profile name (fast, code, minilm, etc.)\n model_name: Full model name (e.g., jinaai/jina-embeddings-v2-base-code)\n embedding_dim: Embedding dimension (e.g., 768)\n backend: Backend used for embeddings (fastembed or litellm, default: fastembed)\n \"\"\"\n with sqlite3.connect(self.db_path) as conn:\n conn.execute(\n \"\"\"\n INSERT INTO embeddings_config (id, model_profile, model_name, embedding_dim, backend)\n VALUES (1, ?, ?, ?, ?)\n ON CONFLICT(id) DO UPDATE SET\n model_profile = excluded.model_profile,\n model_name = excluded.model_name,\n embedding_dim = excluded.embedding_dim,\n backend = excluded.backend,\n updated_at = CURRENT_TIMESTAMP\n \"\"\",\n (model_profile, model_name, embedding_dim, backend)\n )\n conn.commit()\n\n def check_model_compatibility(\n self, model_profile: str, model_name: str, embedding_dim: int\n ) -> Tuple[bool, Optional[str]]:\n \"\"\"Check if the given model is compatible with existing embeddings.\n\n Args:\n model_profile: Model profile to check\n model_name: Model name to check\n embedding_dim: Embedding dimension to check\n\n Returns:\n Tuple of (is_compatible, warning_message).\n is_compatible is True if no existing config or configs match.\n warning_message is a user-friendly message if incompatible.\n \"\"\"\n existing = self.get_model_config()\n if existing is None:\n return True, None\n\n # Check dimension first (most critical)\n if existing[\"embedding_dim\"] != embedding_dim:\n return False, (\n f\"Dimension mismatch: existing embeddings use {existing['embedding_dim']}d \"\n f\"({existing['model_profile']}), but requested model uses {embedding_dim}d \"\n f\"({model_profile}). Use --force to regenerate all embeddings.\"\n )\n\n # Check model (different models with same dimension may have different semantic spaces)\n if existing[\"model_profile\"] != model_profile:\n return False, (\n f\"Model mismatch: existing embeddings use '{existing['model_profile']}' \"\n f\"({existing['model_name']}), but requested '{model_profile}' \"\n f\"({model_name}). Use --force to regenerate all embeddings.\"\n )\n\n return True, None\n\n def close(self) -> None:\n \"\"\"Close the vector store and release resources.\n\n This ensures SQLite connections are closed and ANN index is cleared,\n allowing temporary files to be deleted on Windows.\n \"\"\"\n with self._cache_lock:\n self._embedding_matrix = None\n self._embedding_norms = None\n self._chunk_ids = None\n\n with self._ann_write_lock:\n self._ann_index = None\n\n def __enter__(self) -> \"VectorStore\":\n \"\"\"Context manager entry.\"\"\"\n return self\n\n def __exit__(self, exc_type, exc_val, exc_tb) -> None:\n \"\"\"Context manager exit - close resources.\"\"\"\n self.close()", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py", - "score": 0.02356190140431283, - "excerpt": "\"\"\"Base classes for clustering strategies in the hybrid search pipeline.\n\nThis module defines the abstract base class for clustering strategies used", - "content": "\"\"\"Base classes for clustering strategies in the hybrid search pipeline.\n\nThis module defines the abstract base class for clustering strategies used\nin the staged hybrid search pipeline. 
Strategies cluster search results\nbased on their embeddings and select representative results from each cluster.\n\"\"\"\n\nfrom __future__ import annotations\n\nfrom abc import ABC, abstractmethod\nfrom dataclasses import dataclass, field", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py", - "score": 0.022717150737751757, - "excerpt": "\nimport re\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import Dict, List, Optional, Protocol", - "content": "\"\"\"Parser factory for CodexLens.\n\nPython and JavaScript/TypeScript parsing use Tree-Sitter grammars when\navailable. Regex fallbacks are retained to preserve the existing parser\ninterface and behavior in minimal environments.\n\"\"\"\n\nfrom __future__ import annotations\n\nimport re\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import Dict, List, Optional, Protocol\n\nfrom codexlens.config import Config\nfrom codexlens.entities import CodeRelationship, IndexedFile, RelationshipType, Symbol\nfrom codexlens.parsers.treesitter_parser import TreeSitterSymbolParser\n\n\nclass Parser(Protocol):\n def parse(self, text: str, path: Path) -> IndexedFile: ...", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py", - "score": 0.022282698690396483, - "excerpt": "\"\"\"codex-lens LSP Server implementation using pygls.\n\nThis module provides the main Language Server class and entry point.\n\"\"\"\n", - "content": "\"\"\"codex-lens LSP Server implementation using pygls.\n\nThis module provides the main Language Server class and entry point.\n\"\"\"\n\nfrom __future__ import annotations\n\nimport argparse\nimport logging\nimport sys\nfrom pathlib import Path\nfrom typing import Optional\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "score": 0.022258499170812605, - "excerpt": " logger.log(level, \"[TIMING] %s: %.2fms\", name, elapsed_ms)\n\nfrom codexlens.config import Config\nfrom codexlens.config import VECTORS_HNSW_NAME\nfrom codexlens.entities import SearchResult", - "content": " logger: Logger instance to use\n level: Logging level (default DEBUG)\n \"\"\"\n start = time.perf_counter()\n try:\n yield\n finally:\n elapsed_ms = (time.perf_counter() - start) * 1000\n logger.log(level, \"[TIMING] %s: %.2fms\", name, elapsed_ms)\n\nfrom codexlens.config import Config\nfrom codexlens.config import VECTORS_HNSW_NAME\nfrom codexlens.entities import SearchResult\nfrom codexlens.search.ranking import (\n DEFAULT_WEIGHTS,\n FTS_FALLBACK_WEIGHTS,\n QueryIntent,\n apply_symbol_boost,\n cross_encoder_rerank,\n detect_query_intent,\n filter_results_by_category,", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "score": 0.022204010428648113, - "excerpt": "import threading\nimport time\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import Any, Dict, List, Optional, Tuple", - "content": "\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nimport hashlib\nimport re\nimport sqlite3\nimport threading\nimport time\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import Any, Dict, List, Optional, Tuple\n\nfrom codexlens.config import Config\nfrom codexlens.entities import CodeRelationship, SearchResult, Symbol\nfrom codexlens.errors import StorageError\nfrom 
codexlens.storage.global_index import GlobalSymbolIndex\n\n\n@dataclass", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py", - "score": 0.022191896701700627, - "excerpt": "from typing import Callable, List, Optional\n\nfrom codexlens.config import Config\nfrom codexlens.storage.path_mapper import PathMapper\nfrom codexlens.storage.registry import RegistryStore", - "content": "from __future__ import annotations\n\nimport json\nimport logging\nimport signal\nimport threading\nimport time\nfrom pathlib import Path\nfrom typing import Callable, List, Optional\n\nfrom codexlens.config import Config\nfrom codexlens.storage.path_mapper import PathMapper\nfrom codexlens.storage.registry import RegistryStore\n\nfrom .events import FileEvent, IndexResult, PendingQueueStatus, WatcherConfig, WatcherStats\nfrom .file_watcher import FileWatcher\nfrom .incremental_indexer import IncrementalIndexer\n\nlogger = logging.getLogger(__name__)\n\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py", - "score": 0.021943278996721462, - "excerpt": "\nThis module provides the semantic_search() function for combining\nvector, structural, and keyword search with configurable fusion strategies.\n\"\"\"\n", - "content": "\"\"\"Semantic search API with RRF fusion.\n\nThis module provides the semantic_search() function for combining\nvector, structural, and keyword search with configurable fusion strategies.\n\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nfrom pathlib import Path\nfrom typing import List, Optional\n\nfrom .models import SemanticResult\nfrom .utils import resolve_project", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\watcher\\file_watcher.py", - "score": 0.021943278996721462, - "excerpt": "from watchdog.events import FileSystemEventHandler\n\nfrom .events import ChangeType, FileEvent, WatcherConfig, PendingQueueStatus\nfrom ..config import Config\n", - "content": "\nimport logging\nimport threading\nimport time\nfrom pathlib import Path\nfrom typing import Callable, Dict, List, Optional\n\nfrom watchdog.observers import Observer\nfrom watchdog.events import FileSystemEventHandler\n\nfrom .events import ChangeType, FileEvent, WatcherConfig, PendingQueueStatus\nfrom ..config import Config\n\nlogger = logging.getLogger(__name__)\n\n# Maximum queue size to prevent unbounded memory growth\n# When exceeded, forces immediate flush to avoid memory exhaustion\nMAX_QUEUE_SIZE = 50000\n\n\nclass _CodexLensHandler(FileSystemEventHandler):", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\api_reranker.py", - "score": 0.02150910700179165, - "excerpt": "\n \n env_api_base = _get_env_with_fallback(\"RERANKER_API_BASE\", self._workspace_root)\n self.api_base = (api_base or env_api_base or defaults[\"api_base\"]).strip().rstrip(\"/\")\n ...", - "content": "\n \n env_api_base = _get_env_with_fallback(\"RERANKER_API_BASE\", self._workspace_root)\n self.api_base = (api_base or env_api_base or defaults[\"api_base\"]).strip().rstrip(\"/\")\n self.endpoint = defaults[\"endpoint\"]\n\n \n env_model = _get_env_with_fallback(\"RERANKER_MODEL\", self._workspace_root)\n self.model_name = (model_name or env_model or defaults[\"default_model\"]).strip()\n if not self.model_name:\n raise ValueError(\"model_name cannot be blank\")\n\n \n resolved_key = api_key 
or _get_env_with_fallback(env_api_key, self._workspace_root) or \"\"\n resolved_key = resolved_key.strip()\n if not resolved_key:\n raise ValueError(\n f\"Missing API key for reranker provider '{self.provider}'. \"\n f\"Pass api_key=... or set ${env_api_key}.\"\n )\n self._api_key = resolved_key\n\n self.timeout_s = float(timeout) if timeout and float(timeout) > 0 else 30.0\n self.max_retries = int(max_retries) if max_retries and int(max_retries) >= 0 else 3\n self.backoff_base_s = float(backoff_base_s) if backoff_base_s and float(backoff_base_s) > 0 else 0.5\n self.backoff_max_s = float(backoff_max_s) if backoff_max_s and float(backoff_max_s) > 0 else 8.0\n\n headers = {\n \"Authorization\": f\"Bearer {self._api_key}\",\n \"Content-Type\": \"application/json\",\n }\n if self.provider == \"cohere\":\n headers.setdefault(\"Cohere-Version\", \"2022-12-06\")\n\n self._client = httpx.Client(\n base_url=self.api_base,\n headers=headers,\n timeout=self.timeout_s,\n )\n\n \n if max_input_tokens is not None:\n self._max_input_tokens = max_input_tokens\n else:\n \n model_lower = self.model_name.lower()\n if '8b' in model_lower or 'large' in model_lower:\n self._max_input_tokens = 32768\n else:\n self._max_input_tokens = 8192\n\n @property\n def max_input_tokens(self) -> int:\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "score": 0.02051605801605802, - "excerpt": " locked_config = get_locked_model_config()\n\n if locked_config is None:\n return {\n \"is_locked\": False,\n \"has_conflict\": False,\n \"locked_config\": None,\n ...", - "content": " locked_config = get_locked_model_config()\n\n if locked_config is None:\n return {\n \"is_locked\": False,\n \"has_conflict\": False,\n \"locked_config\": None,\n \"target_config\": {\"backend\": target_backend, \"model\": target_model},\n }\n\n has_conflict = (\n locked_config[\"backend\"] != target_backend or\n locked_config[\"model\"] != target_model\n )\n\n return {\n \"is_locked\": True,\n \"has_conflict\": has_conflict,\n \"locked_config\": locked_config,\n \"target_config\": {\"backend\": target_backend, \"model\": target_model},\n }", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\mcp\\provider.py", - "score": 0.020229904287875303, - "excerpt": "\nfrom __future__ import annotations\n\nimport logging\nfrom pathlib import Path\nfrom typing import Optional, List, TYPE_CHECKING\n\nfrom codexlens.mcp.schema import (\n MCPContext,\n SymbolInfo,\n Re...", - "content": "\nfrom __future__ import annotations\n\nimport logging\nfrom pathlib import Path\nfrom typing import Optional, List, TYPE_CHECKING\n\nfrom codexlens.mcp.schema import (\n MCPContext,\n SymbolInfo,\n ReferenceInfo,\n RelatedSymbol,\n)\n\nif TYPE_CHECKING:\n from codexlens.storage.global_index import GlobalSymbolIndex\n from codexlens.storage.registry import RegistryStore\n from codexlens.search.chain_search import ChainSearchEngine\n\nlogger = logging.getLogger(__name__)\n\n\nclass MCPProvider:\n\n def __init__(\n self,\n global_index: \"GlobalSymbolIndex\",\n search_engine: \"ChainSearchEngine\",\n registry: \"RegistryStore\",\n ) -> None:\n self.global_index = global_index\n self.search_engine = search_engine\n self.registry = registry\n\n def build_context(\n self,\n symbol_name: str,\n context_type: str = \"symbol_explanation\",\n include_references: bool = True,\n include_related: bool = True,\n max_references: int = 10,\n ) -> 
Optional[MCPContext]:\n \n symbols = self.global_index.search(symbol_name, prefix_mode=False, limit=1)\n\n if not symbols:\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\mcp\\hooks.py", - "score": 0.020007053720837744, - "excerpt": "\nfrom __future__ import annotations\n\nimport logging\nfrom pathlib import Path\nfrom typing import Any, Dict, Optional, Callable, TYPE_CHECKING\n\nfrom codexlens.mcp.schema import MCPContext\n\nif TYPE_CHECK...", - "content": "\nfrom __future__ import annotations\n\nimport logging\nfrom pathlib import Path\nfrom typing import Any, Dict, Optional, Callable, TYPE_CHECKING\n\nfrom codexlens.mcp.schema import MCPContext\n\nif TYPE_CHECKING:\n from codexlens.mcp.provider import MCPProvider\n\nlogger = logging.getLogger(__name__)\n\n\nclass HookManager:\n\n def __init__(self, mcp_provider: \"MCPProvider\") -> None:\n self.mcp_provider = mcp_provider\n self._pre_hooks: Dict[str, Callable] = {}\n self._post_hooks: Dict[str, Callable] = {}\n\n \n self._register_default_hooks()\n\n def _register_default_hooks(self) -> None:\n self._pre_hooks[\"explain\"] = self._pre_explain_hook\n self._pre_hooks[\"refactor\"] = self._pre_refactor_hook\n self._pre_hooks[\"document\"] = self._pre_document_hook\n\n def execute_pre_hook(\n self,\n action: str,\n params: Dict[str, Any],\n ) -> Optional[MCPContext]:\n hook = self._pre_hooks.get(action)\n\n if not hook:\n logger.debug(f\"No pre-hook for action: {action}\")\n return None\n\n try:\n return hook(params)\n except Exception as e:\n logger.error(f\"Pre-hook failed for {action}: {e}\")\n return None\n\n def execute_post_hook(\n self,\n action: str,\n result: Any,\n ) -> None:\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\association_tree\\deduplicator.py", - "score": 0.019921615989390927, - "excerpt": "\nfrom __future__ import annotations\n\nimport logging\nfrom typing import Dict, List, Optional\n\nfrom .data_structures import (\n CallTree,\n TreeNode,\n UniqueNode,\n)\n\nlogger = logging.getLogger(__...", - "content": "\nfrom __future__ import annotations\n\nimport logging\nfrom typing import Dict, List, Optional\n\nfrom .data_structures import (\n CallTree,\n TreeNode,\n UniqueNode,\n)\n\nlogger = logging.getLogger(__name__)\n\n\n\nKIND_WEIGHTS: Dict[str, float] = {\n \n \"function\": 1.0,\n \"method\": 1.0,\n \"12\": 1.0, \n \"6\": 1.0, \n \n \"class\": 0.8,\n \"5\": 0.8, \n \n \"interface\": 0.7,\n \"11\": 0.7, \n \"type\": 0.6,\n \n \"constructor\": 0.9,\n \"9\": 0.9, \n \n \"variable\": 0.4,\n \"13\": 0.4, \n \"constant\": 0.5,\n \"14\": 0.5, \n \n \"unknown\": 0.3,\n}\n\n\nclass ResultDeduplicator:\n\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\factory.py", - "score": 0.01962803701934137, - "excerpt": " if not ok:\n raise ImportError(err)\n\n from .api_reranker import APIReranker\n\n _ = device \n resolved_model_name = (model_name or \"\").strip() or None\n retu...", - "content": " if not ok:\n raise ImportError(err)\n\n from .api_reranker import APIReranker\n\n _ = device \n resolved_model_name = (model_name or \"\").strip() or None\n return APIReranker(model_name=resolved_model_name, **kwargs)\n\n raise ValueError(\n f\"Unknown backend: {backend}. 
Supported backends: 'fastembed', 'onnx', 'api', 'litellm', 'legacy'\"\n )\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "score": 0.015740967294674172, - "excerpt": "import time\nfrom concurrent.futures import ProcessPoolExecutor, as_completed\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import Dict, List, Optional, Set, Tuple", - "content": "\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nimport os\nimport re\nimport sqlite3\nimport time\nfrom concurrent.futures import ProcessPoolExecutor, as_completed\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import Dict, List, Optional, Set, Tuple\n\nfrom codexlens.config import Config\nfrom codexlens.parsers.factory import ParserFactory\nfrom codexlens.storage.dir_index import DirIndexStore\nfrom codexlens.storage.global_index import GlobalSymbolIndex\nfrom codexlens.storage.path_mapper import PathMapper\nfrom codexlens.storage.registry import ProjectInfo, RegistryStore\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "score": 0.01569458021070924, - "excerpt": "\nLightweight Mode:\n The ChunkConfig supports a `skip_token_count` option for performance optimization.\n When enabled, token counting uses a fast character-based estimation (char/4)\n instead o...", - "content": "\"\"\"Code chunking strategies for semantic search.\n\nThis module provides various chunking strategies for breaking down source code\ninto semantic chunks suitable for embedding and search.\n\nLightweight Mode:\n The ChunkConfig supports a `skip_token_count` option for performance optimization.\n When enabled, token counting uses a fast character-based estimation (char/4)\n instead of expensive tiktoken encoding.\n\n Use cases for lightweight mode:\n - Large-scale indexing where speed is critical\n - Scenarios where approximate token counts are acceptable\n - Memory-constrained environments\n - Initial prototyping and development\n\n Example:", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py", - "score": 0.015496521189120809, - "excerpt": "from typing import TYPE_CHECKING, List, Optional\n\nfrom .base import BaseClusteringStrategy, ClusteringConfig\n\nif TYPE_CHECKING:", - "content": "\"\"\"No-op clustering strategy for search results.\n\nNoOpStrategy returns all results ungrouped when clustering dependencies\nare not available or clustering is disabled.\n\"\"\"\n\nfrom __future__ import annotations\n\nfrom typing import TYPE_CHECKING, List, Optional\n\nfrom .base import BaseClusteringStrategy, ClusteringConfig\n\nif TYPE_CHECKING:\n import numpy as np\n from codexlens.entities import SearchResult\n\n\nclass NoOpStrategy(BaseClusteringStrategy):\n \"\"\"No-op clustering strategy that returns all results ungrouped.\n\n This strategy is used as a final fallback when no clustering dependencies", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py", - "score": 0.014896214896214899, - "excerpt": "from typing import TYPE_CHECKING, List, Optional\n\nfrom .base import BaseClusteringStrategy, ClusteringConfig\n\nif TYPE_CHECKING:", - "content": "\"\"\"DBSCAN-based clustering strategy for search results.\n\nDBSCAN (Density-Based Spatial Clustering of Applications with Noise)\nis the 
fallback clustering strategy when HDBSCAN is not available.\n\"\"\"\n\nfrom __future__ import annotations\n\nfrom typing import TYPE_CHECKING, List, Optional\n\nfrom .base import BaseClusteringStrategy, ClusteringConfig\n\nif TYPE_CHECKING:\n import numpy as np\n from codexlens.entities import SearchResult\n\n\nclass DBSCANStrategy(BaseClusteringStrategy):\n \"\"\"DBSCAN-based clustering strategy.\n\n Uses sklearn's DBSCAN algorithm as a fallback when HDBSCAN is not available.", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\mcp\\schema.py", - "score": 0.014112903225806453, - "excerpt": " definition: Optional[str] = None\n references: List[ReferenceInfo] = field(default_factory=list)\n related_symbols: List[RelatedSymbol] = field(default_factory=list)\n metadata: dict = field...", - "content": " definition: Optional[str] = None\n references: List[ReferenceInfo] = field(default_factory=list)\n related_symbols: List[RelatedSymbol] = field(default_factory=list)\n metadata: dict = field(default_factory=dict)\n\n def to_dict(self) -> dict:\n result = {\n \"version\": self.version,\n \"context_type\": self.context_type,\n \"metadata\": self.metadata,\n }\n\n if self.symbol:\n result[\"symbol\"] = self.symbol.to_dict()\n if self.definition:\n result[\"definition\"] = self.definition\n if self.references:\n result[\"references\"] = [r.to_dict() for r in self.references]\n if self.related_symbols:\n result[\"related_symbols\"] = [s.to_dict() for s in self.related_symbols]\n\n return result\n\n def to_json(self, indent: int = 2) -> str:\n return json.dumps(self.to_dict(), indent=indent)\n\n def to_prompt_injection(self) -> str:\n parts = [\"\"]\n\n if self.symbol:\n parts.append(f\"## Symbol: {self.symbol.name}\")\n parts.append(f\"Type: {self.symbol.kind}\")\n parts.append(f\"Location: {self.symbol.file_path}:{self.symbol.line_start}\")\n\n if self.definition:\n parts.append(\"\\n## Definition\")\n parts.append(f\"```\\n{self.definition}\\n```\")\n\n if self.references:\n parts.append(f\"\\n## References ({len(self.references)} found)\")\n for ref in self.references[:5]: \n parts.append(f\"- {ref.file_path}:{ref.line} ({ref.relationship_type})\")\n parts.append(f\" ```\\n {ref.context}\\n ```\")\n\n if self.related_symbols:\n parts.append(\"\\n## Related Symbols\")\n for sym in self.related_symbols[:10]: \n parts.append(f\"- {sym.name} ({sym.relationship})\")\n\n parts.append(\"\")\n return \"\\n\".join(parts)\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", - "score": 0.013999118165784833, - "excerpt": "\nclass LspBridge:\n \n DEFAULT_BRIDGE_URL = \"http://127.0.0.1:3457\"\n DEFAULT_TIMEOUT = 30.0 \n DEFAULT_CACHE_TTL = 300 \n DEFAULT_MAX_CACHE_SIZE = 1000 \n\n def __init__(\n self,\n...", - "content": "\nclass LspBridge:\n \n DEFAULT_BRIDGE_URL = \"http://127.0.0.1:3457\"\n DEFAULT_TIMEOUT = 30.0 \n DEFAULT_CACHE_TTL = 300 \n DEFAULT_MAX_CACHE_SIZE = 1000 \n\n def __init__(\n self,\n bridge_url: str = DEFAULT_BRIDGE_URL,\n timeout: float = DEFAULT_TIMEOUT,\n cache_ttl: int = DEFAULT_CACHE_TTL,\n max_cache_size: int = DEFAULT_MAX_CACHE_SIZE,\n use_vscode_bridge: bool = False,\n workspace_root: Optional[str] = None,\n config_file: Optional[str] = None,\n ):\n self.bridge_url = bridge_url\n self.timeout = timeout\n self.cache_ttl = cache_ttl\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\watcher\\events.py", - 
"score": 0.013999118165784833, - "excerpt": "\nfrom __future__ import annotations\n\nimport time\nfrom dataclasses import dataclass, field\nfrom enum import Enum\nfrom pathlib import Path\nfrom typing import List, Optional, Set\n\n\nclass ChangeType(Enum)...", - "content": "\nfrom __future__ import annotations\n\nimport time\nfrom dataclasses import dataclass, field\nfrom enum import Enum\nfrom pathlib import Path\nfrom typing import List, Optional, Set\n\n\nclass ChangeType(Enum):\n CREATED = \"created\"\n MODIFIED = \"modified\"\n DELETED = \"deleted\"\n MOVED = \"moved\"\n\n\n@dataclass\nclass FileEvent:\n path: Path\n change_type: ChangeType\n timestamp: float\n old_path: Optional[Path] = None \n\n\n@dataclass\nclass WatcherConfig:\n debounce_ms: int = 60000 \n ignored_patterns: Set[str] = field(default_factory=lambda: {\n \n \".git\", \".svn\", \".hg\",\n \n \".venv\", \"venv\", \"env\", \"__pycache__\", \".pytest_cache\", \".mypy_cache\", \".ruff_cache\",\n \n \"node_modules\", \"bower_components\", \".npm\", \".yarn\",\n \n \"dist\", \"build\", \"out\", \"target\", \"bin\", \"obj\", \"_build\", \"coverage\", \"htmlcov\",\n \n \".idea\", \".vscode\", \".vs\", \".eclipse\",\n \n \".codexlens\",\n \n \".cache\", \".parcel-cache\", \".turbo\", \".next\", \".nuxt\",\n \n \"logs\", \"tmp\", \"temp\",\n })\n languages: Optional[List[str]] = None \n\n\n@dataclass\nclass PendingQueueStatus:\n file_count: int = 0\n files: List[str] = field(default_factory=list) \n countdown_seconds: int = 0\n last_event_time: Optional[float] = None\n\n\n@dataclass\nclass IndexResult:\n files_indexed: int = 0\n files_removed: int = 0\n symbols_added: int = 0\n symbols_removed: int = 0\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "score": 0.013902465515368743, - "excerpt": "\nfrom __future__ import annotations\n\nimport platform\nimport sqlite3\nimport threading\nimport time\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import Any, Dict, List, Optional...", - "content": "\nfrom __future__ import annotations\n\nimport platform\nimport sqlite3\nimport threading\nimport time\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import Any, Dict, List, Optional\n\nfrom codexlens.errors import StorageError\n\n\n@dataclass\nclass ProjectInfo:\n\n id: int\n source_root: Path\n index_root: Path\n created_at: float\n last_indexed: float\n total_files: int\n total_dirs: int\n status: str\n\n\n@dataclass\nclass DirMapping:\n\n id: int\n project_id: int\n source_path: Path\n index_path: Path\n depth: int\n files_count: int\n last_updated: float\n\n\nclass RegistryStore:\n\n DEFAULT_DB_PATH = Path.home() / \".codexlens\" / \"registry.db\"\n\n def __init__(self, db_path: Path | None = None) -> None:\n self.db_path = (db_path or self.DEFAULT_DB_PATH).resolve()\n self._lock = threading.RLock()\n self._local = threading.local()\n self._pool_lock = threading.Lock()\n self._pool: Dict[int, sqlite3.Connection] = {}\n self._pool_generation = 0\n\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_001_normalize_keywords.py", - "score": 0.013678451178451179, - "excerpt": " PRIMARY KEY (file_id, keyword_id),\n FOREIGN KEY (file_id) REFERENCES files (id) ON DELETE CASCADE,\n FOREIGN KEY (keyword_id) REFERENCES keywords (id) ON DELETE CASCAD...", - "content": " PRIMARY KEY (file_id, keyword_id),\n FOREIGN KEY (file_id) 
REFERENCES files (id) ON DELETE CASCADE,\n FOREIGN KEY (keyword_id) REFERENCES keywords (id) ON DELETE CASCADE\n )\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py", - "score": 0.013661202185792351, - "excerpt": " project_root: Project root directory path\n symbol_name: Name of the symbol to find references for\n symbol_kind: Optional symbol kind filter (e.g., 'function', 'class')\n in...", - "content": "def find_references(\n project_root: str,\n symbol_name: str,\n symbol_kind: Optional[str] = None,\n include_definition: bool = True,\n group_by_definition: bool = True,\n limit: int = 100,\n) -> List[GroupedReferences]:\n \"\"\"Find all reference locations for a symbol.\n\n Multi-definition case returns grouped results to resolve ambiguity.\n\n This function wraps ChainSearchEngine.search_references() and groups\n the results by definition location. Each GroupedReferences contains\n a definition and all references that point to it.\n\n Args:\n project_root: Project root directory path\n symbol_name: Name of the symbol to find references for\n symbol_kind: Optional symbol kind filter (e.g., 'function', 'class')\n include_definition: Whether to include the definition location\n in the result (default True)\n group_by_definition: Whether to group references by definition.\n If False, returns a single group with all references.\n (default True)\n limit: Maximum number of references to return (default 100)\n\n Returns:\n List of GroupedReferences. Each group contains:\n - definition: The DefinitionResult for this symbol definition\n - references: List of ReferenceResult pointing to this definition\n\n Raises:\n ValueError: If project_root does not exist or is not a directory\n\n Examples:\n >>> refs = find_references(\"/path/to/project\", \"authenticate\")\n >>> for group in refs:\n ... print(f\"Definition: {group.definition.file_path}:{group.definition.line}\")\n ... for ref in group.references:\n ... 
print(f\" Reference: {ref.file_path}:{ref.line} ({ref.relationship})\")\n\n Note:\n Reference relationship types are normalized:\n - 'calls' -> 'call'\n - 'imports' -> 'import'\n - 'inherits' -> 'inheritance'\n \"\"\"\n # Validate and resolve project root\n project_path = resolve_project(project_root)\n\n # Import here to avoid circular imports\n from codexlens.config import Config\n from codexlens.storage.registry import RegistryStore\n from codexlens.storage.path_mapper import PathMapper\n from codexlens.storage.global_index import GlobalSymbolIndex\n from codexlens.search.chain_search import ChainSearchEngine\n from codexlens.search.chain_search import ReferenceResult as RawReferenceResult\n from codexlens.entities import Symbol\n\n # Initialize infrastructure\n config = Config()\n registry = RegistryStore()\n mapper = PathMapper(config.index_dir)\n\n # Create chain search engine\n engine = ChainSearchEngine(registry, mapper, config=config)\n\n try:\n # Step 1: Find definitions for the symbol\n definitions: List[DefinitionResult] = []\n\n if include_definition or group_by_definition:\n # Search for symbol definitions\n symbols = engine.search_symbols(\n name=symbol_name,\n source_path=project_path,\n kind=symbol_kind,\n )\n\n # Convert Symbol to DefinitionResult\n for sym in symbols:\n # Only include exact name matches for definitions\n if sym.name != symbol_name:\n continue\n\n # Optionally filter by kind\n if symbol_kind and sym.kind != symbol_kind:\n continue\n\n definitions.append(DefinitionResult(\n name=sym.name,\n kind=sym.kind,\n file_path=sym.file or \"\",\n line=sym.range[0] if sym.range else 1,\n end_line=sym.range[1] if sym.range else 1,\n signature=None, # Not available from Symbol\n container=None, # Not available from Symbol\n score=1.0,\n ))\n\n # Step 2: Get all references using ChainSearchEngine\n raw_references = engine.search_references(\n symbol_name=symbol_name,\n source_path=project_path,\n depth=-1,\n limit=limit,\n )\n\n # Step 3: Transform raw references to API ReferenceResult\n api_references: List[ReferenceResult] = []\n for raw_ref in raw_references:\n api_ref = _transform_to_reference_result(raw_ref)\n api_references.append(api_ref)\n\n # Step 4: Group references by definition\n if group_by_definition and definitions:\n return _group_references_by_definition(\n definitions=definitions,\n references=api_references,\n include_definition=include_definition,\n )\n else:\n # Return single group with placeholder definition or first definition\n if definitions:\n definition = definitions[0]\n else:\n # Create placeholder definition when no definition found\n definition = DefinitionResult(\n name=symbol_name,\n kind=symbol_kind or \"unknown\",\n file_path=\"\",\n line=0,\n end_line=0,\n signature=None,\n container=None,\n score=0.0,\n )\n\n return [GroupedReferences(\n definition=definition,\n references=api_references,\n )]\n\n finally:\n engine.close()", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\env_config.py", - "score": 0.01359062143375869, - "excerpt": " \n return default\n\n\ndef get_api_config(\n prefix: str,\n *,\n workspace_root: Path | None = None,\n defaults: Dict[str, Any] | None = None,\n) -> Dict[str, Any]:\n defaults = defaults o...", - "content": " \n return default\n\n\ndef get_api_config(\n prefix: str,\n *,\n workspace_root: Path | None = None,\n defaults: Dict[str, Any] | None = None,\n) -> Dict[str, Any]:\n defaults = defaults or {}\n \n config: Dict[str, Any] = {}\n \n \n field_mapping 
= {\n \"api_key\": f\"{prefix}_API_KEY\",\n \"api_base\": f\"{prefix}_API_BASE\",\n \"model\": f\"{prefix}_MODEL\",\n \"provider\": f\"{prefix}_PROVIDER\",\n \"timeout\": f\"{prefix}_TIMEOUT\",\n }\n \n for field, env_key in field_mapping.items():\n value = get_env(env_key, workspace_root=workspace_root)\n if value is not None:\n \n if field == \"timeout\":\n try:\n config[field] = float(value)\n except ValueError:\n pass\n else:\n config[field] = value\n elif field in defaults:\n config[field] = defaults[field]\n \n return config\n\n\ndef generate_env_example() -> str:\n lines = [\n \"# CodexLens Environment Configuration\",\n \"# Copy this file to .codexlens/.env and fill in your values\",\n \"\",\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\api\\__init__.py", - "score": 0.01359062143375869, - "excerpt": " \"FileContextResult\",\n \"DefinitionResult\",\n \"ReferenceResult\",\n \"GroupedReferences\",\n \"SymbolInfo\",\n \"HoverInfo\",\n \"SemanticResult\",\n \n \"resolve_project\",\n \"normalize_rel...", - "content": " \"FileContextResult\",\n \"DefinitionResult\",\n \"ReferenceResult\",\n \"GroupedReferences\",\n \"SymbolInfo\",\n \"HoverInfo\",\n \"SemanticResult\",\n \n \"resolve_project\",\n \"normalize_relationship_type\",\n \"rank_by_proximity\",\n \"rank_by_score\",\n \n \"find_definition\",\n \"workspace_symbols\",\n \"get_hover\",\n \"file_context\",\n \"find_references\",\n \"semantic_search\",\n]\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_005_cleanup_unused_fields.py", - "score": 0.013517665130568358, - "excerpt": "\nimport logging\nfrom sqlite3 import Connection\n\nlog = logging.getLogger(__name__)\n\n\ndef upgrade(db_conn: Connection):\n cursor = db_conn.cursor()\n\n \n log.info(\"Checking semantic_metadata.keywo...", - "content": "\nimport logging\nfrom sqlite3 import Connection\n\nlog = logging.getLogger(__name__)\n\n\ndef upgrade(db_conn: Connection):\n cursor = db_conn.cursor()\n\n \n log.info(\"Checking semantic_metadata.keywords column...\")\n\n cursor.execute(\n \"SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_metadata'\"\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py", - "score": 0.013495897553868569, - "excerpt": " >>> \n >>> from codexlens.search.clustering import FrequencyConfig\n >>> freq_config = FrequencyConfig(min_frequency=2, group_by=\"symbol\")\n >>> strategy = get_strategy(\"freq...", - "content": " >>> \n >>> from codexlens.search.clustering import FrequencyConfig\n >>> freq_config = FrequencyConfig(min_frequency=2, group_by=\"symbol\")\n >>> strategy = get_strategy(\"frequency\", freq_config)\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\onnx_reranker.py", - "score": 0.013480392156862746, - "excerpt": "\n from optimum.onnxruntime import ORTModelForSequenceClassification\n from transformers import AutoTokenizer\n\n if self.providers is None:\n from ..gpu_sup...", - "content": "\n from optimum.onnxruntime import ORTModelForSequenceClassification\n from transformers import AutoTokenizer\n\n if self.providers is None:\n from ..gpu_support import get_optimal_providers\n\n \n self.providers = get_optimal_providers(\n use_gpu=self.use_gpu, with_device_options=True\n )\n\n \n \n model_kwargs: dict[str, Any] = {}\n try:\n params = 
signature(ORTModelForSequenceClassification.from_pretrained).parameters\n if \"providers\" in params:\n model_kwargs[\"providers\"] = self.providers\n elif \"provider\" in params:\n provider_name = \"CPUExecutionProvider\"\n if self.providers:\n first = self.providers[0]\n provider_name = first[0] if isinstance(first, tuple) else str(first)\n model_kwargs[\"provider\"] = provider_name\n except Exception:\n model_kwargs = {}\n\n try:\n self._model = ORTModelForSequenceClassification.from_pretrained(\n self.model_name,\n **model_kwargs,\n )\n except TypeError:\n \n self._model = ORTModelForSequenceClassification.from_pretrained(self.model_name)\n\n self._tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_fast=True)\n\n \n input_names: set[str] | None = None\n for attr in (\"input_names\", \"model_input_names\"):\n names = getattr(self._model, attr, None)\n if isinstance(names, (list, tuple)) and names:\n input_names = {str(n) for n in names}\n break\n if input_names is None:\n try:\n session = getattr(self._model, \"model\", None)\n if session is not None and hasattr(session, \"get_inputs\"):\n input_names = {i.name for i in session.get_inputs()}\n except Exception:\n input_names = None\n self._model_input_names = input_names\n\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py", - "score": 0.013403880070546739, - "excerpt": " return True\n\n except Exception as e:\n raise StorageError(f\"Failed to load ANN index: {e}\")\n\n def count(self) -> int:\n with self._lock:\n r...", - "content": " return True\n\n except Exception as e:\n raise StorageError(f\"Failed to load ANN index: {e}\")\n\n def count(self) -> int:\n with self._lock:\n return self._current_count\n\n @property\n def capacity(self) -> int:\n with self._lock:\n return self._max_elements\n\n @property\n def usage_ratio(self) -> float:\n with self._lock:\n if self._max_elements == 0:\n return 0.0\n return self._current_count / self._max_elements\n\n @property\n def is_loaded(self) -> bool:\n with self._lock:\n return self._index is not None and self._current_count > 0\n\n\n\nclass BinaryANNIndex:\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\litellm_embedder.py", - "score": 0.01322751322751323, - "excerpt": "class LiteLLMEmbedderWrapper(BaseEmbedder):\n \"\"\"Wrapper for ccw-litellm LiteLLMEmbedder.\n", - "content": "class LiteLLMEmbedderWrapper(BaseEmbedder):\n \"\"\"Wrapper for ccw-litellm LiteLLMEmbedder.\n\n This wrapper adapts the ccw-litellm LiteLLMEmbedder to the CodexLens\n BaseEmbedder interface, enabling seamless integration with CodexLens\n semantic search functionality.\n\n Args:\n model: Model identifier for LiteLLM (default: \"default\")\n **kwargs: Additional arguments passed to LiteLLMEmbedder\n\n Raises:\n ImportError: If ccw-litellm package is not installed\n \"\"\"\n\n def __init__(self, model: str = \"default\", **kwargs) -> None:\n \"\"\"Initialize LiteLLM embedder wrapper.\n\n Args:\n model: Model identifier for LiteLLM (default: \"default\")\n **kwargs: Additional arguments passed to LiteLLMEmbedder\n\n Raises:\n ImportError: If ccw-litellm package is not installed\n \"\"\"\n try:\n from ccw_litellm import LiteLLMEmbedder\n self._embedder = LiteLLMEmbedder(model=model, **kwargs)\n except ImportError as e:\n raise ImportError(\n \"ccw-litellm not installed. 
Install with: pip install ccw-litellm\"\n ) from e\n\n @property\n def embedding_dim(self) -> int:\n \"\"\"Return embedding dimensions from LiteLLMEmbedder.\n\n Returns:\n int: Dimension of the embedding vectors.\n \"\"\"\n return self._embedder.dimensions\n\n @property\n def model_name(self) -> str:\n \"\"\"Return model name from LiteLLMEmbedder.\n\n Returns:\n str: Name or identifier of the underlying model.\n \"\"\"\n return self._embedder.model_name\n\n @property\n def max_tokens(self) -> int:\n \"\"\"Return maximum token limit for the embedding model.\n\n Returns:\n int: Maximum number of tokens that can be embedded at once.\n Reads from LiteLLM config's max_input_tokens property.\n \"\"\"\n # Get from LiteLLM embedder's max_input_tokens property (now exposed)\n if hasattr(self._embedder, 'max_input_tokens'):\n return self._embedder.max_input_tokens\n\n # Fallback: infer from model name\n model_name_lower = self.model_name.lower()\n\n # Large models (8B or \"large\" in name)\n if '8b' in model_name_lower or 'large' in model_name_lower:\n return 32768\n\n # OpenAI text-embedding-3-* models\n if 'text-embedding-3' in model_name_lower:\n return 8191\n\n # Default fallback\n return 8192\n\n def _sanitize_text(self, text: str) -> str:\n \"\"\"Sanitize text to work around ModelScope API routing bug.\n\n ModelScope incorrectly routes text starting with lowercase 'import'\n to an Ollama endpoint, causing failures. This adds a leading space\n to work around the issue without affecting embedding quality.\n\n Args:\n text: Text to sanitize.\n\n Returns:\n Sanitized text safe for embedding API.\n \"\"\"\n if text.startswith('import'):\n return ' ' + text\n return text\n\n def embed_to_numpy(self, texts: str | Iterable[str], **kwargs) -> np.ndarray:\n \"\"\"Embed texts to numpy array using LiteLLMEmbedder.\n\n Args:\n texts: Single text or iterable of texts to embed.\n **kwargs: Additional arguments (ignored for LiteLLM backend).\n Accepts batch_size for API compatibility with fastembed.\n\n Returns:\n numpy.ndarray: Array of shape (n_texts, embedding_dim) containing embeddings.\n \"\"\"\n if isinstance(texts, str):\n texts = [texts]\n else:\n texts = list(texts)\n\n # Sanitize texts to avoid ModelScope routing bug\n texts = [self._sanitize_text(t) for t in texts]\n\n # LiteLLM handles batching internally, ignore batch_size parameter\n return self._embedder.embed(texts)\n\n def embed_single(self, text: str) -> list[float]:\n \"\"\"Generate embedding for a single text.\n\n Args:\n text: Text to embed.\n\n Returns:\n list[float]: Embedding vector as a list of floats.\n \"\"\"\n # Sanitize text before embedding\n sanitized = self._sanitize_text(text)\n embedding = self._embedder.embed([sanitized])\n return embedding[0].tolist()", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\association_tree\\builder.py", - "score": 0.013083213083213086, - "excerpt": "\nfrom __future__ import annotations\n\nimport asyncio\nimport logging\nfrom pathlib import Path\nfrom typing import Dict, List, Optional, Set\n\nfrom codexlens.hybrid_search.data_structures import CallHierar...", - "content": "\nfrom __future__ import annotations\n\nimport asyncio\nimport logging\nfrom pathlib import Path\nfrom typing import Dict, List, Optional, Set\n\nfrom codexlens.hybrid_search.data_structures import CallHierarchyItem, Range\nfrom codexlens.lsp.standalone_manager import StandaloneLspManager\nfrom .data_structures import CallTree, TreeNode\n\nlogger = 
logging.getLogger(__name__)\n\n\nclass AssociationTreeBuilder:\n\n def __init__(\n self,\n lsp_manager: StandaloneLspManager,\n timeout: float = 5.0,\n ):\n self.lsp_manager = lsp_manager\n self.timeout = timeout\n self.visited: Set[str] = set()\n\n async def build_tree(\n self,\n seed_file_path: str,\n seed_line: int,\n seed_character: int = 1,\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py", - "score": 0.012885154061624651, - "excerpt": "import re\nimport sqlite3\nfrom pathlib import Path\nfrom typing import Any, Dict, List, Optional, Tuple\n\ntry:\n from codexlens.parsers.treesitter_parser import TreeSitterSymbolParser\nexcept Exception:...", - "content": "import re\nimport sqlite3\nfrom pathlib import Path\nfrom typing import Any, Dict, List, Optional, Tuple\n\ntry:\n from codexlens.parsers.treesitter_parser import TreeSitterSymbolParser\nexcept Exception: \n TreeSitterSymbolParser = None \n\n\nclass SymbolExtractor:\n\n \n PATTERNS = {\n 'python': {\n 'function': r'^(?:async\\s+)?def\\s+(\\w+)\\s*\\(',\n 'class': r'^class\\s+(\\w+)\\s*[:\\(]',\n 'import': r'^(?:from\\s+([\\w.]+)\\s+)?import\\s+([\\w.,\\s]+)',\n 'call': r'(? Tuple[str, Optional[str], Optional[int], Optional[int]]:\n return (result.path, result.symbol_name, result.start_line, result.end_line)\n\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py", - "score": 0.008680555555555556, - "excerpt": "from typing import TYPE_CHECKING, List, Optional\n\nfrom .base import BaseClusteringStrategy, ClusteringConfig\n\nif TYPE_CHECKING:", - "content": "\"\"\"HDBSCAN-based clustering strategy for search results.\n\nHDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise)\nis the primary clustering strategy for grouping similar search results.\n\"\"\"\n\nfrom __future__ import annotations\n\nfrom typing import TYPE_CHECKING, List, Optional\n\nfrom .base import BaseClusteringStrategy, ClusteringConfig\n\nif TYPE_CHECKING:\n import numpy as np\n from codexlens.entities import SearchResult\n\n\nclass HDBSCANStrategy(BaseClusteringStrategy):\n \"\"\"HDBSCAN-based clustering strategy.\n\n Uses HDBSCAN algorithm to cluster search results based on embedding similarity.", - "source": null, - "symbol": null - } - ], - "stats": { - "dirs_searched": 17, - "files_matched": 50, - "time_ms": 6371.967315673828 - } - } -} diff --git a/codex-lens/_tmp_search2.json b/codex-lens/_tmp_search2.json deleted file mode 100644 index b2cd26ce..00000000 --- a/codex-lens/_tmp_search2.json +++ /dev/null @@ -1,415 +0,0 @@ -{ - "success": true, - "result": { - "query": "class Config", - "method": "cascade", - "count": 50, - "results": [ - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\hybrid_search\\data_structures.py", - "score": 0.06081658330145309, - "excerpt": " @classmethod\n def from_dict(cls, data: Dict[str, Any]) -> \"CallHierarchyItem\":\n return cls(\n name=data[\"name\"],\n kind=data[\"kind\"],\n file_path=data[\"file...", - "content": " @classmethod\n def from_dict(cls, data: Dict[str, Any]) -> \"CallHierarchyItem\":\n return cls(\n name=data[\"name\"],\n kind=data[\"kind\"],\n file_path=data[\"file_path\"],\n range=Range.from_dict(data[\"range\"]),\n detail=data.get(\"detail\"),\n )\n\n\n@dataclass\nclass CodeSymbolNode:\n\n id: str\n name: str\n kind: str\n file_path: str\n range: Range\n embedding: Optional[List[float]] = None\n raw_code: 
str = \"\"\n docstring: str = \"\"\n score: float = 0.0\n\n def __post_init__(self) -> None:\n if not self.id:\n raise ValueError(\"id cannot be empty\")\n if not self.name:\n raise ValueError(\"name cannot be empty\")\n if not self.kind:\n raise ValueError(\"kind cannot be empty\")\n if not self.file_path:\n raise ValueError(\"file_path cannot be empty\")\n\n def __hash__(self) -> int:\n return hash(self.id)\n\n def __eq__(self, other: object) -> bool:\n if not isinstance(other, CodeSymbolNode):\n return False\n return self.id == other.id\n\n def to_dict(self) -> Dict[str, Any]:\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "score": 0.056576452190618645, - "excerpt": "from rich.table import Table\n\nfrom codexlens.config import Config\nfrom codexlens.entities import IndexedFile, SearchResult, Symbol\nfrom codexlens.errors import CodexLensError, ConfigError, ParseError,...", - "content": "import os\nimport shutil\nimport sqlite3\nfrom pathlib import Path\nfrom typing import Annotated, Any, Dict, Iterable, List, Optional\n\nimport typer\nfrom rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn\nfrom rich.table import Table\n\nfrom codexlens.config import Config\nfrom codexlens.entities import IndexedFile, SearchResult, Symbol\nfrom codexlens.errors import CodexLensError, ConfigError, ParseError, StorageError, SearchError\nfrom codexlens.parsers.factory import ParserFactory\nfrom codexlens.storage.path_mapper import PathMapper\nfrom codexlens.storage.registry import RegistryStore, ProjectInfo\nfrom codexlens.storage.index_tree import IndexTreeBuilder\nfrom codexlens.storage.dir_index import DirIndexStore\nfrom codexlens.search.chain_search import ChainSearchEngine, SearchOptions\nfrom codexlens.watcher import WatcherManager, WatcherConfig\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "score": 0.05655744432847353, - "excerpt": "\"\"\"Configuration system for CodexLens.\"\"\"\n\nfrom __future__ import annotations", - "content": "\"\"\"Configuration system for CodexLens.\"\"\"\n\nfrom __future__ import annotations\n\nimport json\nimport logging\nimport os\nfrom dataclasses import dataclass, field\nfrom functools import cached_property\nfrom pathlib import Path\nfrom typing import Any, Dict, List, Optional", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "score": 0.049219375000264694, - "excerpt": "\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\nfrom dataclasses import dataclass, field\nfrom pathlib import Path\nfrom typing import List, Optional, Dict, Any, Literal, Tuple, TYPE_CH...", - "content": "\"\"\"Chain search engine for recursive multi-directory searching.\n\nProvides parallel search across directory hierarchies using indexed _index.db files.\nSupports depth-limited traversal, result aggregation, and symbol search.\n\"\"\"\n\nfrom __future__ import annotations\n\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\nfrom dataclasses import dataclass, field\nfrom pathlib import Path\nfrom typing import List, Optional, Dict, Any, Literal, Tuple, TYPE_CHECKING\nimport json\nimport logging\nimport os\nimport time\n\nfrom codexlens.entities import SearchResult, Symbol\n\nif TYPE_CHECKING:", - "source": null, - "symbol": null - }, - { - "path": 
"D:\\Claude_dms3\\codex-lens\\src\\codexlens\\indexing\\embedding.py", - "score": 0.047931429239828446, - "excerpt": " def __init__(\n self,\n model_name: Optional[str] = None,\n use_gpu: bool = True,\n expand_dim: bool = True,\n ) -> None:\n from codexlens.semantic import SEMANTIC_...", - "content": " def __init__(\n self,\n model_name: Optional[str] = None,\n use_gpu: bool = True,\n expand_dim: bool = True,\n ) -> None:\n from codexlens.semantic import SEMANTIC_AVAILABLE\n\n if not SEMANTIC_AVAILABLE:\n raise ImportError(\n \"Semantic search dependencies not available. \"\n \"Install with: pip install codexlens[semantic]\"\n )\n\n self._model_name = model_name or self.DEFAULT_MODEL\n self._use_gpu = use_gpu\n self._expand_dim = expand_dim\n self._model = None\n self._native_dim: Optional[int] = None\n\n \n self._expansion_matrix: Optional[np.ndarray] = None\n\n @property\n def model_name(self) -> str:\n return self._model_name\n\n @property\n def embedding_dim(self) -> int:\n if self._expand_dim:\n return self.TARGET_DIM\n \n if self._native_dim is not None:\n return self._native_dim\n \n model_dims = {\n \"BAAI/bge-large-en-v1.5\": 1024,\n \"BAAI/bge-base-en-v1.5\": 768,\n \"BAAI/bge-small-en-v1.5\": 384,\n \"intfloat/multilingual-e5-large\": 1024,\n }\n return model_dims.get(self._model_name, 1024)\n\n @property\n def max_tokens(self) -> int:\n return 512 \n\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py", - "score": 0.04283104206542711, - "excerpt": "import threading\nimport time\nfrom dataclasses import dataclass, field\nfrom enum import Enum\nfrom typing import Any, Dict, Iterable, List, Optional", - "content": "Provides intelligent load balancing across multiple LiteLLM embedding endpoints\nto maximize throughput while respecting rate limits.\n\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nimport random\nimport threading\nimport time\nfrom dataclasses import dataclass, field\nfrom enum import Enum\nfrom typing import Any, Dict, Iterable, List, Optional\n\nimport numpy as np\n\nfrom .base import BaseEmbedder\n\nlogger = logging.getLogger(__name__)\n\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py", - "score": 0.036886112765573215, - "excerpt": "- Direct subprocess spawning of language servers\n- JSON-RPC 2.0 communication over stdin/stdout\n- Multi-language support via configuration file (lsp-servers.json)\n- Process lifecycle management with a...", - "content": "\"\"\"Standalone Language Server Manager for direct LSP communication.\n\nThis module provides direct communication with language servers via JSON-RPC over stdio,\neliminating the need for VSCode Bridge. 
Similar to cclsp architecture.\n\nFeatures:\n- Direct subprocess spawning of language servers\n- JSON-RPC 2.0 communication over stdin/stdout\n- Multi-language support via configuration file (lsp-servers.json)\n- Process lifecycle management with auto-restart\n- Compatible interface with existing LspBridge\n\"\"\"\n\nfrom __future__ import annotations\n\nimport asyncio\nimport json\nimport logging\nimport os", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\api\\models.py", - "score": 0.03457410829143062, - "excerpt": " container: Containing class/module (if any)\n score: Match score for ranking\n return {k: v for k, v in asdict(self).items() if v is not None}\n\n\n# =================================...", - "content": " container: Containing class/module (if any)\n score: Match score for ranking\n return {k: v for k, v in asdict(self).items() if v is not None}\n\n\n# =============================================================================\n# Section 4.4: find_references dataclasses\n# =============================================================================\n\n@dataclass\nclass ReferenceResult:\n file_path: str\n line: int\n column: int\n context_line: str\n relationship: str # call | import | type_annotation | inheritance\n\n def to_dict(self) -> dict:\n return asdict(self)\n\n\n@dataclass\nclass GroupedReferences:\n definition: DefinitionResult\n references: List[ReferenceResult] = field(default_factory=list)\n\n def to_dict(self) -> dict:\n return {\n \"definition\": self.definition.to_dict(),\n \"references\": [r.to_dict() for r in self.references],\n }\n\n\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py", - "score": 0.03341093379138448, - "excerpt": "\n if TREE_SITTER_AVAILABLE:\n self._initialize_parser()\n\n def _initialize_parser(self) -> None:\n if TreeSitterParser is None or TreeSitterLanguage is None:\n retur...", - "content": "\n if TREE_SITTER_AVAILABLE:\n self._initialize_parser()\n\n def _initialize_parser(self) -> None:\n if TreeSitterParser is None or TreeSitterLanguage is None:\n return\n\n try:\n \n if self.language_id == \"python\":\n import tree_sitter_python\n self._language = TreeSitterLanguage(tree_sitter_python.language())\n elif self.language_id == \"javascript\":\n import tree_sitter_javascript\n self._language = TreeSitterLanguage(tree_sitter_javascript.language())\n elif self.language_id == \"typescript\":\n import tree_sitter_typescript\n \n if self.path is not None and self.path.suffix.lower() == \".tsx\":\n self._language = TreeSitterLanguage(tree_sitter_typescript.language_tsx())\n else:\n self._language = TreeSitterLanguage(tree_sitter_typescript.language_typescript())\n else:\n return\n\n \n self._parser = TreeSitterParser()\n if hasattr(self._parser, \"set_language\"):\n self._parser.set_language(self._language) \n else:\n self._parser.language = self._language \n\n except Exception:\n \n self._parser = None\n self._language = None\n\n def is_available(self) -> bool:\n return self._parser is not None and self._language is not None\n\n def _parse_tree(self, text: str) -> Optional[tuple[bytes, TreeSitterNode]]:\n if not self.is_available() or self._parser is None:\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\watcher\\incremental_indexer.py", - "score": 0.029568673189485736, - "excerpt": "\nimport logging\nfrom dataclasses import dataclass\nfrom pathlib 
import Path\nfrom typing import List, Optional", - "content": "\"\"\"Incremental indexer for processing file changes.\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import List, Optional\n\nfrom codexlens.config import Config\nfrom codexlens.parsers.factory import ParserFactory\nfrom codexlens.storage.dir_index import DirIndexStore\nfrom codexlens.storage.global_index import GlobalSymbolIndex\nfrom codexlens.storage.path_mapper import PathMapper\nfrom codexlens.storage.registry import RegistryStore\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py", - "score": 0.029334400167733504, - "excerpt": "\nfrom collections import defaultdict\nfrom dataclasses import dataclass\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Literal\n", - "content": "\nUse cases:\n- Prioritize commonly called methods/functions\n- Filter out one-off results that may be less relevant\n- Deduplicate results pointing to the same symbol from different locations\n\"\"\"\n\nfrom __future__ import annotations\n\nfrom collections import defaultdict\nfrom dataclasses import dataclass\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Literal\n\nfrom .base import BaseClusteringStrategy, ClusteringConfig\n\nif TYPE_CHECKING:\n import numpy as np\n from codexlens.entities import SearchResult\n\n\n@dataclass", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\association_tree\\data_structures.py", - "score": 0.027925539288870704, - "excerpt": "\n def __len__(self) -> int:\n return len(self.all_nodes)\n\n def __repr__(self) -> str:\n return (\n f\"CallTree(roots={len(self.roots)}, nodes={len(self.all_nodes)}, \"\n ...", - "content": "\n def __len__(self) -> int:\n return len(self.all_nodes)\n\n def __repr__(self) -> str:\n return (\n f\"CallTree(roots={len(self.roots)}, nodes={len(self.all_nodes)}, \"\n f\"depth={self.depth_reached})\"\n )\n\n\n@dataclass\nclass UniqueNode:\n\n file_path: str\n name: str\n kind: str\n range: Range\n min_depth: int = 0\n occurrences: int = 1\n paths: List[List[str]] = field(default_factory=list)\n context_nodes: List[str] = field(default_factory=list)\n score: float = 0.0\n\n @property\n def node_key(self) -> tuple[str, int, int]:\n return (\n self.file_path,\n self.range.start_line,\n self.range.end_line,\n )\n\n def add_path(self, path: List[str]) -> None:\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py", - "score": 0.024369821963687643, - "excerpt": "def _get_configured_index_root() -> Path:\n \"\"\"Get the index root from environment or config file.\n", - "content": "def _get_configured_index_root() -> Path:\n \"\"\"Get the index root from environment or config file.\n\n Priority order:\n 1. CODEXLENS_INDEX_DIR environment variable\n 2. index_dir from ~/.codexlens/config.json\n 3. 
Default: ~/.codexlens/indexes\n \"\"\"\n env_override = os.getenv(\"CODEXLENS_INDEX_DIR\")\n if env_override:\n return Path(env_override).expanduser().resolve()\n\n config_file = Path.home() / \".codexlens\" / \"config.json\"\n if config_file.exists():\n try:\n cfg = json.loads(config_file.read_text(encoding=\"utf-8\"))\n if \"index_dir\" in cfg:\n return Path(cfg[\"index_dir\"]).expanduser().resolve()\n except (json.JSONDecodeError, OSError):\n pass\n\n return Path.home() / \".codexlens\" / \"indexes\"", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "score": 0.023949795081967214, - "excerpt": "class VectorStore:\n \"\"\"SQLite-based vector storage with HNSW-accelerated similarity search.\n", - "content": "class VectorStore:\n \"\"\"SQLite-based vector storage with HNSW-accelerated similarity search.\n\n Performance optimizations:\n - HNSW index for O(log N) approximate nearest neighbor search\n - Embedding matrix cached in memory for batch similarity computation (fallback)\n - NumPy vectorized operations instead of Python loops (fallback)\n - Lazy content loading - only fetch full content for top-k results\n - Thread-safe cache invalidation\n - Bulk insert mode for efficient batch operations\n \"\"\"\n\n # Default embedding dimension (used when creating new index)\n DEFAULT_DIM = 768\n\n def __init__(self, db_path: str | Path) -> None:\n if not NUMPY_AVAILABLE:\n raise ImportError(\n \"Semantic search dependencies not available. \"\n \"Install with: pip install codexlens[semantic]\"\n )\n\n self.db_path = Path(db_path)\n self.db_path.parent.mkdir(parents=True, exist_ok=True)\n\n # Embedding cache for fast similarity search (fallback)\n self._cache_lock = threading.RLock()\n self._embedding_matrix: Optional[np.ndarray] = None\n self._embedding_norms: Optional[np.ndarray] = None\n self._chunk_ids: Optional[List[int]] = None\n self._cache_version: int = 0\n\n # ANN index for O(log N) search\n self._ann_index: Optional[ANNIndex] = None\n self._ann_dim: Optional[int] = None\n self._ann_write_lock = threading.Lock() # Protects ANN index modifications\n\n # Bulk insert mode tracking\n self._bulk_insert_mode: bool = False\n self._bulk_insert_ids: List[int] = []\n self._bulk_insert_embeddings: List[np.ndarray] = []\n\n self._init_schema()\n self._init_ann_index()\n\n def _init_schema(self) -> None:\n \"\"\"Initialize vector storage schema.\"\"\"\n with sqlite3.connect(self.db_path) as conn:\n # Enable memory mapping for faster reads\n conn.execute(\"PRAGMA mmap_size = 30000000000\") # 30GB limit\n conn.execute(\"\"\"\n CREATE TABLE IF NOT EXISTS semantic_chunks (\n id INTEGER PRIMARY KEY AUTOINCREMENT,\n file_path TEXT NOT NULL,\n content TEXT NOT NULL,\n embedding BLOB NOT NULL,\n metadata TEXT,\n category TEXT DEFAULT 'code',\n created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP\n )\n \"\"\")\n conn.execute(\"\"\"\n CREATE INDEX IF NOT EXISTS idx_chunks_file\n ON semantic_chunks(file_path)\n \"\"\")\n conn.execute(\"\"\"\n CREATE INDEX IF NOT EXISTS idx_chunks_category\n ON semantic_chunks(category)\n \"\"\")\n # Model configuration table - tracks which model generated the embeddings\n conn.execute(\"\"\"\n CREATE TABLE IF NOT EXISTS embeddings_config (\n id INTEGER PRIMARY KEY CHECK (id = 1),\n model_profile TEXT NOT NULL,\n model_name TEXT NOT NULL,\n embedding_dim INTEGER NOT NULL,\n backend TEXT NOT NULL DEFAULT 'fastembed',\n created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,\n updated_at TIMESTAMP DEFAULT 
CURRENT_TIMESTAMP\n )\n \"\"\")\n\n # Migration: Add backend column to existing tables\n self._migrate_backend_column(conn)\n # Migration: Add category column\n self._migrate_category_column(conn)\n\n conn.commit()\n\n def _migrate_backend_column(self, conn: sqlite3.Connection) -> None:\n \"\"\"Add backend column to existing embeddings_config table if not present.\n\n Args:\n conn: Active SQLite connection\n \"\"\"\n # Check if backend column exists\n cursor = conn.execute(\"PRAGMA table_info(embeddings_config)\")\n columns = [row[1] for row in cursor.fetchall()]\n\n if 'backend' not in columns:\n logger.info(\"Migrating embeddings_config table: adding backend column\")\n conn.execute(\"\"\"\n ALTER TABLE embeddings_config\n ADD COLUMN backend TEXT NOT NULL DEFAULT 'fastembed'\n \"\"\")\n\n def _migrate_category_column(self, conn: sqlite3.Connection) -> None:\n \"\"\"Add category column to existing semantic_chunks table if not present.\n\n Args:\n conn: Active SQLite connection\n \"\"\"\n # Check if category column exists\n cursor = conn.execute(\"PRAGMA table_info(semantic_chunks)\")\n columns = [row[1] for row in cursor.fetchall()]\n\n if 'category' not in columns:\n logger.info(\"Migrating semantic_chunks table: adding category column\")\n conn.execute(\"\"\"\n ALTER TABLE semantic_chunks\n ADD COLUMN category TEXT DEFAULT 'code'\n \"\"\")\n # Create index for fast category filtering\n conn.execute(\"\"\"\n CREATE INDEX IF NOT EXISTS idx_chunks_category\n ON semantic_chunks(category)\n \"\"\")\n\n def _init_ann_index(self) -> None:\n \"\"\"Initialize ANN index (lazy loading from existing data).\"\"\"\n if not HNSWLIB_AVAILABLE:\n logger.debug(\"hnswlib not available, using brute-force search\")\n return\n\n # Try to detect embedding dimension from existing data\n dim = self._detect_embedding_dim()\n if dim is None:\n # No data yet, will initialize on first add\n logger.debug(\"No embeddings found, ANN index will be created on first add\")\n return\n\n self._ann_dim = dim\n\n try:\n self._ann_index = ANNIndex(self.db_path, dim)\n if self._ann_index.load():\n logger.debug(\n \"Loaded ANN index with %d vectors\", self._ann_index.count()\n )\n else:\n # Index file doesn't exist, try to build from SQLite data\n logger.debug(\"ANN index file not found, rebuilding from SQLite\")\n self._rebuild_ann_index_internal()\n except Exception as e:\n logger.warning(\"Failed to initialize ANN index: %s\", e)\n self._ann_index = None\n\n def _detect_embedding_dim(self) -> Optional[int]:\n \"\"\"Detect embedding dimension from existing data.\"\"\"\n with sqlite3.connect(self.db_path) as conn:\n row = conn.execute(\n \"SELECT embedding FROM semantic_chunks LIMIT 1\"\n ).fetchone()\n if row and row[0]:\n # Embedding is stored as float32 blob\n blob = row[0]\n return len(blob) // np.dtype(np.float32).itemsize\n return None\n\n @property\n def dimension(self) -> Optional[int]:\n \"\"\"Return the dimension of embeddings in the store.\n\n Returns:\n Embedding dimension if available, None if store is empty.\n \"\"\"\n if self._ann_dim is not None:\n return self._ann_dim\n self._ann_dim = self._detect_embedding_dim()\n return self._ann_dim\n\n def _rebuild_ann_index_internal(self) -> int:\n \"\"\"Internal method to rebuild ANN index from SQLite data.\"\"\"\n if self._ann_index is None:\n return 0\n\n with sqlite3.connect(self.db_path) as conn:\n conn.execute(\"PRAGMA mmap_size = 30000000000\")\n rows = conn.execute(\n \"SELECT id, embedding FROM semantic_chunks\"\n ).fetchall()\n\n if not rows:\n return 0\n\n # 
The rest of this dump is a serialized semantic-search result set over the codex-lens codebase: a JSON array of hits, each carrying `path`, `score`, `excerpt`, `content`, `source`, and `symbol` fields. The top hit's `content` is the `VectorStore` implementation. Its recoverable pieces:

- `rebuild_ann_index()` — detects the embedding dimension, recreates the HNSW wrapper (`ANNIndex`), and re-adds every vector stored in SQLite; returns the number of vectors indexed, or `0` when `hnswlib` is unavailable or no embeddings exist. Intended for migrating existing data to ANN search, repairing a corrupted index, or reclaiming space after many deletions.
- `_invalidate_cache()` / `_refresh_cache()` — maintain a thread-safe cache of all embeddings as a single `(N, D)` float32 matrix (loaded under `PRAGMA mmap_size`), with row norms precomputed once and zero norms clamped to `EPSILON` to avoid division by zero.
- `_ensure_ann_index(dim)` — lazy ANN initialization with double-checked locking: a lock-free fast path when the index already exists, then a re-check under `_ann_write_lock` before constructing and loading it.
- `add_chunk(chunk, file_path, category="code")` — inserts one chunk, serializing the embedding as raw float32 bytes (`np.array(..., dtype=np.float32).tobytes()`) and metadata as JSON, then pushes the vector into the ANN index under the write lock and invalidates the cache; raises `ValueError` if the chunk has no embedding. The blob round-trip is sketched below.
- `add_chunks(chunks, file_path, category="code")` — the batch variant: derives the inserted row IDs from `MAX(id) + 1` before the `executemany`, stacks the embeddings with `np.vstack`, and updates the ANN index once for the whole batch.
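A minimal sketch of that serialization, assuming the `semantic_chunks` table from the dump (the function names here are illustrative): embeddings survive SQLite as raw float32 bytes, so the dtype must match on both sides.

```python
import sqlite3
import numpy as np

def store_embedding(conn: sqlite3.Connection, file_path: str, content: str,
                    embedding: list[float]) -> int:
    # Raw float32 bytes, mirroring add_chunk in the dump.
    blob = np.asarray(embedding, dtype=np.float32).tobytes()
    cur = conn.execute(
        "INSERT INTO semantic_chunks (file_path, content, embedding) VALUES (?, ?, ?)",
        (file_path, content, blob),
    )
    conn.commit()
    return cur.lastrowid or 0

def load_embedding(conn: sqlite3.Connection, chunk_id: int) -> np.ndarray:
    row = conn.execute(
        "SELECT embedding FROM semantic_chunks WHERE id = ?", (chunk_id,)
    ).fetchone()
    # np.frombuffer reverses tobytes(); the result is a read-only view of the blob.
    return np.frombuffer(row[0], dtype=np.float32)
```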
- `add_chunks_batch(chunks_with_paths, update_ann=True, auto_save_ann=True, categories=None)` — bulk insert across multiple files in one transaction. It validates that `categories`, when given, matches the batch length, skips a redundant copy when an embedding is already an `np.ndarray`, and calls `_validate_chunk_id_range(start_id, batch_size)` before the `executemany`. In bulk-insert mode the new IDs and vectors are only accumulated; otherwise the ANN index is updated immediately and saved when `auto_save_ann` is true (pass `False` during bulk loads to cut I/O).
- `add_chunks_batch_numpy(chunks_with_paths, embeddings_matrix, ...)` — the same operation fed a precomputed `(N, D)` float32 matrix, avoiding list-to-array conversions; it raises when the chunk count and `embeddings_matrix.shape[0]` disagree.
- `begin_bulk_insert()` / `end_bulk_insert()`, plus the `BulkInsertContext` returned by `bulk_insert()` — toggle bulk mode under the write lock; on exit, every accumulated vector is flushed to the ANN index in a single `add_vectors` call followed by one `save()`. The deferred-update pattern is sketched below.
- `delete_file_chunks(file_path)` — looks up the affected chunk IDs first (the ANN index needs them), deletes the rows, then removes the vectors and invalidates the cache; returns the number of deleted chunks.
- `search_similar(query_embedding, top_k=10, min_score=0.0, return_full_content=True, category=None)` — the public query entry point; it rejects a `min_score` outside `[0.0, 1.0]`.
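A condensed sketch of the bulk-insert pattern; `BulkStore` and its attributes are stand-ins for the dump's `VectorStore` internals. Batches accumulate while bulk mode is on, and the ANN index is touched once at exit, matching the `with store.bulk_insert():` usage shown in the docstring.

```python
from contextlib import contextmanager
import threading
import numpy as np

class BulkStore:
    """Stand-in for the dump's VectorStore; only the bulk-insert machinery."""

    def __init__(self, ann_index) -> None:
        self._ann = ann_index                   # wrapper exposing add_vectors()/save()
        self._lock = threading.RLock()
        self._bulk = False
        self._pending_ids: list[int] = []
        self._pending_vecs: list[np.ndarray] = []

    def add_batch(self, ids: list[int], vecs: list[np.ndarray]) -> None:
        with self._lock:
            if self._bulk:                      # bulk mode: just accumulate
                self._pending_ids.extend(ids)
                self._pending_vecs.extend(vecs)
            else:                               # normal mode: update immediately
                self._ann.add_vectors(ids, np.vstack(vecs))
                self._ann.save()

    @contextmanager
    def bulk_insert(self):
        with self._lock:
            self._bulk = True
        try:
            yield self
        finally:
            with self._lock:
                self._bulk = False
                if self._pending_ids:
                    # One add_vectors + one save instead of one per batch.
                    self._ann.add_vectors(self._pending_ids,
                                          np.vstack(self._pending_vecs))
                    self._ann.save()
                self._pending_ids.clear()
                self._pending_vecs.clear()
```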
`search_similar` takes the HNSW path (`_search_with_ann`, roughly O(log N)) whenever `hnswlib` is importable and the index is loaded and non-empty, and drops to `_search_brute_force` (O(N)) if the ANN call raises. The ANN path caps `k` at the index size, over-fetches `top_k * 3` candidates when a `category` filter is active (so filtering still leaves enough survivors), converts cosine distances to similarities as `score = 1 - distance`, discards anything below `min_score`, guards against null, empty, or length-mismatched `(ids, distances)` pairs, and applies the final `top_k` cut only after category filtering.
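For reference, the same dance against `hnswlib` directly; `dim`, the element counts, and the 0.25 score floor are arbitrary here. In a cosine space, `knn_query` returns exactly the `1 - similarity` distances that the dump converts back into scores.

```python
import hnswlib
import numpy as np

dim = 384
index = hnswlib.Index(space="cosine", dim=dim)   # cosine => distance = 1 - similarity
index.init_index(max_elements=10_000, ef_construction=200, M=16)

rng = np.random.default_rng(0)
index.add_items(rng.random((100, dim), dtype=np.float32), np.arange(100))

query = rng.random(dim, dtype=np.float32)
top_k, category_filter = 10, True
fetch_k = top_k * 3 if category_filter else top_k          # over-fetch for filtering
k = min(fetch_k, index.get_current_count())

labels, distances = index.knn_query(query, k=k)
scores = 1.0 - distances[0]                                # back to similarity
hits = [(int(i), float(s)) for i, s in zip(labels[0], scores) if s >= 0.25]
print(hits[:top_k])
```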
\"\n \"Install hnswlib for better performance: pip install hnswlib\"\n )\n\n with self._cache_lock:\n # Refresh cache if needed\n if self._embedding_matrix is None:\n if not self._refresh_cache():\n return [] # No data\n\n # Vectorized cosine similarity\n query_vec = query_vec.reshape(1, -1)\n query_norm = np.linalg.norm(query_vec)\n if query_norm == 0:\n return []\n\n # Compute all similarities at once: (N,) scores\n # similarity = (A @ B.T) / (||A|| * ||B||)\n dot_products = np.dot(self._embedding_matrix, query_vec.T).flatten()\n scores = dot_products / (self._embedding_norms.flatten() * query_norm)\n\n # Filter by min_score and get top-k indices\n valid_mask = scores >= min_score\n valid_indices = np.where(valid_mask)[0]\n\n if len(valid_indices) == 0:\n return []\n\n # When category filtering, fetch more candidates to compensate for filtering\n fetch_k = top_k * 3 if category else top_k\n\n # Sort by score descending and take top candidates\n valid_scores = scores[valid_indices]\n sorted_order = np.argsort(valid_scores)[::-1][:fetch_k]\n top_indices = valid_indices[sorted_order]\n top_scores = valid_scores[sorted_order]\n\n # Get chunk IDs for top results\n top_ids = [self._chunk_ids[i] for i in top_indices]\n\n # Fetch content only for top-k results (lazy loading) with category filtering\n results = self._fetch_results_by_ids(\n top_ids, top_scores.tolist(), return_full_content, category\n )\n # Apply final limit after category filtering\n return results[:top_k]\n\n def _fetch_results_by_ids(\n self,\n chunk_ids: List[int],\n scores: List[float],\n return_full_content: bool,\n category: Optional[str] = None,\n ) -> List[SearchResult]:\n \"\"\"Fetch full result data for specific chunk IDs.\n\n Args:\n chunk_ids: List of chunk IDs to fetch.\n scores: Corresponding similarity scores.\n return_full_content: Whether to include full content.\n category: Optional category filter ('code' or 'doc').\n\n Returns:\n List of SearchResult objects.\n \"\"\"\n if not chunk_ids:\n return []\n\n # Build parameterized query for IN clause\n placeholders = \",\".join(\"?\" * len(chunk_ids))\n _validate_sql_placeholders(placeholders, len(chunk_ids))\n\n # SQL injection prevention:\n # - Only a validated placeholders string (commas + '?') is interpolated into the query.\n # - User-provided values are passed separately via sqlite3 parameters.\n # - Category filter is added as a separate parameter\n if category:\n query = \"\"\"\n SELECT id, file_path, content, metadata\n FROM semantic_chunks\n WHERE id IN ({placeholders}) AND category = ?\n \"\"\".format(placeholders=placeholders)\n params = list(chunk_ids) + [category]\n else:\n query = \"\"\"\n SELECT id, file_path, content, metadata\n FROM semantic_chunks\n WHERE id IN ({placeholders})\n \"\"\".format(placeholders=placeholders)\n params = chunk_ids\n\n with sqlite3.connect(self.db_path) as conn:\n conn.execute(\"PRAGMA mmap_size = 30000000000\")\n rows = conn.execute(query, params).fetchall()\n\n # Build ID -> row mapping\n id_to_row = {r[0]: r for r in rows}\n\n results = []\n for chunk_id, score in zip(chunk_ids, scores):\n row = id_to_row.get(chunk_id)\n if not row:\n continue\n\n _, file_path, content, metadata_json = row\n metadata = json.loads(metadata_json) if metadata_json else {}\n\n # Build excerpt (short preview)\n excerpt = content[:200] + \"...\" if len(content) > 200 else content\n\n # Extract symbol information from metadata\n symbol_name = metadata.get(\"symbol_name\")\n symbol_kind = metadata.get(\"symbol_kind\")\n start_line = 
metadata.get(\"start_line\")\n end_line = metadata.get(\"end_line\")\n\n # Build Symbol object if we have symbol info\n symbol = None\n if symbol_name and symbol_kind and start_line and end_line:\n try:\n from codexlens.entities import Symbol\n symbol = Symbol(\n name=symbol_name,\n kind=symbol_kind,\n range=(start_line, end_line)\n )\n except Exception:\n pass\n\n results.append(SearchResult(\n path=file_path,\n score=score,\n excerpt=excerpt,\n content=content if return_full_content else None,\n symbol=symbol,\n metadata=metadata,\n start_line=start_line,\n end_line=end_line,\n symbol_name=symbol_name,\n symbol_kind=symbol_kind,\n ))\n\n return results\n\n def count_chunks(self) -> int:\n \"\"\"Count total chunks in store.\"\"\"\n with sqlite3.connect(self.db_path) as conn:\n row = conn.execute(\"SELECT COUNT(*) FROM semantic_chunks\").fetchone()\n return row[0] if row else 0\n\n def get_all_chunks(self) -> List[SemanticChunk]:\n \"\"\"Get all chunks from the store.\n\n Returns:\n List of SemanticChunk objects with id and content.\n \"\"\"\n with sqlite3.connect(self.db_path) as conn:\n conn.row_factory = sqlite3.Row\n rows = conn.execute(\n \"SELECT id, file_path, content, metadata FROM semantic_chunks\"\n ).fetchall()\n\n chunks = []\n for row in rows:\n chunks.append(SemanticChunk(\n id=row[\"id\"],\n content=row[\"content\"],\n file_path=row[\"file_path\"],\n metadata=json.loads(row[\"metadata\"]) if row[\"metadata\"] else None,\n ))\n return chunks\n\n def clear_cache(self) -> None:\n \"\"\"Manually clear the embedding cache.\"\"\"\n self._invalidate_cache()\n\n @property\n def ann_available(self) -> bool:\n \"\"\"Check if ANN index is available and ready.\"\"\"\n return (\n HNSWLIB_AVAILABLE\n and self._ann_index is not None\n and self._ann_index.is_loaded\n )\n\n @property\n def ann_count(self) -> int:\n \"\"\"Get number of vectors in ANN index.\"\"\"\n if self._ann_index is not None:\n return self._ann_index.count()\n return 0\n\n def get_model_config(self) -> Optional[Dict[str, Any]]:\n \"\"\"Get the model configuration used for embeddings in this store.\n\n Returns:\n Dictionary with model_profile, model_name, embedding_dim, backend, or None if not set.\n \"\"\"\n with sqlite3.connect(self.db_path) as conn:\n row = conn.execute(\n \"SELECT model_profile, model_name, embedding_dim, backend, created_at, updated_at \"\n \"FROM embeddings_config WHERE id = 1\"\n ).fetchone()\n if row:\n return {\n \"model_profile\": row[0],\n \"model_name\": row[1],\n \"embedding_dim\": row[2],\n \"backend\": row[3],\n \"created_at\": row[4],\n \"updated_at\": row[5],\n }\n return None\n\n def set_model_config(\n self, model_profile: str, model_name: str, embedding_dim: int, backend: str = 'fastembed'\n ) -> None:\n \"\"\"Set the model configuration for embeddings in this store.\n\n This should be called when generating new embeddings. 
`check_model_compatibility(model_profile, model_name, embedding_dim)` compares a requested model against that stored row: with no stored config everything is compatible; otherwise a dimension mismatch is reported first (the critical case), then a profile mismatch (two models with the same dimension can still occupy different semantic spaces), each message ending with a hint to rerun with `--force` to regenerate all embeddings. Finally, `close()` drops the cached matrix and the ANN index so temporary files can be deleted on Windows, and `__enter__` / `__exit__` make the store usable as a context manager.

The remaining hits in the dump score an order of magnitude below the `VectorStore` result and mostly match on module docstrings and import blocks. All paths are under `D:\Claude_dms3\codex-lens\src\codexlens\`:
| Hit | Score | Matched content |
|------|--------|------|
| `search/clustering/base.py` | 0.02356190140431283 | clustering-strategy ABC docstring |
| `parsers/factory.py` | 0.022717150737751757 | parser factory docstring, `Parser` protocol |
| `lsp/server.py` | 0.022282698690396483 | LSP server module docstring |
| `search/hybrid_search.py` | 0.022258499170812605 | timing helper, ranking imports |
| `storage/dir_index.py` | 0.022204010428648113 | module imports, `GlobalSymbolIndex` usage |
| `watcher/manager.py` | 0.022191896701700627 | watcher manager imports |
| `api/semantic.py` | 0.021943278996721462 | `semantic_search()` RRF-fusion docstring |
| `watcher/file_watcher.py` | 0.021943278996721462 | watchdog handler, `MAX_QUEUE_SIZE` guard |
| `semantic/reranker/api_reranker.py` | 0.02150910700179165 | env/config resolution, httpx client setup |
| `cli/embedding_manager.py` | 0.02051605801605802 | model-lock conflict check |
| `mcp/provider.py` | 0.020229904287875303 | `MCPProvider.build_context` |
| `mcp/hooks.py` | 0.020007053720837744 | `HookManager` pre/post hooks |
| `search/association_tree/deduplicator.py` | 0.019921615989390927 | `KIND_WEIGHTS` scoring table |
| `semantic/reranker/factory.py` | 0.01962803701934137 | backend dispatch, unknown-backend error |
| `storage/index_tree.py` | 0.015740967294674172 | module imports |
| `semantic/chunker.py` | 0.01569458021070924 | chunking docstring, lightweight token-count mode |
| `search/clustering/noop_strategy.py` | 0.015496521189120809 | `NoOpStrategy` docstring |
| `search/clustering/dbscan_strategy.py` | 0.014896214896214899 | `DBSCANStrategy` docstring |
| `mcp/schema.py` | 0.014112903225806453 | `MCPContext` serialization, prompt injection |
| `lsp/lsp_bridge.py` | 0.013999118165784833 | `LspBridge` defaults |
| `watcher/events.py` | 0.013999118165784833 | event dataclasses, `WatcherConfig` ignore list |
| `storage/registry.py` | 0.013902465515368743 | `RegistryStore`, `ProjectInfo` dataclasses |
| `storage/migrations/migration_001_normalize_keywords.py` | 0.013678451178451179 | keywords schema DDL |
| `api/references.py` | 0.013661202185792351 | `find_references()` implementation |
| `env_config.py` | 0.01359062143375869 | `get_api_config` env-variable mapping |
| `api/__init__.py` | 0.01359062143375869 | public API exports |
| `storage/migrations/migration_005_cleanup_unused_fields.py` | 0.013517665130568358 | metadata-cleanup migration |
| `search/clustering/factory.py` | 0.013495897553868569 | `get_strategy` usage example |
| `semantic/reranker/onnx_reranker.py` | 0.013480392156862746 | ORT model loading, provider detection |
| `semantic/ann_index.py` | 0.013403880070546739 | `ANNIndex` count/capacity properties |
| `semantic/litellm_embedder.py` | 0.01322751322751323 | `LiteLLMEmbedderWrapper` |
| `search/association_tree/builder.py` | 0.013083213083213086 | builder imports (entry truncated in the dump) |
logging.getLogger(__name__)\n\n\nclass AssociationTreeBuilder:\n\n def __init__(\n self,\n lsp_manager: StandaloneLspManager,\n timeout: float = 5.0,\n ):\n self.lsp_manager = lsp_manager\n self.timeout = timeout\n self.visited: Set[str] = set()\n\n async def build_tree(\n self,\n seed_file_path: str,\n seed_line: int,\n seed_character: int = 1,\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py", - "score": 0.012885154061624651, - "excerpt": "import re\nimport sqlite3\nfrom pathlib import Path\nfrom typing import Any, Dict, List, Optional, Tuple\n\ntry:\n from codexlens.parsers.treesitter_parser import TreeSitterSymbolParser\nexcept Exception:...", - "content": "import re\nimport sqlite3\nfrom pathlib import Path\nfrom typing import Any, Dict, List, Optional, Tuple\n\ntry:\n from codexlens.parsers.treesitter_parser import TreeSitterSymbolParser\nexcept Exception: \n TreeSitterSymbolParser = None \n\n\nclass SymbolExtractor:\n\n \n PATTERNS = {\n 'python': {\n 'function': r'^(?:async\\s+)?def\\s+(\\w+)\\s*\\(',\n 'class': r'^class\\s+(\\w+)\\s*[:\\(]',\n 'import': r'^(?:from\\s+([\\w.]+)\\s+)?import\\s+([\\w.,\\s]+)',\n 'call': r'(? Tuple[str, Optional[str], Optional[int], Optional[int]]:\n return (result.path, result.symbol_name, result.start_line, result.end_line)\n\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py", - "score": 0.008680555555555556, - "excerpt": "from typing import TYPE_CHECKING, List, Optional\n\nfrom .base import BaseClusteringStrategy, ClusteringConfig\n\nif TYPE_CHECKING:", - "content": "\"\"\"HDBSCAN-based clustering strategy for search results.\n\nHDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise)\nis the primary clustering strategy for grouping similar search results.\n\"\"\"\n\nfrom __future__ import annotations\n\nfrom typing import TYPE_CHECKING, List, Optional\n\nfrom .base import BaseClusteringStrategy, ClusteringConfig\n\nif TYPE_CHECKING:\n import numpy as np\n from codexlens.entities import SearchResult\n\n\nclass HDBSCANStrategy(BaseClusteringStrategy):\n \"\"\"HDBSCAN-based clustering strategy.\n\n Uses HDBSCAN algorithm to cluster search results based on embedding similarity.", - "source": null, - "symbol": null - } - ], - "stats": { - "dirs_searched": 17, - "files_matched": 50, - "time_ms": 7219.313144683838 - } - } -} diff --git a/codex-lens/_tmp_verbose.json b/codex-lens/_tmp_verbose.json deleted file mode 100644 index 2593104d..00000000 --- a/codex-lens/_tmp_verbose.json +++ /dev/null @@ -1,95 +0,0 @@ -{ - "success": true, - "result": { - "query": "class Config", - "method": "cascade", - "count": 10, - "results": [ - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\hybrid_search\\data_structures.py", - "score": 0.06081658330145309, - "excerpt": " @classmethod\n def from_dict(cls, data: Dict[str, Any]) -> \"CallHierarchyItem\":\n return cls(\n name=data[\"name\"],\n kind=data[\"kind\"],\n file_path=data[\"file...", - "content": " @classmethod\n def from_dict(cls, data: Dict[str, Any]) -> \"CallHierarchyItem\":\n return cls(\n name=data[\"name\"],\n kind=data[\"kind\"],\n file_path=data[\"file_path\"],\n range=Range.from_dict(data[\"range\"]),\n detail=data.get(\"detail\"),\n )\n\n\n@dataclass\nclass CodeSymbolNode:\n\n id: str\n name: str\n kind: str\n file_path: str\n range: Range\n embedding: Optional[List[float]] = None\n raw_code: 
str = \"\"\n docstring: str = \"\"\n score: float = 0.0\n\n def __post_init__(self) -> None:\n if not self.id:\n raise ValueError(\"id cannot be empty\")\n if not self.name:\n raise ValueError(\"name cannot be empty\")\n if not self.kind:\n raise ValueError(\"kind cannot be empty\")\n if not self.file_path:\n raise ValueError(\"file_path cannot be empty\")\n\n def __hash__(self) -> int:\n return hash(self.id)\n\n def __eq__(self, other: object) -> bool:\n if not isinstance(other, CodeSymbolNode):\n return False\n return self.id == other.id\n\n def to_dict(self) -> Dict[str, Any]:\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "score": 0.056576452190618645, - "excerpt": "from rich.table import Table\n\nfrom codexlens.config import Config\nfrom codexlens.entities import IndexedFile, SearchResult, Symbol\nfrom codexlens.errors import CodexLensError, ConfigError, ParseError,...", - "content": "import os\nimport shutil\nimport sqlite3\nfrom pathlib import Path\nfrom typing import Annotated, Any, Dict, Iterable, List, Optional\n\nimport typer\nfrom rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn\nfrom rich.table import Table\n\nfrom codexlens.config import Config\nfrom codexlens.entities import IndexedFile, SearchResult, Symbol\nfrom codexlens.errors import CodexLensError, ConfigError, ParseError, StorageError, SearchError\nfrom codexlens.parsers.factory import ParserFactory\nfrom codexlens.storage.path_mapper import PathMapper\nfrom codexlens.storage.registry import RegistryStore, ProjectInfo\nfrom codexlens.storage.index_tree import IndexTreeBuilder\nfrom codexlens.storage.dir_index import DirIndexStore\nfrom codexlens.search.chain_search import ChainSearchEngine, SearchOptions\nfrom codexlens.watcher import WatcherManager, WatcherConfig\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "score": 0.05655744432847353, - "excerpt": "\"\"\"Configuration system for CodexLens.\"\"\"\n\nfrom __future__ import annotations", - "content": "\"\"\"Configuration system for CodexLens.\"\"\"\n\nfrom __future__ import annotations\n\nimport json\nimport logging\nimport os\nfrom dataclasses import dataclass, field\nfrom functools import cached_property\nfrom pathlib import Path\nfrom typing import Any, Dict, List, Optional", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "score": 0.049219375000264694, - "excerpt": "\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\nfrom dataclasses import dataclass, field\nfrom pathlib import Path\nfrom typing import List, Optional, Dict, Any, Literal, Tuple, TYPE_CH...", - "content": "\"\"\"Chain search engine for recursive multi-directory searching.\n\nProvides parallel search across directory hierarchies using indexed _index.db files.\nSupports depth-limited traversal, result aggregation, and symbol search.\n\"\"\"\n\nfrom __future__ import annotations\n\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\nfrom dataclasses import dataclass, field\nfrom pathlib import Path\nfrom typing import List, Optional, Dict, Any, Literal, Tuple, TYPE_CHECKING\nimport json\nimport logging\nimport os\nimport time\n\nfrom codexlens.entities import SearchResult, Symbol\n\nif TYPE_CHECKING:", - "source": null, - "symbol": null - }, - { - "path": 
"D:\\Claude_dms3\\codex-lens\\src\\codexlens\\indexing\\embedding.py", - "score": 0.047931429239828446, - "excerpt": " def __init__(\n self,\n model_name: Optional[str] = None,\n use_gpu: bool = True,\n expand_dim: bool = True,\n ) -> None:\n from codexlens.semantic import SEMANTIC_...", - "content": " def __init__(\n self,\n model_name: Optional[str] = None,\n use_gpu: bool = True,\n expand_dim: bool = True,\n ) -> None:\n from codexlens.semantic import SEMANTIC_AVAILABLE\n\n if not SEMANTIC_AVAILABLE:\n raise ImportError(\n \"Semantic search dependencies not available. \"\n \"Install with: pip install codexlens[semantic]\"\n )\n\n self._model_name = model_name or self.DEFAULT_MODEL\n self._use_gpu = use_gpu\n self._expand_dim = expand_dim\n self._model = None\n self._native_dim: Optional[int] = None\n\n \n self._expansion_matrix: Optional[np.ndarray] = None\n\n @property\n def model_name(self) -> str:\n return self._model_name\n\n @property\n def embedding_dim(self) -> int:\n if self._expand_dim:\n return self.TARGET_DIM\n \n if self._native_dim is not None:\n return self._native_dim\n \n model_dims = {\n \"BAAI/bge-large-en-v1.5\": 1024,\n \"BAAI/bge-base-en-v1.5\": 768,\n \"BAAI/bge-small-en-v1.5\": 384,\n \"intfloat/multilingual-e5-large\": 1024,\n }\n return model_dims.get(self._model_name, 1024)\n\n @property\n def max_tokens(self) -> int:\n return 512 \n\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py", - "score": 0.04283104206542711, - "excerpt": "import threading\nimport time\nfrom dataclasses import dataclass, field\nfrom enum import Enum\nfrom typing import Any, Dict, Iterable, List, Optional", - "content": "Provides intelligent load balancing across multiple LiteLLM embedding endpoints\nto maximize throughput while respecting rate limits.\n\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nimport random\nimport threading\nimport time\nfrom dataclasses import dataclass, field\nfrom enum import Enum\nfrom typing import Any, Dict, Iterable, List, Optional\n\nimport numpy as np\n\nfrom .base import BaseEmbedder\n\nlogger = logging.getLogger(__name__)\n\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py", - "score": 0.036886112765573215, - "excerpt": "- Direct subprocess spawning of language servers\n- JSON-RPC 2.0 communication over stdin/stdout\n- Multi-language support via configuration file (lsp-servers.json)\n- Process lifecycle management with a...", - "content": "\"\"\"Standalone Language Server Manager for direct LSP communication.\n\nThis module provides direct communication with language servers via JSON-RPC over stdio,\neliminating the need for VSCode Bridge. 
Similar to cclsp architecture.\n\nFeatures:\n- Direct subprocess spawning of language servers\n- JSON-RPC 2.0 communication over stdin/stdout\n- Multi-language support via configuration file (lsp-servers.json)\n- Process lifecycle management with auto-restart\n- Compatible interface with existing LspBridge\n\"\"\"\n\nfrom __future__ import annotations\n\nimport asyncio\nimport json\nimport logging\nimport os", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\api\\models.py", - "score": 0.03448209080810879, - "excerpt": " container: Containing class/module (if any)\n score: Match score for ranking\n return {k: v for k, v in asdict(self).items() if v is not None}\n\n\n# =================================...", - "content": " container: Containing class/module (if any)\n score: Match score for ranking\n return {k: v for k, v in asdict(self).items() if v is not None}\n\n\n# =============================================================================\n# Section 4.4: find_references dataclasses\n# =============================================================================\n\n@dataclass\nclass ReferenceResult:\n file_path: str\n line: int\n column: int\n context_line: str\n relationship: str # call | import | type_annotation | inheritance\n\n def to_dict(self) -> dict:\n return asdict(self)\n\n\n@dataclass\nclass GroupedReferences:\n definition: DefinitionResult\n references: List[ReferenceResult] = field(default_factory=list)\n\n def to_dict(self) -> dict:\n return {\n \"definition\": self.definition.to_dict(),\n \"references\": [r.to_dict() for r in self.references],\n }\n\n\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py", - "score": 0.03341093379138448, - "excerpt": "\n if TREE_SITTER_AVAILABLE:\n self._initialize_parser()\n\n def _initialize_parser(self) -> None:\n if TreeSitterParser is None or TreeSitterLanguage is None:\n retur...", - "content": "\n if TREE_SITTER_AVAILABLE:\n self._initialize_parser()\n\n def _initialize_parser(self) -> None:\n if TreeSitterParser is None or TreeSitterLanguage is None:\n return\n\n try:\n \n if self.language_id == \"python\":\n import tree_sitter_python\n self._language = TreeSitterLanguage(tree_sitter_python.language())\n elif self.language_id == \"javascript\":\n import tree_sitter_javascript\n self._language = TreeSitterLanguage(tree_sitter_javascript.language())\n elif self.language_id == \"typescript\":\n import tree_sitter_typescript\n \n if self.path is not None and self.path.suffix.lower() == \".tsx\":\n self._language = TreeSitterLanguage(tree_sitter_typescript.language_tsx())\n else:\n self._language = TreeSitterLanguage(tree_sitter_typescript.language_typescript())\n else:\n return\n\n \n self._parser = TreeSitterParser()\n if hasattr(self._parser, \"set_language\"):\n self._parser.set_language(self._language) \n else:\n self._parser.language = self._language \n\n except Exception:\n \n self._parser = None\n self._language = None\n\n def is_available(self) -> bool:\n return self._parser is not None and self._language is not None\n\n def _parse_tree(self, text: str) -> Optional[tuple[bytes, TreeSitterNode]]:\n if not self.is_available() or self._parser is None:\n", - "source": null, - "symbol": null - }, - { - "path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\watcher\\incremental_indexer.py", - "score": 0.029568673189485736, - "excerpt": "\nimport logging\nfrom dataclasses import dataclass\nfrom pathlib 
import Path\nfrom typing import List, Optional", - "content": "\"\"\"Incremental indexer for processing file changes.\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import List, Optional\n\nfrom codexlens.config import Config\nfrom codexlens.parsers.factory import ParserFactory\nfrom codexlens.storage.dir_index import DirIndexStore\nfrom codexlens.storage.global_index import GlobalSymbolIndex\nfrom codexlens.storage.path_mapper import PathMapper\nfrom codexlens.storage.registry import RegistryStore\n", - "source": null, - "symbol": null - } - ], - "stats": { - "dirs_searched": 17, - "files_matched": 10, - "time_ms": 6667.8361892700195 - } - } -} diff --git a/delete-vscode-bridge.js b/delete-vscode-bridge.js deleted file mode 100644 index a86acea6..00000000 --- a/delete-vscode-bridge.js +++ /dev/null @@ -1,25 +0,0 @@ -import { rmSync } from 'fs'; -import { fileURLToPath } from 'url'; -import { dirname, join } from 'path'; - -const __filename = fileURLToPath(import.meta.url); -const __dirname = dirname(__filename); - -try { - // Delete ccw-vscode-bridge directory - const bridgeDir = join(__dirname, 'ccw-vscode-bridge'); - console.log('Deleting:', bridgeDir); - rmSync(bridgeDir, { recursive: true, force: true }); - console.log('✓ Deleted ccw-vscode-bridge directory'); - - // Delete vscode-lsp.ts file - const vscodeLspFile = join(__dirname, 'ccw', 'src', 'tools', 'vscode-lsp.ts'); - console.log('Deleting:', vscodeLspFile); - rmSync(vscodeLspFile, { force: true }); - console.log('✓ Deleted vscode-lsp.ts file'); - - console.log('\nCleanup complete!'); -} catch (error) { - console.error('Error:', error.message); - process.exit(1); -}