diff --git a/codex-lens/.coverage b/codex-lens/.coverage deleted file mode 100644 index 57c05009..00000000 Binary files a/codex-lens/.coverage and /dev/null differ diff --git a/codex-lens/.env.example b/codex-lens/.env.example deleted file mode 100644 index 67a5989b..00000000 --- a/codex-lens/.env.example +++ /dev/null @@ -1,71 +0,0 @@ -# CodexLens Environment Configuration -# -# Configuration locations (copy to one of these): -# - ~/.codexlens/.env (global, applies to all projects) -# - project/.codexlens/.env (workspace-local) -# - project/.env (project root) -# -# Priority order (later overrides earlier): -# 1. Environment variables (already set in shell) - highest -# 2. .codexlens/.env (workspace-local) -# 3. .env (project root) -# 4. ~/.codexlens/.env (global) - lowest - -# ============================================ -# RERANKER Configuration -# ============================================ - -# API key for reranker service (SiliconFlow/Cohere/Jina) -# Required for 'api' backend -# RERANKER_API_KEY=sk-xxxx - -# Base URL for reranker API (overrides provider default) -# SiliconFlow: https://api.siliconflow.cn -# Cohere: https://api.cohere.ai -# Jina: https://api.jina.ai -# RERANKER_API_BASE=https://api.siliconflow.cn - -# Reranker provider: siliconflow, cohere, jina -# RERANKER_PROVIDER=siliconflow - -# Reranker model name -# SiliconFlow: BAAI/bge-reranker-v2-m3 -# Cohere: rerank-english-v3.0 -# Jina: jina-reranker-v2-base-multilingual -# RERANKER_MODEL=BAAI/bge-reranker-v2-m3 - -# ============================================ -# EMBEDDING Configuration -# ============================================ - -# API key for embedding service (for litellm backend) -# EMBEDDING_API_KEY=sk-xxxx - -# Base URL for embedding API -# EMBEDDING_API_BASE=https://api.openai.com - -# Embedding model name -# EMBEDDING_MODEL=text-embedding-3-small - -# ============================================ -# LITELLM Configuration -# ============================================ - -# API key 
for LiteLLM (for litellm reranker backend) -# LITELLM_API_KEY=sk-xxxx - -# Base URL for LiteLLM -# LITELLM_API_BASE= - -# LiteLLM model name -# LITELLM_MODEL=gpt-4o-mini - -# ============================================ -# General Configuration -# ============================================ - -# Custom data directory path (default: ~/.codexlens) -# CODEXLENS_DATA_DIR=~/.codexlens - -# Enable debug mode (true/false) -# CODEXLENS_DEBUG=false diff --git a/codex-lens/.github/workflows/security.yml b/codex-lens/.github/workflows/security.yml deleted file mode 100644 index 0ee464df..00000000 --- a/codex-lens/.github/workflows/security.yml +++ /dev/null @@ -1,70 +0,0 @@ -# Security scanning workflow for codex-lens -# Runs pip-audit to check for known vulnerabilities in dependencies - -name: Security Scan - -on: - # Run on push to main branch - push: - branches: - - main - - master - # Run weekly on Sundays at 00:00 UTC - schedule: - - cron: '0 0 * * 0' - # Allow manual trigger - workflow_dispatch: - -jobs: - security-audit: - name: Dependency Vulnerability Scan - runs-on: ubuntu-latest - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.10' - cache: 'pip' - - - name: Install pip-audit - run: | - python -m pip install --upgrade pip - pip install pip-audit - - - name: Run pip-audit on requirements.in - run: pip-audit --requirement requirements.in - continue-on-error: false - - - name: Run pip-audit on pyproject.toml dependencies - run: pip-audit --project-path . 
- continue-on-error: false - - - name: Check for safety issues - run: | - pip install safety - safety check --json || true - continue-on-error: true - - bandit-security: - name: Code Security Linting - runs-on: ubuntu-latest - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.10' - - - name: Install bandit - run: pip install bandit[toml] - - - name: Run bandit security linter - run: bandit -r src/ -ll -i - continue-on-error: true diff --git a/codex-lens/.pytest-temp/test_builder_loads_saved_ignor0/codexlens-home/settings.json b/codex-lens/.pytest-temp/test_builder_loads_saved_ignor0/codexlens-home/settings.json deleted file mode 100644 index 1e88a530..00000000 --- a/codex-lens/.pytest-temp/test_builder_loads_saved_ignor0/codexlens-home/settings.json +++ /dev/null @@ -1 +0,0 @@ -{"ignore_patterns": ["frontend/dist"], "extension_filters": ["*.min.js"]} \ No newline at end of file diff --git a/codex-lens/.pytest-temp/test_builder_loads_saved_ignor0/frontend/app.ts b/codex-lens/.pytest-temp/test_builder_loads_saved_ignor0/frontend/app.ts deleted file mode 100644 index a1fe9822..00000000 --- a/codex-lens/.pytest-temp/test_builder_loads_saved_ignor0/frontend/app.ts +++ /dev/null @@ -1 +0,0 @@ -export const app = 1 diff --git a/codex-lens/.pytest-temp/test_builder_loads_saved_ignor0/frontend/bundle.min.js b/codex-lens/.pytest-temp/test_builder_loads_saved_ignor0/frontend/bundle.min.js deleted file mode 100644 index 840b8f69..00000000 --- a/codex-lens/.pytest-temp/test_builder_loads_saved_ignor0/frontend/bundle.min.js +++ /dev/null @@ -1 +0,0 @@ -export const bundle = 1 diff --git a/codex-lens/.pytest-temp/test_builder_loads_saved_ignor0/frontend/dist/compiled.ts b/codex-lens/.pytest-temp/test_builder_loads_saved_ignor0/frontend/dist/compiled.ts deleted file mode 100644 index 5e28e82e..00000000 --- 
a/codex-lens/.pytest-temp/test_builder_loads_saved_ignor0/frontend/dist/compiled.ts +++ /dev/null @@ -1 +0,0 @@ -export const compiled = 1 diff --git a/codex-lens/.pytest-temp/test_collect_dirs_by_depth_res0/frontend/dist/bundle.ts b/codex-lens/.pytest-temp/test_collect_dirs_by_depth_res0/frontend/dist/bundle.ts deleted file mode 100644 index 840b8f69..00000000 --- a/codex-lens/.pytest-temp/test_collect_dirs_by_depth_res0/frontend/dist/bundle.ts +++ /dev/null @@ -1 +0,0 @@ -export const bundle = 1 diff --git a/codex-lens/.pytest-temp/test_collect_dirs_by_depth_res0/frontend/src/app.ts b/codex-lens/.pytest-temp/test_collect_dirs_by_depth_res0/frontend/src/app.ts deleted file mode 100644 index a1fe9822..00000000 --- a/codex-lens/.pytest-temp/test_collect_dirs_by_depth_res0/frontend/src/app.ts +++ /dev/null @@ -1 +0,0 @@ -export const app = 1 diff --git a/codex-lens/.pytest-temp/test_collect_dirs_by_depth_ski0/.next/generated.py b/codex-lens/.pytest-temp/test_collect_dirs_by_depth_ski0/.next/generated.py deleted file mode 100644 index d3504097..00000000 --- a/codex-lens/.pytest-temp/test_collect_dirs_by_depth_ski0/.next/generated.py +++ /dev/null @@ -1 +0,0 @@ -print('artifact') diff --git a/codex-lens/.pytest-temp/test_collect_dirs_by_depth_ski0/.parcel-cache/generated.py b/codex-lens/.pytest-temp/test_collect_dirs_by_depth_ski0/.parcel-cache/generated.py deleted file mode 100644 index d3504097..00000000 --- a/codex-lens/.pytest-temp/test_collect_dirs_by_depth_ski0/.parcel-cache/generated.py +++ /dev/null @@ -1 +0,0 @@ -print('artifact') diff --git a/codex-lens/.pytest-temp/test_collect_dirs_by_depth_ski0/.turbo/generated.py b/codex-lens/.pytest-temp/test_collect_dirs_by_depth_ski0/.turbo/generated.py deleted file mode 100644 index d3504097..00000000 --- a/codex-lens/.pytest-temp/test_collect_dirs_by_depth_ski0/.turbo/generated.py +++ /dev/null @@ -1 +0,0 @@ -print('artifact') diff --git a/codex-lens/.pytest-temp/test_collect_dirs_by_depth_ski0/build/generated.py 
b/codex-lens/.pytest-temp/test_collect_dirs_by_depth_ski0/build/generated.py deleted file mode 100644 index d3504097..00000000 --- a/codex-lens/.pytest-temp/test_collect_dirs_by_depth_ski0/build/generated.py +++ /dev/null @@ -1 +0,0 @@ -print('artifact') diff --git a/codex-lens/.pytest-temp/test_collect_dirs_by_depth_ski0/coverage/generated.py b/codex-lens/.pytest-temp/test_collect_dirs_by_depth_ski0/coverage/generated.py deleted file mode 100644 index d3504097..00000000 --- a/codex-lens/.pytest-temp/test_collect_dirs_by_depth_ski0/coverage/generated.py +++ /dev/null @@ -1 +0,0 @@ -print('artifact') diff --git a/codex-lens/.pytest-temp/test_collect_dirs_by_depth_ski0/dist/generated.py b/codex-lens/.pytest-temp/test_collect_dirs_by_depth_ski0/dist/generated.py deleted file mode 100644 index d3504097..00000000 --- a/codex-lens/.pytest-temp/test_collect_dirs_by_depth_ski0/dist/generated.py +++ /dev/null @@ -1 +0,0 @@ -print('artifact') diff --git a/codex-lens/.pytest-temp/test_collect_dirs_by_depth_ski0/out/generated.py b/codex-lens/.pytest-temp/test_collect_dirs_by_depth_ski0/out/generated.py deleted file mode 100644 index d3504097..00000000 --- a/codex-lens/.pytest-temp/test_collect_dirs_by_depth_ski0/out/generated.py +++ /dev/null @@ -1 +0,0 @@ -print('artifact') diff --git a/codex-lens/.pytest-temp/test_collect_dirs_by_depth_ski0/src/app.py b/codex-lens/.pytest-temp/test_collect_dirs_by_depth_ski0/src/app.py deleted file mode 100644 index 1fca9fb5..00000000 --- a/codex-lens/.pytest-temp/test_collect_dirs_by_depth_ski0/src/app.py +++ /dev/null @@ -1 +0,0 @@ -print('ok') diff --git a/codex-lens/.pytest-temp/test_collect_dirs_by_depth_ski0/target/generated.py b/codex-lens/.pytest-temp/test_collect_dirs_by_depth_ski0/target/generated.py deleted file mode 100644 index d3504097..00000000 --- a/codex-lens/.pytest-temp/test_collect_dirs_by_depth_ski0/target/generated.py +++ /dev/null @@ -1 +0,0 @@ -print('artifact') diff --git 
a/codex-lens/.pytest-temp/test_iter_source_files_respect0/frontend/app.ts b/codex-lens/.pytest-temp/test_iter_source_files_respect0/frontend/app.ts deleted file mode 100644 index a1fe9822..00000000 --- a/codex-lens/.pytest-temp/test_iter_source_files_respect0/frontend/app.ts +++ /dev/null @@ -1 +0,0 @@ -export const app = 1 diff --git a/codex-lens/.pytest-temp/test_iter_source_files_respect0/frontend/bundle.min.js b/codex-lens/.pytest-temp/test_iter_source_files_respect0/frontend/bundle.min.js deleted file mode 100644 index 840b8f69..00000000 --- a/codex-lens/.pytest-temp/test_iter_source_files_respect0/frontend/bundle.min.js +++ /dev/null @@ -1 +0,0 @@ -export const bundle = 1 diff --git a/codex-lens/.pytest-temp/test_iter_source_files_respect0/frontend/skip.ts b/codex-lens/.pytest-temp/test_iter_source_files_respect0/frontend/skip.ts deleted file mode 100644 index 8e5bef4c..00000000 --- a/codex-lens/.pytest-temp/test_iter_source_files_respect0/frontend/skip.ts +++ /dev/null @@ -1 +0,0 @@ -export const skip = 1 diff --git a/codex-lens/.pytest-temp/test_load_settings_reads_ignor0/settings.json b/codex-lens/.pytest-temp/test_load_settings_reads_ignor0/settings.json deleted file mode 100644 index 8af70431..00000000 --- a/codex-lens/.pytest-temp/test_load_settings_reads_ignor0/settings.json +++ /dev/null @@ -1 +0,0 @@ -{"ignore_patterns": ["frontend/dist", "coverage"], "extension_filters": ["*.min.js", "*.map"]} \ No newline at end of file diff --git a/codex-lens/.pytest-temp/test_should_index_dir_ignores_0/package/dist/bundle.py b/codex-lens/.pytest-temp/test_should_index_dir_ignores_0/package/dist/bundle.py deleted file mode 100644 index 89755a72..00000000 --- a/codex-lens/.pytest-temp/test_should_index_dir_ignores_0/package/dist/bundle.py +++ /dev/null @@ -1 +0,0 @@ -print('compiled') diff --git a/codex-lens/ASSOCIATION_TREE_IMPLEMENTATION.md b/codex-lens/ASSOCIATION_TREE_IMPLEMENTATION.md deleted file mode 100644 index 062882f1..00000000 --- 
a/codex-lens/ASSOCIATION_TREE_IMPLEMENTATION.md +++ /dev/null @@ -1,240 +0,0 @@ -# Association Tree Implementation Summary - -## Overview - -Successfully implemented LSP-based association tree search for CodexLens. The implementation consists of two core components that work together to discover and rank code relationships using Language Server Protocol (LSP) call hierarchy capabilities. - -## Components Implemented - -### 1. AssociationTreeBuilder (`src/codexlens/search/association_tree/builder.py`) - -**Purpose**: Build call relationship trees from seed locations using LSP - -**Key Features**: -- Depth-first recursive expansion from seed positions -- Supports bidirectional expansion: - - Incoming calls (callers) - who calls this function - - Outgoing calls (callees) - what this function calls -- Automatic cycle detection and marking -- Configurable max depth (default: 5) -- Async/await with parallel expansion -- Timeout handling (5s per LSP request) -- Graceful error handling - -**Core Methods**: -- `build_tree()`: Main entry point for tree construction -- `_expand_node()`: Recursive DFS expansion -- `_expand_incoming_calls()`: Process callers -- `_expand_outgoing_calls()`: Process callees - -### 2. ResultDeduplicator (`src/codexlens/search/association_tree/deduplicator.py`) - -**Purpose**: Extract unique nodes from trees and assign relevance scores - -**Scoring Algorithm**: -``` -Score = 0.4 * depth_score + 0.3 * frequency_score + 0.3 * kind_score - -where: -- depth_score: 1.0 at depth 0, decreasing to 0.0 at depth 10 -- frequency_score: occurrences / max_occurrences -- kind_score: function/method (1.0) > class (0.8) > variable (0.4) -``` - -**Key Features**: -- Deduplication by (file_path, start_line, end_line) -- Merge duplicate nodes across different paths -- Track minimum depth and occurrence count -- Configurable score weights -- Filter by kind or file pattern -- JSON serialization support - -### 3. 
Data Structures (`src/codexlens/search/association_tree/data_structures.py`) - -**TreeNode**: -- Represents a single node in the call tree -- Tracks depth, parents, children, paths -- Marks circular references - -**CallTree**: -- Complete tree structure with roots and edges -- Node lookup by ID -- Edge tracking for relationship visualization - -**UniqueNode**: -- Deduplicated result with metadata -- Aggregates multiple occurrences -- Contains relevance score - -## Integration with StandaloneLspManager - -Extended `StandaloneLspManager` with missing method: - -**Added**: `get_outgoing_calls()` method (`src/codexlens/lsp/standalone_manager.py:1057-1086`) - -This method complements the existing `get_incoming_calls()` to enable bidirectional call tree traversal. - -## Testing - -Comprehensive test suite with 9 tests covering: - -1. **Simple tree building**: Basic tree construction -2. **Cycle detection**: Circular reference handling -3. **Max depth limits**: Depth boundary enforcement -4. **Empty trees**: Edge case handling -5. **Basic deduplication**: Node merging logic -6. **Scoring algorithm**: Relevance ranking -7. **Max results limit**: Result pagination -8. **Kind filtering**: Symbol type filtering -9. 
**Serialization**: JSON export - -**Test Results**: All 9 tests passing ✅ - -**Test File**: `tests/test_association_tree.py` - -## Usage Example - -```python -import asyncio -from codexlens.lsp.standalone_manager import StandaloneLspManager -from codexlens.search.association_tree import ( - AssociationTreeBuilder, - ResultDeduplicator, -) - -async def search_with_association_tree(file_path: str, line: int): - async with StandaloneLspManager(workspace_root="/path/to/project") as lsp: - # Build tree - builder = AssociationTreeBuilder(lsp) - tree = await builder.build_tree( - seed_file_path=file_path, - seed_line=line, - max_depth=5, - expand_callers=True, - expand_callees=True, - ) - - # Deduplicate and score - deduplicator = ResultDeduplicator() - unique_nodes = deduplicator.deduplicate(tree, max_results=20) - - # Return results - return deduplicator.to_dict_list(unique_nodes) - -# Run -results = asyncio.run(search_with_association_tree("src/main.py", 42)) -``` - -## Integration Point - -The components can be integrated into `HybridSearchEngine`: - -```python -# In hybrid_search.py -async def _search_association_tree(self, query: str, limit: int): - # 1. Get seed results from vector search - seed_results = await self._search_vector(query, limit=5) - - # 2. Build association trees - builder = AssociationTreeBuilder(self.lsp_manager) - tree = await builder.build_tree( - seed_file_path=seed_results[0].file_path, - seed_line=seed_results[0].line, - max_depth=5, - ) - - # 3. Deduplicate and rank - deduplicator = ResultDeduplicator() - unique_nodes = deduplicator.deduplicate(tree, max_results=limit) - - # 4. 
Convert to search results - return self._convert_to_search_results(unique_nodes) -``` - -## File Structure - -``` -src/codexlens/search/association_tree/ -├── __init__.py # Module exports -├── builder.py # AssociationTreeBuilder -├── data_structures.py # TreeNode, CallTree, UniqueNode -├── deduplicator.py # ResultDeduplicator -└── README.md # Documentation - -tests/ -└── test_association_tree.py # Unit tests (9 tests) - -examples/ -└── association_tree_demo.py # Demo script -``` - -## Performance Characteristics - -**Time Complexity**: -- Tree building: O(nodes * avg_calls) with max_depth limit -- Deduplication: O(n log n) for sorting - -**Space Complexity**: -- Tree: O(nodes + edges) -- Unique nodes: O(unique_symbols) - -**Typical Performance** (max_depth=5): -- Small codebase: < 1s -- Medium codebase: 1-3s -- Large codebase: 3-10s - -**Optimization Strategies**: -1. Limit max_depth (recommended: 3-5) -2. Use timeouts (default: 5s per node) -3. Enable parallel expansion (default: on) -4. Filter by symbol kind early - -## Error Handling - -The implementation handles: -- ✅ LSP timeouts (logs warning, continues) -- ✅ Missing call hierarchy support (returns empty tree) -- ✅ Connection failures (skips node, continues) -- ✅ Invalid LSP responses (logs error, skips) -- ✅ Circular references (marks cycle, stops recursion) -- ✅ Max depth exceeded (stops expansion) - -## Code Quality - -**Code Style**: -- Python 3.10+ features (type hints, dataclasses) -- Follows existing CodexLens conventions -- Comprehensive docstrings -- Async/await throughout - -**Testing**: -- 9 unit tests with mock LSP -- Edge cases covered -- 100% core logic coverage - -**Documentation**: -- Module README with examples -- Inline code documentation -- Demo script provided -- Integration guide included - -## Next Steps - -Recommended enhancements: - -1. **Multi-seed building**: Build trees from multiple seeds simultaneously -2. **Graph visualization**: Export to DOT/Mermaid format -3. 
**Incremental updates**: Update trees based on code changes -4. **Custom scoring**: Pluggable scoring functions -5. **Caching**: Cache frequently-accessed trees -6. **Cross-language support**: Extend beyond Python (TypeScript, Java, etc.) - -## Conclusion - -The association tree implementation provides a robust foundation for LSP-based code relationship discovery in CodexLens. All core components are implemented, tested, and ready for integration into the hybrid search engine. - -**Status**: ✅ Complete and tested -**Files Modified**: 4 -**Files Created**: 7 -**Tests Added**: 9 -**All Tests Passing**: Yes diff --git a/codex-lens/CHAIN_SEARCH_IMPLEMENTATION.md b/codex-lens/CHAIN_SEARCH_IMPLEMENTATION.md deleted file mode 100644 index f0792a39..00000000 --- a/codex-lens/CHAIN_SEARCH_IMPLEMENTATION.md +++ /dev/null @@ -1,245 +0,0 @@ -# Chain Search Implementation Summary - -## Files Created - -### 1. `D:\Claude_dms3\codex-lens\src\codexlens\search\__init__.py` -Module initialization file exporting all public classes and functions: -- `ChainSearchEngine` -- `SearchOptions` -- `SearchStats` -- `ChainSearchResult` -- `quick_search` - -### 2. 
`D:\Claude_dms3\codex-lens\src\codexlens\search\chain_search.py` -Complete implementation of the chain search engine (460+ lines) with: - -#### Classes - -**SearchOptions** -- Configuration dataclass for search behavior -- Controls depth, parallelism, result limits -- Supports files-only and symbol search modes - -**SearchStats** -- Search execution statistics -- Tracks directories searched, files matched, timing, errors - -**ChainSearchResult** -- Comprehensive search result container -- Includes results, symbols, and execution statistics - -**ChainSearchEngine** -- Main parallel search engine -- Thread-safe with ThreadPoolExecutor -- Supports recursive directory traversal -- Implements result aggregation and deduplication - -#### Key Methods - -**Public API:** -- `search()` - Main search with full results -- `search_files_only()` - Fast file path-only search -- `search_symbols()` - Symbol search across hierarchy - -**Internal Methods:** -- `_find_start_index()` - Locate starting index for source path -- `_collect_index_paths()` - Recursive index path collection via subdirs -- `_search_parallel()` - Parallel ThreadPoolExecutor search -- `_search_single_index()` - Single index search with error handling -- `_merge_and_rank()` - Result deduplication and ranking -- `_search_symbols_parallel()` - Parallel symbol search -- `_search_symbols_single()` - Single index symbol search - -**Convenience Function:** -- `quick_search()` - One-line search with auto-initialization - -## Implementation Features - -### 1. Chain Traversal -- Starts from source path, finds nearest index -- Recursively collects subdirectory indexes via `subdirs` table -- Supports depth limiting (-1 = unlimited, 0 = current only) -- Prevents duplicate traversal with visited set - -### 2. 
Parallel Execution -- Uses ThreadPoolExecutor for concurrent searches -- Configurable worker count (default: 8) -- Error-tolerant: individual index failures don't block overall search -- Collects results as futures complete - -### 3. Result Processing -- **Deduplication**: By file path, keeping highest score -- **Ranking**: BM25 score descending -- **Limiting**: Per-directory and total limits -- **Statistics**: Comprehensive execution metrics - -### 4. Search Modes -- **Full search**: Results with excerpts and scores -- **Files-only**: Fast path-only mode -- **Symbol search**: Cross-directory symbol lookup - -### 5. Error Handling -- Graceful degradation on index errors -- Missing index warnings logged -- Error tracking in SearchStats -- Non-blocking failure mode - -## Search Flow Example - -``` -search("auth", path="D:/project/src", depth=-1) - | - v - [1] _find_start_index - registry.find_index_path("D:/project/src") - -> ~/.codexlens/indexes/D/project/src/_index.db - | - v - [2] _collect_index_paths (chain traversal) - src/_index.db - +-- subdirs: [api, utils] - | - +-- api/_index.db - | +-- subdirs: [] - | - +-- utils/_index.db - +-- subdirs: [] - - Result: [src/_index.db, api/_index.db, utils/_index.db] - | - v - [3] _search_parallel (ThreadPoolExecutor) - Thread1: src/ -> FTS search - Thread2: api/ -> FTS search - Thread3: utils/ -> FTS search - | - v - [4] _merge_and_rank - - Deduplicate by path - - Sort by score descending - - Apply total_limit - | - v - ChainSearchResult -``` - -## Testing - -### Test File: `D:\Claude_dms3\codex-lens\test_chain_search.py` -Comprehensive test suite with four test functions: - -1. **test_basic_search()** - Full search with all options -2. **test_quick_search()** - Convenience function test -3. **test_symbol_search()** - Symbol search across hierarchy -4. 
**test_files_only_search()** - Fast file-only mode - -### Test Results -- All imports successful -- All tests pass without errors -- Returns empty results (expected - no indexes built yet) -- Logging shows proper "No index found" warnings -- No crashes or exceptions - -## Integration Points - -### Dependencies -- `codexlens.entities`: SearchResult, Symbol -- `codexlens.storage.registry`: RegistryStore, DirMapping -- `codexlens.storage.dir_index`: DirIndexStore, SubdirLink -- `codexlens.storage.path_mapper`: PathMapper - -### Thread Safety -- Uses ThreadPoolExecutor for parallel searches -- Each thread gets own DirIndexStore connection -- SQLite WAL mode supports concurrent reads -- Registry uses thread-local connections - -## Usage Examples - -### Basic Search -```python -from pathlib import Path -from codexlens.search import ChainSearchEngine -from codexlens.storage.registry import RegistryStore -from codexlens.storage.path_mapper import PathMapper - -registry = RegistryStore() -registry.initialize() -mapper = PathMapper() -engine = ChainSearchEngine(registry, mapper) - -result = engine.search("authentication", Path("D:/project/src")) -print(f"Found {len(result.results)} matches in {result.stats.time_ms:.2f}ms") -``` - -### Quick Search -```python -from pathlib import Path -from codexlens.search import quick_search - -results = quick_search("TODO", Path("D:/project"), depth=2) -for r in results[:5]: - print(f"{r.path}: {r.score:.2f}") -``` - -### Symbol Search -```python -symbols = engine.search_symbols("init", Path("D:/project"), kind="function") -for sym in symbols: - print(f"{sym.name} - lines {sym.range[0]}-{sym.range[1]}") -``` - -### Files-Only Mode -```python -paths = engine.search_files_only("config", Path("D:/project")) -print(f"Files with 'config': {len(paths)}") -``` - -## Performance Characteristics - -### Strengths -- **Parallel execution**: Multiple indexes searched concurrently -- **Lazy traversal**: Only loads needed subdirectories -- **Memory 
efficient**: Streaming results, no full tree in memory -- **Depth limiting**: Can restrict search scope - -### Considerations -- **First search slower**: Needs to traverse subdir links -- **Many small dirs**: Overhead from thread pool -- **Deep hierarchies**: Depth=-1 may be slow on large trees - -### Optimization Tips -- Use `depth` parameter to limit scope -- Use `limit_per_dir` to reduce per-index overhead -- Use `files_only=True` when excerpts not needed -- Reuse ChainSearchEngine instance for multiple searches - -## Code Quality - -### Standards Met -- **Type annotations**: Full typing on all methods -- **Docstrings**: Complete with examples and parameter docs -- **Error handling**: Graceful degradation, no crashes -- **ASCII-only**: Windows GBK compatible -- **No debug spam**: Clean logging at appropriate levels -- **Thread safety**: Proper locking and pooling - -### Design Patterns -- **Dataclasses**: Clean configuration and result objects -- **Context managers**: Proper resource cleanup -- **Dependency injection**: Registry and mapper passed in -- **Builder pattern**: SearchOptions for configuration -- **Template method**: _search_single_index extensible - -## Status: Complete and Tested - -All requirements met: -- [x] Parallel search with ThreadPoolExecutor -- [x] Chain traversal via subdirs links -- [x] Depth limiting -- [x] Error tolerance -- [x] Search statistics -- [x] Complete docstrings and type hints -- [x] Test suite passes -- [x] ASCII-only output (GBK compatible) -- [x] Integration with existing codebase diff --git a/codex-lens/CHANGELOG.md b/codex-lens/CHANGELOG.md deleted file mode 100644 index 86391b9e..00000000 --- a/codex-lens/CHANGELOG.md +++ /dev/null @@ -1,41 +0,0 @@ -# CodexLens – Optimization Plan Changelog - -This changelog tracks the **CodexLens optimization plan** milestones (not the Python package version in `pyproject.toml`). - -## v1.0 (Optimization) – 2025-12-26 - -### Optimizations - -1. 
**P0: Context-aware hybrid chunking** - - Docstrings are extracted into dedicated chunks and excluded from code chunks. - - Docstring chunks include `parent_symbol` metadata when the docstring belongs to a function/class/method. - - Sliding-window chunk boundaries are deterministic for identical input. - -2. **P1: Adaptive RRF weights (QueryIntent)** - - Query intent is classified as `keyword` / `semantic` / `mixed`. - - RRF weights adapt to intent: - - `keyword`: exact-heavy (favors lexical matches) - - `semantic`: vector-heavy (favors semantic matches) - - `mixed`: keeps base/default weights - -3. **P2: Symbol boost** - - Fused results with an explicit symbol match (`symbol_name`) receive a multiplicative boost (default `1.5x`). - -4. **P2: Embedding-based re-ranking (optional)** - - A second-stage ranker can reorder top results by semantic similarity. - - Re-ranking runs only when `Config.enable_reranking=True`. - -5. **P3: Global symbol index (incremental + fast path)** - - `GlobalSymbolIndex` stores project-wide symbols in one SQLite DB for fast symbol lookups. - - `ChainSearchEngine.search_symbols()` uses the global index fast path when enabled. - -### Migration Notes -- **Reindexing (recommended)**: deterministic chunking and docstring metadata affect stored chunks. For best results, regenerate indexes/embeddings after upgrading: - - Rebuild indexes and/or re-run embedding generation for existing projects. -- **New config flags**: - - `Config.enable_reranking` (default `False`) - - `Config.reranking_top_k` (default `50`) - - `Config.symbol_boost_factor` (default `1.5`) - - `Config.global_symbol_index_enabled` (default `True`) -- **Breaking changes**: none (behavioral improvements only). 
- diff --git a/codex-lens/DEPENDENCIES.md b/codex-lens/DEPENDENCIES.md deleted file mode 100644 index aad301da..00000000 --- a/codex-lens/DEPENDENCIES.md +++ /dev/null @@ -1,38 +0,0 @@ -# Dependency Management - -This project uses setuptools with `pyproject.toml` for dependency management. - -## Locking Dependencies - -To generate a fully pinned `requirements.txt` from `requirements.in`: - -```bash -# Install pip-tools -pip install pip-tools - -# Compile requirements -pip-compile requirements.in --output-file=requirements.txt - -# To upgrade dependencies -pip-compile --upgrade requirements.in --output-file=requirements.txt -``` - -## Version Constraints - -This project uses **pessimistic versioning** (`~=`) for dependency specifications per PEP 440: - -- `typer~=0.9.0` means: `>=0.9.0, ==0.9.*` -- Allows bugfix updates (0.9.0, 0.9.1, 0.9.2) but not feature/minor updates (0.10.0) - -This provides stability while allowing automatic patch updates. - -## Security Scanning - -The project includes automated security scanning via GitHub Actions: -- Runs on every push to main branch -- Runs weekly (Sundays at 00:00 UTC) -- Can be triggered manually - -The scan uses: -- `pip-audit`: Checks for known vulnerabilities in dependencies -- `bandit`: Security linter for Python code diff --git a/codex-lens/LICENSE b/codex-lens/LICENSE deleted file mode 100644 index 8e31ab6b..00000000 --- a/codex-lens/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2024 CodexLens Contributors - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission 
notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/codex-lens/README.md b/codex-lens/README.md deleted file mode 100644 index 823ab5fa..00000000 --- a/codex-lens/README.md +++ /dev/null @@ -1,109 +0,0 @@ -# CodexLens - -CodexLens is a multi-modal code analysis platform designed to provide comprehensive code understanding and analysis capabilities. - -## Features - -- **Multi-language Support**: Analyze code in Python, JavaScript, TypeScript and more using Tree-sitter parsers -- **Semantic Search**: Find relevant code snippets using semantic understanding with fastembed and HNSWLIB -- **Code Parsing**: Advanced code structure parsing with tree-sitter -- **Flexible Architecture**: Modular design for easy extension and customization - -## Installation - -### Basic Installation - -```bash -pip install codex-lens -``` - -### With Semantic Search - -```bash -pip install codex-lens[semantic] -``` - -### With GPU Acceleration (NVIDIA CUDA) - -```bash -pip install codex-lens[semantic-gpu] -``` - -### With DirectML (Windows - NVIDIA/AMD/Intel) - -```bash -pip install codex-lens[semantic-directml] -``` - -### With All Optional Features - -```bash -pip install codex-lens[full] -``` - -### Local ONNX Reranker Bootstrap - -Use the pinned bootstrap flow when you want the local-only reranker backend in an -existing CodexLens virtual environment without asking pip to resolve the whole -project extras set at once. - -1. 
Start from the CodexLens repo root and create or activate the project venv. -2. Review the pinned install manifest in `scripts/requirements-reranker-local.txt`. -3. Render the deterministic setup plan: - -```bash -python scripts/bootstrap_reranker_local.py --dry-run -``` - -The bootstrap script always targets the selected venv Python, installs the local -ONNX reranker stack in a fixed order, and keeps the package set pinned to the -validated Python 3.13-compatible combination: - -- `numpy==2.4.0` -- `onnxruntime==1.23.2` -- `huggingface-hub==0.36.2` -- `transformers==4.53.3` -- `optimum[onnxruntime]==2.1.0` - -When you are ready to apply it to the CodexLens venv, use: - -```bash -python scripts/bootstrap_reranker_local.py --apply -``` - -To pre-download the default local reranker model (`Xenova/ms-marco-MiniLM-L-6-v2`) -into the repo-local Hugging Face cache, use: - -```bash -python scripts/bootstrap_reranker_local.py --apply --download-model -``` - -The dry-run plan also prints the equivalent explicit model download command. On -Windows PowerShell with the default repo venv, it looks like: - -```bash -.venv/Scripts/hf.exe download Xenova/ms-marco-MiniLM-L-6-v2 --local-dir .cache/huggingface/models/Xenova--ms-marco-MiniLM-L-6-v2 -``` - -After installation, probe the backend from the same venv: - -```bash -python scripts/bootstrap_reranker_local.py --apply --probe -``` - -## Requirements - -- Python >= 3.10 -- See `pyproject.toml` for detailed dependency list - -## Development - -This project uses setuptools for building and packaging. - -## License - -MIT License - -## Authors - -CodexLens Contributors diff --git a/codex-lens/SEMANTIC_SEARCH_USAGE.md b/codex-lens/SEMANTIC_SEARCH_USAGE.md deleted file mode 100644 index 381a33be..00000000 --- a/codex-lens/SEMANTIC_SEARCH_USAGE.md +++ /dev/null @@ -1,83 +0,0 @@ -# Semantic Search Integration - -## Overview -The ChainSearchEngine now supports semantic keyword search in addition to FTS5 full-text search. 
- -## Usage - -### Enable Semantic Search - -```python -from pathlib import Path -from codexlens.search.chain_search import ChainSearchEngine, SearchOptions -from codexlens.storage.registry import RegistryStore -from codexlens.storage.path_mapper import PathMapper - -# Initialize -registry = RegistryStore() -registry.initialize() -mapper = PathMapper() -engine = ChainSearchEngine(registry, mapper) - -# Create options with semantic search enabled -options = SearchOptions( - include_semantic=True, # Enable semantic keyword search - total_limit=50 -) - -# Execute search -result = engine.search("authentication", Path("./src"), options) - -# Results include both FTS and semantic matches -for r in result.results: - print(f"{r.path}: {r.score:.2f} - {r.excerpt}") -``` - -### How It Works - -1. **FTS Search**: Traditional full-text search using SQLite FTS5 -2. **Semantic Search**: Searches the `semantic_metadata.keywords` field -3. **Result Merging**: Semantic results are added with 0.8x weight - - FTS results: BM25 score from SQLite - - Semantic results: Base score of 10.0 * 0.8 = 8.0 -4. **Deduplication**: `_merge_and_rank()` deduplicates by path, keeping highest score - -### Result Format - -- **FTS results**: Regular excerpt from matched content -- **Semantic results**: `Keywords: keyword1, keyword2, keyword3, ...` - -### Prerequisites - -Files must have semantic metadata generated via: - -```bash -codex-lens enhance . --tool gemini -``` - -This uses CCW CLI to generate summaries, keywords, and purpose descriptions. - -## Implementation Details - -### Changes Made - -1. **SearchOptions**: Added `include_semantic: bool = False` parameter -2. **_search_parallel()**: Passes `include_semantic` to worker threads -3. 
**_search_single_index()**: - - Accepts `include_semantic` parameter - - Calls `DirIndexStore.search_semantic_keywords()` when enabled - - Converts semantic matches to `SearchResult` objects - - Applies 0.8x weight to semantic scores - -### Score Weighting - -```python -# FTS result (from BM25) -SearchResult(path="...", score=12.5, excerpt="...") - -# Semantic result (fixed weighted score) -SearchResult(path="...", score=8.0, excerpt="Keywords: ...") -``` - -The 0.8x weight ensures semantic matches rank slightly lower than direct FTS matches -but still appear in relevant results. diff --git a/codex-lens/benchmarks/accuracy_queries_ccw_smart_search.jsonl b/codex-lens/benchmarks/accuracy_queries_ccw_smart_search.jsonl deleted file mode 100644 index 737f88b6..00000000 --- a/codex-lens/benchmarks/accuracy_queries_ccw_smart_search.jsonl +++ /dev/null @@ -1,16 +0,0 @@ -{"query":"executeHybridMode dense_rerank semantic smart_search","relevant_paths":["ccw/src/tools/smart-search.ts"],"intent":"ccw-semantic-routing","notes":"CCW semantic mode delegates to CodexLens dense_rerank."} -{"query":"parse CodexLens JSON output strip ANSI smart_search","relevant_paths":["ccw/src/tools/smart-search.ts"],"intent":"ccw-json-fallback","notes":"Covers JSON/plain-text fallback handling for CodexLens output."} -{"query":"smart_search init embed search action schema","relevant_paths":["ccw/src/tools/smart-search.ts"],"intent":"ccw-action-schema","notes":"Find the Zod schema that defines init/embed/search actions."} -{"query":"auto init missing job dedupe smart_search","relevant_paths":["ccw/src/tools/smart-search.ts"],"intent":"ccw-auto-init","notes":"Targets background init/embed warmup and dedupe state."} -{"query":"smart_search exact mode fallback to CodexLens fts","relevant_paths":["ccw/src/tools/smart-search.ts"],"intent":"ccw-exact-fallback","notes":"Tracks the exact-mode fallback path into CodexLens FTS."} -{"query":"smart_search settings snapshot embedding backend reranker backend 
staged stage2 mode","relevant_paths":["ccw/src/tools/smart-search.ts"],"intent":"ccw-config-snapshot","notes":"Reads local config snapshot for embedding/reranker/staged pipeline settings."} -{"query":"embedding backend fastembed local litellm api config","relevant_paths":["codex-lens/src/codexlens/config.py"],"intent":"codexlens-embedding-config","notes":"Local-only benchmark should resolve to fastembed defaults."} -{"query":"reranker backend onnx api legacy configuration","relevant_paths":["codex-lens/src/codexlens/config.py","codex-lens/src/codexlens/env_config.py"],"intent":"codexlens-reranker-config","notes":"Covers both config dataclass fields and env overrides."} -{"query":"staged stage2 mode precomputed realtime static_global_graph","relevant_paths":["codex-lens/src/codexlens/config.py","codex-lens/src/codexlens/env_config.py"],"intent":"codexlens-stage2-config","notes":"Benchmark matrix should exercise the three supported stage2 modes."} -{"query":"enable staged rerank stage 4 config","relevant_paths":["codex-lens/src/codexlens/config.py"],"intent":"codexlens-stage4-rerank","notes":"Stage 4 rerank flag needs to stay enabled for local benchmarks."} -{"query":"cascade_search dense_rerank staged pipeline ChainSearchEngine","relevant_paths":["codex-lens/src/codexlens/search/chain_search.py"],"intent":"chain-search-cascade","notes":"Baseline query for the central retrieval engine."} -{"query":"realtime LSP expand stage2 search pipeline","relevant_paths":["codex-lens/src/codexlens/search/chain_search.py"],"intent":"chain-search-stage2-realtime","notes":"Targets realtime stage2 expansion logic."} -{"query":"static global graph stage2 expansion implementation","relevant_paths":["codex-lens/src/codexlens/search/chain_search.py"],"intent":"chain-search-stage2-static","notes":"Targets static_global_graph stage2 expansion logic."} -{"query":"cross encoder rerank stage 4 
implementation","relevant_paths":["codex-lens/src/codexlens/search/chain_search.py"],"intent":"chain-search-rerank","notes":"Relevant for dense_rerank and staged rerank latency comparisons."} -{"query":"get_reranker factory onnx backend selection","relevant_paths":["codex-lens/src/codexlens/semantic/reranker/factory.py"],"intent":"reranker-factory","notes":"Keeps the benchmark aligned with local ONNX reranker selection."} -{"query":"EMBEDDING_BACKEND and RERANKER_BACKEND environment variables","relevant_paths":["codex-lens/src/codexlens/env_config.py"],"intent":"env-overrides","notes":"Covers CCW/CodexLens local-only environment overrides."} diff --git a/codex-lens/benchmarks/accuracy_queries_codexlens.jsonl b/codex-lens/benchmarks/accuracy_queries_codexlens.jsonl deleted file mode 100644 index 18764bb0..00000000 --- a/codex-lens/benchmarks/accuracy_queries_codexlens.jsonl +++ /dev/null @@ -1,33 +0,0 @@ -{"query":"class StandaloneLspManager","relevant_paths":["codexlens/lsp/standalone_manager.py"]} -{"query":"def _open_document","relevant_paths":["codexlens/lsp/standalone_manager.py"]} -{"query":"def _read_message","relevant_paths":["codexlens/lsp/standalone_manager.py"]} -{"query":"how does textDocument/didOpen work","relevant_paths":["codexlens/lsp/standalone_manager.py"]} -{"query":"class LspBridge","relevant_paths":["codexlens/lsp/lsp_bridge.py"]} -{"query":"def get_document_symbols","relevant_paths":["codexlens/lsp/lsp_bridge.py"]} -{"query":"class KeepAliveLspBridge","relevant_paths":["codexlens/lsp/keepalive_bridge.py"]} -{"query":"LSP keepalive bridge","relevant_paths":["codexlens/lsp/keepalive_bridge.py"]} -{"query":"class LspGraphBuilder","relevant_paths":["codexlens/lsp/lsp_graph_builder.py"]} -{"query":"def build_from_seeds","relevant_paths":["codexlens/lsp/lsp_graph_builder.py"]} -{"query":"def _stage2_realtime_lsp_expand","relevant_paths":["codexlens/search/chain_search.py"]} -{"query":"def 
_stage3_cluster_prune","relevant_paths":["codexlens/search/chain_search.py"]} -{"query":"def _cross_encoder_rerank","relevant_paths":["codexlens/search/chain_search.py"]} -{"query":"def dense_rerank_cascade_search","relevant_paths":["codexlens/search/chain_search.py"]} -{"query":"def cascade_search","relevant_paths":["codexlens/search/chain_search.py"]} -{"query":"def _find_nearest_binary_mmap_root","relevant_paths":["codexlens/search/chain_search.py"]} -{"query":"class BinarySearcher","relevant_paths":["codexlens/search/binary_searcher.py"]} -{"query":"class GraphExpander","relevant_paths":["codexlens/search/graph_expander.py"]} -{"query":"def cross_encoder_rerank","relevant_paths":["codexlens/search/ranking.py"]} -{"query":"def group_similar_results","relevant_paths":["codexlens/search/ranking.py"]} -{"query":"class ConfigError","relevant_paths":["codexlens/errors.py"]} -{"query":"def load_settings","relevant_paths":["codexlens/config.py"]} -{"query":"BINARY_VECTORS_MMAP_NAME","relevant_paths":["codexlens/config.py"]} -{"query":"STAGED_CLUSTERING_STRATEGY","relevant_paths":["codexlens/config.py","codexlens/env_config.py"]} -{"query":"def apply_workspace_env","relevant_paths":["codexlens/env_config.py"]} -{"query":"def generate_env_example","relevant_paths":["codexlens/env_config.py"]} -{"query":"def get_reranker","relevant_paths":["codexlens/semantic/reranker/factory.py"]} -{"query":"class APIReranker","relevant_paths":["codexlens/semantic/reranker/api_reranker.py"]} -{"query":"class RegistryStore","relevant_paths":["codexlens/storage/registry.py"]} -{"query":"class PathMapper","relevant_paths":["codexlens/storage/path_mapper.py"]} -{"query":"def lsp_status","relevant_paths":["codexlens/cli/commands.py"]} -{"query":"graph_neighbors migration","relevant_paths":["codexlens/storage/migrations/migration_007_add_graph_neighbors.py"]} -{"query":"def get_model_config","relevant_paths":["codexlens/semantic/vector_store.py"]} diff --git 
a/codex-lens/benchmarks/analyze_methods.py b/codex-lens/benchmarks/analyze_methods.py deleted file mode 100644 index 9973d64c..00000000 --- a/codex-lens/benchmarks/analyze_methods.py +++ /dev/null @@ -1,245 +0,0 @@ -"""Analyze hybrid search methods contribution.""" -import json -import sqlite3 -import time -from pathlib import Path -from collections import defaultdict -import sys -sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - -from codexlens.search.hybrid_search import HybridSearchEngine -from codexlens.search.ranking import ( - reciprocal_rank_fusion, - cross_encoder_rerank, - DEFAULT_WEIGHTS, -) - -# Use index with most data -index_path = Path(r"C:\Users\dyw\.codexlens\indexes\D\Claude_dms3\codex-lens\src\codexlens\storage\_index.db") - -print("=" * 60) -print("1. STORAGE ARCHITECTURE ANALYSIS") -print("=" * 60) - -# Analyze storage -with sqlite3.connect(index_path) as conn: - cursor = conn.execute( - "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name" - ) - tables = [row[0] for row in cursor.fetchall()] - - print("\nTable Overview:") - for table in tables: - try: - count = conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0] - if count > 0: - print(f" {table}: {count} rows") - except: - pass - - print("\n--- Conflict Analysis ---") - - chunks_count = 0 - semantic_count = 0 - - if "chunks" in tables: - chunks_count = conn.execute("SELECT COUNT(*) FROM chunks").fetchone()[0] - if "semantic_chunks" in tables: - semantic_count = conn.execute("SELECT COUNT(*) FROM semantic_chunks").fetchone()[0] - - print(f" chunks table: {chunks_count} rows") - print(f" semantic_chunks table: {semantic_count} rows") - - if semantic_count > 0: - col_info = conn.execute("PRAGMA table_info(semantic_chunks)").fetchall() - col_names = [c[1] for c in col_info] - - print(f"\n semantic_chunks columns: {col_names}") - - for col in ["embedding", "embedding_binary", "embedding_dense"]: - if col in col_names: - null_count = conn.execute( - f"SELECT 
COUNT(*) FROM semantic_chunks WHERE {col} IS NULL" - ).fetchone()[0] - non_null = semantic_count - null_count - print(f" {col}: {non_null}/{semantic_count} non-null") - -print("\n" + "=" * 60) -print("2. METHOD CONTRIBUTION ANALYSIS") -print("=" * 60) - -queries = [ - "database connection", - "create table", - "sqlite store", - "migration", - "search chunks", -] - -results_summary = { - "fts_exact": [], - "fts_fuzzy": [], - "vector": [], -} - -for query in queries: - print(f"\nQuery: '{query}'") - - # FTS Exact - try: - engine = HybridSearchEngine(weights=DEFAULT_WEIGHTS) - engine._config = type("obj", (object,), { - "use_fts_fallback": True, - "embedding_use_gpu": True, - "symbol_boost_factor": 1.5, - "enable_reranking": False, - })() - - start = time.perf_counter() - results = engine.search(index_path, query, limit=10, enable_fuzzy=False, enable_vector=False) - latency = (time.perf_counter() - start) * 1000 - - results_summary["fts_exact"].append({"count": len(results), "latency": latency}) - top_file = results[0].path.split("\\")[-1] if results else "N/A" - top_score = results[0].score if results else 0 - print(f" FTS Exact: {len(results)} results, {latency:.1f}ms, top: {top_file} ({top_score:.3f})") - except Exception as e: - print(f" FTS Exact: ERROR - {e}") - - # FTS Fuzzy - try: - engine = HybridSearchEngine(weights=DEFAULT_WEIGHTS) - engine._config = type("obj", (object,), { - "use_fts_fallback": True, - "embedding_use_gpu": True, - "symbol_boost_factor": 1.5, - "enable_reranking": False, - })() - - start = time.perf_counter() - results = engine.search(index_path, query, limit=10, enable_fuzzy=True, enable_vector=False) - latency = (time.perf_counter() - start) * 1000 - - results_summary["fts_fuzzy"].append({"count": len(results), "latency": latency}) - top_file = results[0].path.split("\\")[-1] if results else "N/A" - top_score = results[0].score if results else 0 - print(f" FTS Fuzzy: {len(results)} results, {latency:.1f}ms, top: {top_file} 
({top_score:.3f})") - except Exception as e: - print(f" FTS Fuzzy: ERROR - {e}") - - # Vector - try: - engine = HybridSearchEngine() - engine._config = type("obj", (object,), { - "use_fts_fallback": False, - "embedding_use_gpu": True, - "symbol_boost_factor": 1.5, - "enable_reranking": False, - })() - - start = time.perf_counter() - results = engine.search(index_path, query, limit=10, enable_vector=True, pure_vector=True) - latency = (time.perf_counter() - start) * 1000 - - results_summary["vector"].append({"count": len(results), "latency": latency}) - top_file = results[0].path.split("\\")[-1] if results else "N/A" - top_score = results[0].score if results else 0 - print(f" Vector: {len(results)} results, {latency:.1f}ms, top: {top_file} ({top_score:.3f})") - except Exception as e: - print(f" Vector: ERROR - {e}") - -print("\n--- Summary ---") -for method, data in results_summary.items(): - if data: - avg_count = sum(d["count"] for d in data) / len(data) - avg_latency = sum(d["latency"] for d in data) / len(data) - print(f"{method}: avg {avg_count:.1f} results, {avg_latency:.1f}ms") - -print("\n" + "=" * 60) -print("3. 
FTS + RERANK FUSION EXPERIMENT") -print("=" * 60) - -# Initialize reranker -reranker = None -try: - from codexlens.semantic.reranker import get_reranker, check_reranker_available - ok, _ = check_reranker_available("onnx") - if ok: - reranker = get_reranker(backend="onnx", use_gpu=True) - print("\nReranker loaded: ONNX backend") -except Exception as e: - print(f"\nReranker unavailable: {e}") - -test_queries = ["database connection", "create table migration"] - -for query in test_queries: - print(f"\nQuery: '{query}'") - - # Strategy 1: Standard Hybrid (FTS exact+fuzzy RRF) - try: - engine = HybridSearchEngine(weights=DEFAULT_WEIGHTS) - engine._config = type("obj", (object,), { - "use_fts_fallback": True, - "embedding_use_gpu": True, - "symbol_boost_factor": 1.5, - "enable_reranking": False, - })() - - start = time.perf_counter() - standard_results = engine.search(index_path, query, limit=10, enable_fuzzy=True, enable_vector=False) - standard_latency = (time.perf_counter() - start) * 1000 - - print(f" Standard FTS RRF: {len(standard_results)} results, {standard_latency:.1f}ms") - for i, r in enumerate(standard_results[:3]): - print(f" {i+1}. {r.path.split(chr(92))[-1]} (score: {r.score:.4f})") - except Exception as e: - print(f" Standard FTS RRF: ERROR - {e}") - standard_results = [] - - # Strategy 2: FTS + CrossEncoder Rerank - if reranker and standard_results: - try: - start = time.perf_counter() - reranked_results = cross_encoder_rerank(query, standard_results, reranker, top_k=10) - rerank_latency = (time.perf_counter() - start) * 1000 - - print(f" FTS + Rerank: {len(reranked_results)} results, {rerank_latency:.1f}ms (rerank only)") - for i, r in enumerate(reranked_results[:3]): - ce_score = r.metadata.get("cross_encoder_prob", r.score) - print(f" {i+1}. 
{r.path.split(chr(92))[-1]} (CE prob: {ce_score:.4f})") - - # Compare rankings - standard_order = [r.path.split("\\")[-1] for r in standard_results[:5]] - reranked_order = [r.path.split("\\")[-1] for r in reranked_results[:5]] - - if standard_order != reranked_order: - print(f" Ranking changed!") - print(f" Before: {standard_order}") - print(f" After: {reranked_order}") - else: - print(f" Ranking unchanged") - - except Exception as e: - print(f" FTS + Rerank: ERROR - {e}") - -print("\n" + "=" * 60) -print("CONCLUSIONS") -print("=" * 60) -print(""" -1. Storage Architecture: - - semantic_chunks: Used by cascade-index (binary+dense vectors) - - chunks: Used by legacy SQLiteStore (currently empty in this index) - - files_fts_*: Used by FTS exact/fuzzy search - - CONFLICT: binary_cascade_search reads from semantic_chunks, - but standard FTS reads from files table. These are SEPARATE paths. - -2. Method Contributions: - - FTS: Fast but limited to keyword matching - - Vector: Semantic understanding but requires embeddings - -3. FTS + Rerank Fusion: - - CrossEncoder reranking can improve precision - - Adds ~100-200ms latency per query - - Most effective when initial FTS recall is good -""") diff --git a/codex-lens/benchmarks/binary_search_microbenchmark.py b/codex-lens/benchmarks/binary_search_microbenchmark.py deleted file mode 100644 index d4bb8397..00000000 --- a/codex-lens/benchmarks/binary_search_microbenchmark.py +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env python -"""Micro-benchmark for BinaryANNIndex search performance. - -Measures the actual speedup of vectorized Hamming distance computation. 
-""" - -from __future__ import annotations - -import gc -import statistics -import sys -import time -from pathlib import Path - -# Add src to path -sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - -import numpy as np - - -def old_search_implementation(query_arr: np.ndarray, vectors: dict, id_list: list, top_k: int): - """Original O(N) loop-based implementation for comparison.""" - packed_dim = len(query_arr) - distances = [] - - for vec_id in id_list: - vec = vectors[vec_id] - vec_arr = np.frombuffer(vec, dtype=np.uint8) - xor = np.bitwise_xor(query_arr, vec_arr) - dist = int(np.unpackbits(xor).sum()) - distances.append((vec_id, dist)) - - distances.sort(key=lambda x: x[1]) - top_results = distances[:top_k] - ids = [r[0] for r in top_results] - dists = [r[1] for r in top_results] - - return ids, dists - - -def new_search_implementation(query_arr: np.ndarray, vectors_matrix: np.ndarray, ids_array: np.ndarray, top_k: int): - """Optimized vectorized implementation.""" - # Broadcast XOR - xor_result = np.bitwise_xor(query_arr, vectors_matrix) - - # Vectorized popcount using lookup table - popcount_lut = np.array([bin(i).count('1') for i in range(256)], dtype=np.uint8) - bit_counts = popcount_lut[xor_result] - - # Sum across packed bytes - distances = bit_counts.sum(axis=1) - - # Get top-k using argpartition - n_vectors = len(distances) - k = min(top_k, n_vectors) - - if k == n_vectors: - sorted_indices = np.argsort(distances) - else: - partition_indices = np.argpartition(distances, k)[:k] - top_k_distances = distances[partition_indices] - sorted_order = np.argsort(top_k_distances) - sorted_indices = partition_indices[sorted_order] - - result_ids = ids_array[sorted_indices].tolist() - result_dists = distances[sorted_indices].tolist() - - return result_ids, result_dists - - -def run_benchmark(n_vectors: int, dim: int = 256, top_k: int = 100, n_iterations: int = 50): - """Run benchmark comparing old and new implementations.""" - packed_dim = dim // 8 # 32 
bytes for 256-bit - - print(f"\n{'='*60}") - print(f"Binary Search Micro-Benchmark") - print(f"{'='*60}") - print(f"Vectors: {n_vectors}") - print(f"Dimension: {dim} bits ({packed_dim} bytes packed)") - print(f"Top-K: {top_k}") - print(f"Iterations: {n_iterations}") - print(f"{'='*60}\n") - - # Generate random binary vectors - print("Generating test data...") - vectors_dict = {} - id_list = [] - - for i in range(n_vectors): - vec_bytes = np.random.randint(0, 256, size=packed_dim, dtype=np.uint8).tobytes() - vectors_dict[i] = vec_bytes - id_list.append(i) - - # Build matrix for vectorized search - vectors_matrix = np.empty((n_vectors, packed_dim), dtype=np.uint8) - ids_array = np.array(id_list, dtype=np.int64) - - for i, vec_id in enumerate(id_list): - vec_bytes = vectors_dict[vec_id] - vectors_matrix[i] = np.frombuffer(vec_bytes, dtype=np.uint8) - - # Generate random query - query_bytes = np.random.randint(0, 256, size=packed_dim, dtype=np.uint8).tobytes() - query_arr = np.frombuffer(query_bytes, dtype=np.uint8) - - # Warmup - print("Running warmup...") - for _ in range(3): - old_search_implementation(query_arr, vectors_dict, id_list, top_k) - new_search_implementation(query_arr, vectors_matrix, ids_array, top_k) - - # Benchmark old implementation - print("Benchmarking old implementation...") - old_times = [] - for _ in range(n_iterations): - gc.collect() - start = time.perf_counter() - old_ids, old_dists = old_search_implementation(query_arr, vectors_dict, id_list, top_k) - elapsed = (time.perf_counter() - start) * 1000 - old_times.append(elapsed) - - # Benchmark new implementation - print("Benchmarking new implementation...") - new_times = [] - for _ in range(n_iterations): - gc.collect() - start = time.perf_counter() - new_ids, new_dists = new_search_implementation(query_arr, vectors_matrix, ids_array, top_k) - elapsed = (time.perf_counter() - start) * 1000 - new_times.append(elapsed) - - # Verify correctness - print("\nVerifying correctness...") - # Check that 
distances are correct (IDs may differ for ties) - if old_dists == new_dists: - print("Distances match! (IDs may differ for ties)") - else: - # Check if difference is just in tie-breaking - old_dist_set = set(old_dists) - new_dist_set = set(new_dists) - if old_dist_set == new_dist_set: - print("Distances equivalent (tie-breaking differs, which is acceptable)") - else: - print("WARNING: Distance distributions differ!") - print(f" Old dists (first 5): {old_dists[:5]}") - print(f" New dists (first 5): {new_dists[:5]}") - - # Calculate statistics - old_avg = statistics.mean(old_times) - old_std = statistics.stdev(old_times) if len(old_times) > 1 else 0 - new_avg = statistics.mean(new_times) - new_std = statistics.stdev(new_times) if len(new_times) > 1 else 0 - - speedup = old_avg / new_avg if new_avg > 0 else 0 - - # Print results - print(f"\n{'='*60}") - print("RESULTS") - print(f"{'='*60}") - print(f"{'Metric':<25} {'Old (loop)':>15} {'New (vectorized)':>18}") - print(f"{'-'*25} {'-'*15} {'-'*18}") - print(f"{'Avg Latency (ms)':<25} {old_avg:>15.3f} {new_avg:>18.3f}") - print(f"{'Std Dev (ms)':<25} {old_std:>15.3f} {new_std:>18.3f}") - print(f"{'Min Latency (ms)':<25} {min(old_times):>15.3f} {min(new_times):>18.3f}") - print(f"{'Max Latency (ms)':<25} {max(old_times):>15.3f} {max(new_times):>18.3f}") - print(f"{'P50 (ms)':<25} {sorted(old_times)[len(old_times)//2]:>15.3f} {sorted(new_times)[len(new_times)//2]:>18.3f}") - print(f"\n{'Speedup:':<25} {speedup:>15.2f}x") - print(f"{'='*60}\n") - - return { - "n_vectors": n_vectors, - "dim": dim, - "top_k": top_k, - "old_avg_ms": old_avg, - "new_avg_ms": new_avg, - "speedup": speedup, - } - - -def main(): - print("\n" + "="*70) - print(" BINARY SEARCH OPTIMIZATION MICRO-BENCHMARK") - print("="*70) - - # Test different vector counts - results = [] - - for n_vectors in [1000, 5000, 10000, 50000]: - result = run_benchmark( - n_vectors=n_vectors, - dim=256, - top_k=100, - n_iterations=20, - ) - results.append(result) - - # 
Summary - print("\n" + "="*70) - print(" SUMMARY") - print("="*70) - print(f"{'N Vectors':<12} {'Old (ms)':<12} {'New (ms)':<12} {'Speedup':>10}") - print("-"*50) - for r in results: - print(f"{r['n_vectors']:<12} {r['old_avg_ms']:<12.3f} {r['new_avg_ms']:<12.3f} {r['speedup']:>10.2f}x") - print("="*70) - - -if __name__ == "__main__": - main() diff --git a/codex-lens/benchmarks/cascade_benchmark.py b/codex-lens/benchmarks/cascade_benchmark.py deleted file mode 100644 index 90abfda1..00000000 --- a/codex-lens/benchmarks/cascade_benchmark.py +++ /dev/null @@ -1,402 +0,0 @@ -#!/usr/bin/env python -"""Benchmark script for comparing cascade search strategies. - -Compares: -- binary: 256-dim binary coarse ranking + 2048-dim dense fine ranking -- hybrid: FTS+Vector coarse ranking + CrossEncoder fine ranking - -Usage: - python benchmarks/cascade_benchmark.py [--source PATH] [--queries N] [--warmup N] -""" - -from __future__ import annotations - -import argparse -import gc -import json -import os -import statistics -import sys -import time -import traceback -from dataclasses import dataclass, asdict -from pathlib import Path -from typing import List, Optional, Dict, Any - -# Add src to path -sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - -from codexlens.search.chain_search import ChainSearchEngine, SearchOptions -from codexlens.config import Config -from codexlens.storage.registry import RegistryStore -from codexlens.storage.path_mapper import PathMapper - - -@dataclass -class BenchmarkResult: - """Result from a single benchmark run.""" - strategy: str - query: str - latency_ms: float - num_results: int - top_result: Optional[str] - error: Optional[str] = None - - -@dataclass -class BenchmarkSummary: - """Aggregated benchmark statistics.""" - strategy: str - total_queries: int - successful_queries: int - avg_latency_ms: float - min_latency_ms: float - max_latency_ms: float - p50_latency_ms: float - p95_latency_ms: float - p99_latency_ms: float - 
avg_results: float - errors: List[str] - - -# Default test queries covering different scenarios -DEFAULT_QUERIES = [ - # Code patterns - "def search", - "class Engine", - "import numpy", - "async def", - "raise ValueError", - # Semantic queries - "how to parse json", - "database connection", - "error handling", - "authentication logic", - "file read write", - # Technical terms - "embedding vector", - "cosine similarity", - "binary quantization", - "hamming distance", - "reranking", -] - - -def percentile(data: List[float], p: float) -> float: - """Calculate percentile of sorted data.""" - if not data: - return 0.0 - sorted_data = sorted(data) - k = (len(sorted_data) - 1) * (p / 100) - f = int(k) - c = f + 1 if f + 1 < len(sorted_data) else f - return sorted_data[f] + (k - f) * (sorted_data[c] - sorted_data[f]) - - -def run_single_benchmark( - engine: ChainSearchEngine, - query: str, - source_path: Path, - strategy: str, - options: Optional[SearchOptions] = None, -) -> BenchmarkResult: - """Run a single benchmark query.""" - gc.collect() - - start_time = time.perf_counter() - try: - result = engine.cascade_search( - query=query, - source_path=source_path, - k=10, - coarse_k=100, - options=options, - strategy=strategy, - ) - elapsed_ms = (time.perf_counter() - start_time) * 1000 - - top_result = None - if result.results: - r = result.results[0] - line = r.start_line or 0 - top_result = f"{r.path}:{line}" - - return BenchmarkResult( - strategy=strategy, - query=query, - latency_ms=elapsed_ms, - num_results=len(result.results), - top_result=top_result, - ) - except Exception as e: - elapsed_ms = (time.perf_counter() - start_time) * 1000 - return BenchmarkResult( - strategy=strategy, - query=query, - latency_ms=elapsed_ms, - num_results=0, - top_result=None, - error=str(e), - ) - - -def run_benchmarks( - source_path: Path, - queries: List[str], - strategies: List[str], - warmup_runs: int = 2, - options: Optional[SearchOptions] = None, -) -> Dict[str, 
List[BenchmarkResult]]: - """Run benchmarks for all queries and strategies.""" - - print(f"\n{'='*60}") - print(f"Cascade Search Benchmark") - print(f"{'='*60}") - print(f"Source: {source_path}") - print(f"Queries: {len(queries)}") - print(f"Strategies: {strategies}") - print(f"Warmup runs: {warmup_runs}") - print(f"{'='*60}\n") - - # Initialize engine - config = Config() - registry = RegistryStore() # Uses default path - registry.initialize() - mapper = PathMapper() # Uses default path - engine = ChainSearchEngine(registry=registry, mapper=mapper, config=config) - - results: Dict[str, List[BenchmarkResult]] = {s: [] for s in strategies} - - # Warmup phase - if warmup_runs > 0: - print(f"Running {warmup_runs} warmup queries...") - warmup_query = queries[0] if queries else "test" - for strategy in strategies: - for _ in range(warmup_runs): - try: - run_single_benchmark(engine, warmup_query, source_path, strategy, options) - except Exception: - pass - print("Warmup complete.\n") - - # Benchmark phase - total_runs = len(queries) * len(strategies) - current_run = 0 - - for query in queries: - for strategy in strategies: - current_run += 1 - print(f"[{current_run}/{total_runs}] {strategy}: '{query[:40]}...' 
", end="", flush=True) - - result = run_single_benchmark(engine, query, source_path, strategy, options) - results[strategy].append(result) - - if result.error: - print(f"ERROR: {result.error[:50]}") - else: - print(f"{result.latency_ms:.1f}ms, {result.num_results} results") - - return results - - -def summarize_results(results: Dict[str, List[BenchmarkResult]]) -> Dict[str, BenchmarkSummary]: - """Generate summary statistics for each strategy.""" - summaries = {} - - for strategy, benchmark_results in results.items(): - latencies = [r.latency_ms for r in benchmark_results if r.error is None] - result_counts = [r.num_results for r in benchmark_results if r.error is None] - errors = [r.error for r in benchmark_results if r.error is not None] - - if latencies: - summary = BenchmarkSummary( - strategy=strategy, - total_queries=len(benchmark_results), - successful_queries=len(latencies), - avg_latency_ms=statistics.mean(latencies), - min_latency_ms=min(latencies), - max_latency_ms=max(latencies), - p50_latency_ms=percentile(latencies, 50), - p95_latency_ms=percentile(latencies, 95), - p99_latency_ms=percentile(latencies, 99), - avg_results=statistics.mean(result_counts) if result_counts else 0, - errors=errors, - ) - else: - summary = BenchmarkSummary( - strategy=strategy, - total_queries=len(benchmark_results), - successful_queries=0, - avg_latency_ms=0, - min_latency_ms=0, - max_latency_ms=0, - p50_latency_ms=0, - p95_latency_ms=0, - p99_latency_ms=0, - avg_results=0, - errors=errors, - ) - - summaries[strategy] = summary - - return summaries - - -def print_comparison_table(summaries: Dict[str, BenchmarkSummary]) -> None: - """Print formatted comparison table.""" - print(f"\n{'='*80}") - print("BENCHMARK RESULTS COMPARISON") - print(f"{'='*80}\n") - - # Header - print(f"{'Metric':<25} {'Binary':>15} {'Hybrid':>15} {'Diff':>15}") - print(f"{'-'*25} {'-'*15} {'-'*15} {'-'*15}") - - binary = summaries.get("binary") - hybrid = summaries.get("hybrid") - - if not binary or 
def save_results(
    results: Dict[str, List[BenchmarkResult]],
    summaries: Dict[str, BenchmarkSummary],
    output_path: Path,
) -> None:
    """Write the benchmark summaries and per-query details to a JSON file.

    The payload has three keys: ``timestamp`` (local time of the call),
    ``summaries`` (one dict per strategy) and ``details`` (one list of run
    dicts per strategy). Missing parent directories of ``output_path`` are
    created, and the destination is reported on stdout.

    Args:
        results: Mapping of strategy name to its dataclass run records.
        summaries: Mapping of strategy name to its dataclass summary.
        output_path: Destination file; overwritten if it already exists.
    """
    stamp = time.strftime("%Y-%m-%d %H:%M:%S")

    summary_payload = {}
    for strategy, summary in summaries.items():
        summary_payload[strategy] = asdict(summary)

    detail_payload = {}
    for strategy, runs in results.items():
        detail_payload[strategy] = [asdict(run) for run in runs]

    payload = {
        "timestamp": stamp,
        "summaries": summary_payload,
        "details": detail_payload,
    }

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as handle:
        json.dump(payload, handle, indent=2)

    print(f"Results saved to: {output_path}")
def _now_ms() -> float:
    """Return the current ``time.perf_counter()`` reading in milliseconds.

    Callers subtract two readings to obtain an elapsed duration.
    """
    seconds = time.perf_counter()
    return seconds * 1000.0
def _load_labeled_queries(path: Path, limit: Optional[int]) -> List[Dict[str, Any]]:
    """Load labeled benchmark queries from a JSONL file.

    Blank lines and lines starting with ``#`` are skipped. Every remaining
    line must be a JSON object containing a ``query`` key.

    Args:
        path: JSONL file to read.
        limit: Maximum number of queries to return; ``None`` means no limit.
            A limit of zero or less yields an empty list.

    Returns:
        The parsed query objects in file order, at most ``limit`` of them.

    Raises:
        SystemExit: If the file does not exist, a line is not valid JSON, or
            a parsed item is not an object with a ``query`` key.
    """
    if not path.is_file():
        raise SystemExit(f"Queries file does not exist: {path}")

    # The limit used to be checked only after appending, so ``limit=0`` (or a
    # negative limit) still returned the first query. Honour it up front.
    if limit is not None and limit <= 0:
        return []

    out: List[Dict[str, Any]] = []
    for raw_line in path.read_text(encoding="utf-8", errors="ignore").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        try:
            item = json.loads(line)
        except json.JSONDecodeError as exc:
            raise SystemExit(f"Invalid JSONL line in {path}: {raw_line!r} ({exc})") from exc
        if not isinstance(item, dict) or "query" not in item:
            raise SystemExit(f"Invalid query item (expected object with 'query'): {item!r}")
        out.append(item)
        if limit is not None and len(out) >= limit:
            break
    return out
def _mrr(ranks: Sequence[Optional[int]]) -> float:
    """Return the mean reciprocal rank over all queries.

    Each entry is the 1-based rank of the first relevant hit for one query.
    ``None`` and non-positive ranks count as a miss and contribute ``0.0``.
    An empty input yields ``0.0``.
    """
    reciprocals = [
        1.0 / float(rank) if rank is not None and rank > 0 else 0.0
        for rank in ranks
    ]
    if not reciprocals:
        return 0.0
    return statistics.mean(reciprocals)
"--staged-cluster-strategy", - type=str, - default="path", - help="Config.staged_clustering_strategy override for staged (default: path)", - ) - parser.add_argument( - "--stage2-mode", - type=str, - default="realtime", - help="Config.staged_stage2_mode override for staged (default: realtime)", - ) - parser.add_argument( - "--output", - type=Path, - default=Path(__file__).parent / "results" / "accuracy_labeled.json", - help="Output JSON path", - ) - args = parser.parse_args() - - if not args.source.exists(): - raise SystemExit(f"Source path does not exist: {args.source}") - - labeled = _load_labeled_queries(args.queries_file, args.queries) - if not labeled: - raise SystemExit("No queries to run") - - source_root = args.source.expanduser().resolve() - - # Match CLI behavior: load settings + apply global/workspace .env overrides. - config = Config.load() - config.cascade_strategy = "staged" - config.staged_stage2_mode = str(args.stage2_mode or "realtime").strip().lower() - config.enable_staged_rerank = True - config.staged_clustering_strategy = str(args.staged_cluster_strategy or "path").strip().lower() - # Stability: on some Windows setups, DirectML/ONNX can crash under load. 
- config.embedding_use_gpu = False - config.reranker_use_gpu = False - - registry = RegistryStore() - registry.initialize() - mapper = PathMapper() - engine = ChainSearchEngine(registry=registry, mapper=mapper, config=config) - - def resolve_expected(paths: Sequence[str]) -> set[str]: - out: set[str] = set() - for p in paths: - try: - cand = Path(p) - if not cand.is_absolute(): - cand = (source_root / cand).resolve() - out.add(_normalize_path_key(str(cand))) - except Exception: - out.add(_normalize_path_key(p)) - return out - - evaluations: List[QueryEval] = [] - - try: - for i, item in enumerate(labeled, start=1): - query = str(item.get("query", "")).strip() - relevant_raw = item.get("relevant_paths") or [] - if not query: - continue - if not isinstance(relevant_raw, list) or not relevant_raw: - raise SystemExit(f"Query item missing relevant_paths[]: {item!r}") - relevant = resolve_expected([str(p) for p in relevant_raw]) - - print(f"[{i}/{len(labeled)}] {query}") - - staged = _run_strategy( - engine, - strategy="staged", - query=query, - source_path=source_root, - k=int(args.k), - coarse_k=int(args.coarse_k), - relevant=relevant, - options=None, - ) - dense = _run_strategy( - engine, - strategy="dense_rerank", - query=query, - source_path=source_root, - k=int(args.k), - coarse_k=int(args.coarse_k), - relevant=relevant, - options=None, - ) - - evaluations.append( - QueryEval( - query=query, - relevant_paths=[_normalize_path_key(str((source_root / p).resolve())) if not Path(p).is_absolute() else _normalize_path_key(p) for p in relevant_raw], - staged=staged, - dense_rerank=dense, - ) - ) - finally: - try: - engine.close() - except Exception: - pass - try: - registry.close() - except Exception: - pass - - staged_runs = [e.staged for e in evaluations] - dense_runs = [e.dense_rerank for e in evaluations] - - def mean(xs: Sequence[float]) -> float: - return statistics.mean(xs) if xs else 0.0 - - staged_ranks = [r.first_hit_rank for r in staged_runs] - dense_ranks = 
[r.first_hit_rank for r in dense_runs] - - summary = { - "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"), - "source": str(source_root), - "queries_file": str(args.queries_file), - "query_count": len(evaluations), - "k": int(args.k), - "coarse_k": int(args.coarse_k), - "staged": { - "hit_at_k": mean([1.0 if r.hit_at_k else 0.0 for r in staged_runs]), - "mrr_at_k": _mrr(staged_ranks), - "avg_recall_at_k": mean([r.recall_at_k for r in staged_runs]), - "avg_latency_ms": mean([r.latency_ms for r in staged_runs if not r.error]), - "errors": sum(1 for r in staged_runs if r.error), - }, - "dense_rerank": { - "hit_at_k": mean([1.0 if r.hit_at_k else 0.0 for r in dense_runs]), - "mrr_at_k": _mrr(dense_ranks), - "avg_recall_at_k": mean([r.recall_at_k for r in dense_runs]), - "avg_latency_ms": mean([r.latency_ms for r in dense_runs if not r.error]), - "errors": sum(1 for r in dense_runs if r.error), - }, - "config": { - "staged_stage2_mode": config.staged_stage2_mode, - "staged_clustering_strategy": config.staged_clustering_strategy, - "enable_staged_rerank": bool(config.enable_staged_rerank), - "reranker_backend": config.reranker_backend, - "reranker_model": config.reranker_model, - "embedding_backend": config.embedding_backend, - "embedding_model": config.embedding_model, - }, - } - - payload = {"summary": summary, "evaluations": [asdict(e) for e in evaluations]} - args.output.parent.mkdir(parents=True, exist_ok=True) - args.output.write_text(json.dumps(payload, indent=2), encoding="utf-8") - - print("\n=== SUMMARY ===") - print(json.dumps(summary, indent=2)) - print(f"\nSaved: {args.output}") - - -if __name__ == "__main__": - main() diff --git a/codex-lens/benchmarks/compare_ccw_smart_search_stage2.py b/codex-lens/benchmarks/compare_ccw_smart_search_stage2.py deleted file mode 100644 index b6776bfd..00000000 --- a/codex-lens/benchmarks/compare_ccw_smart_search_stage2.py +++ /dev/null @@ -1,980 +0,0 @@ -#!/usr/bin/env python -"""Benchmark local-only staged stage2 modes for 
CCW smart_search queries. - -This benchmark reuses the existing CodexLens benchmark style, but focuses on -the real search intents that drive CCW `smart_search`. It evaluates: - -1. `dense_rerank` baseline -2. `staged` + `precomputed` -3. `staged` + `realtime` -4. `staged` + `static_global_graph` - -Metrics: - - Hit@K - - MRR@K - - Recall@K - - latency (avg/p50/p95) - -The runner is intentionally local-only. By default it uses: - - embedding backend: `fastembed` - - reranker backend: `onnx` - -Examples: - python benchmarks/compare_ccw_smart_search_stage2.py --dry-run - python benchmarks/compare_ccw_smart_search_stage2.py --self-check - python benchmarks/compare_ccw_smart_search_stage2.py --source .. --k 10 - python benchmarks/compare_ccw_smart_search_stage2.py --embedding-model code --reranker-model cross-encoder/ms-marco-MiniLM-L-6-v2 -""" - -from __future__ import annotations - -import argparse -from copy import deepcopy -import gc -import json -import os -import re -import statistics -import sys -import time -from dataclasses import asdict, dataclass -from pathlib import Path -from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple - -sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - -from codexlens.config import Config -from codexlens.search.chain_search import ChainSearchEngine, SearchOptions -from codexlens.search.ranking import ( - QueryIntent, - detect_query_intent, - is_generated_artifact_path, - is_test_file, - query_prefers_lexical_search, - query_targets_generated_files, -) -from codexlens.storage.path_mapper import PathMapper -from codexlens.storage.registry import RegistryStore - - -DEFAULT_SOURCE = Path(__file__).resolve().parents[2] -DEFAULT_QUERIES_FILE = Path(__file__).parent / "accuracy_queries_ccw_smart_search.jsonl" -DEFAULT_OUTPUT = Path(__file__).parent / "results" / "ccw_smart_search_stage2.json" - -VALID_STAGE2_MODES = ("precomputed", "realtime", "static_global_graph") -VALID_LOCAL_EMBEDDING_BACKENDS = 
def _mrr(ranks: Sequence[Optional[int]]) -> float:
    """Return the mean reciprocal rank over all queries.

    Each entry is the 1-based rank of the first relevant hit for one query,
    or ``None`` when no relevant path appeared in the top-K.

    Misses contribute ``0.0`` to the mean. The previous implementation
    dropped them before averaging, which reported the mean over hits only
    and inflated the score of any strategy that missed queries.

    Args:
        ranks: One rank (or ``None``) per evaluated query.

    Returns:
        The mean of ``1 / rank`` with misses counted as zero, or ``0.0`` for
        an empty input.
    """
    reciprocals = [1.0 / rank if rank and rank > 0 else 0.0 for rank in ranks]
    return statistics.mean(reciprocals) if reciprocals else 0.0
def _resolve_expected_paths(source_root: Path, paths: Sequence[str]) -> Tuple[List[str], set[str], List[str]]:
    """Resolve the labeled relevant paths of one query.

    Relative paths are joined onto ``source_root`` and resolved; absolute
    paths are used as given.

    Returns:
        A tuple ``(display, keys, missing)``: the path strings in input
        order, the set of their ``_normalize_path_key`` forms, and the path
        strings that do not exist on disk.
    """
    display: List[str] = []
    keys: set[str] = set()
    absent: List[str] = []

    for raw_path in paths:
        given = Path(raw_path)
        target = given if given.is_absolute() else (source_root / given).resolve()
        text = str(target)
        if not target.exists():
            absent.append(text)
        display.append(text)
        keys.add(_normalize_path_key(text))

    return display, keys, absent
def _validate_stage2_modes(stage2_modes: Sequence[str]) -> List[str]:
    """Normalize, validate and de-duplicate the requested stage-2 modes.

    Entries are stripped and lower-cased; blank entries are dropped. The
    first occurrence of each mode is kept, in input order.

    Raises:
        SystemExit: If no usable entry remains, or if any entry is not one
            of ``VALID_STAGE2_MODES`` (the first offender is reported).
    """
    cleaned: List[str] = []
    for raw_mode in stage2_modes:
        text = str(raw_mode).strip()
        if text:
            cleaned.append(text.lower())

    if not cleaned:
        raise SystemExit("At least one --stage2-modes entry is required")

    for mode in cleaned:
        if mode not in VALID_STAGE2_MODES:
            raise SystemExit(
                f"Invalid --stage2-modes entry: {mode} "
                f"(valid: {', '.join(VALID_STAGE2_MODES)})"
            )

    # dict preserves insertion order, so this keeps the first occurrence.
    return list(dict.fromkeys(cleaned))
def _strategy_specs(
    stage2_modes: Sequence[str],
    include_dense_baseline: bool,
    *,
    baseline_methods: Sequence[str],
) -> List[StrategySpec]:
    """Build the ordered list of strategies to benchmark.

    The order is: one spec per baseline method, then the optional
    ``dense_rerank`` baseline, then one ``staged`` spec per stage-2 mode
    keyed as ``staged:<mode>``.
    """
    baselines = [
        StrategySpec(strategy_key=name, strategy=name, stage2_mode=None)
        for name in baseline_methods
    ]
    dense = (
        [StrategySpec(strategy_key="dense_rerank", strategy="dense_rerank", stage2_mode=None)]
        if include_dense_baseline
        else []
    )
    staged = [
        StrategySpec(strategy_key=f"staged:{mode}", strategy="staged", stage2_mode=mode)
        for mode in stage2_modes
    ]
    return baselines + dense + staged
def _apply_query_limit(
    dataset: Sequence[Dict[str, Any]],
    query_limit: Optional[int],
) -> List[Dict[str, Any]]:
    """Truncate the dataset to the first ``query_limit`` items.

    With no limit the items are returned in a new list without copying
    them. With a limit, each kept item is shallow-copied; negative limits
    are treated as zero.
    """
    items = list(dataset)
    if query_limit is None:
        return items

    keep = int(query_limit)
    if keep < 0:
        keep = 0
    return [dict(entry) for entry in items[:keep]]
def _make_search_options(method: str, *, k: int) -> SearchOptions:
    """Build the ``SearchOptions`` for one benchmark method.

    ``method`` is stripped and lower-cased before matching. All methods use
    ``total_limit=k`` with ``enable_fuzzy`` and ``pure_vector`` switched
    off; they differ only in the three flags listed in the table below.

    Raises:
        ValueError: If ``method`` is not ``fts``, ``hybrid``,
            ``dense_rerank`` or ``staged``.
    """
    # method -> (hybrid_mode, enable_vector, enable_cascade)
    flag_table = {
        "fts": (False, False, False),
        "hybrid": (True, True, False),
        "dense_rerank": (True, True, True),
        "staged": (True, True, True),
    }

    name = str(method).strip().lower()
    if name not in flag_table:
        raise ValueError(f"Unsupported benchmark method: {method}")

    hybrid_mode, enable_vector, enable_cascade = flag_table[name]
    return SearchOptions(
        total_limit=k,
        hybrid_mode=hybrid_mode,
        enable_fuzzy=False,
        enable_vector=enable_vector,
        pure_vector=False,
        enable_cascade=enable_cascade,
    )
def _summarize_runs(runs: Sequence[StrategyRun]) -> Dict[str, Any]:
    """Aggregate the runs of one strategy into a summary dict.

    Accuracy metrics (hit rate, MRR, recall) and the artifact/test-file
    counts are computed over every run. Latency statistics use only runs
    without an error. ``effective_methods`` counts how often each effective
    method was used, and ``errors`` is the number of failed runs.
    """
    method_counts: Dict[str, int] = {}
    for run in runs:
        method_counts.setdefault(run.effective_method, 0)
        method_counts[run.effective_method] += 1

    ok_latencies = [run.latency_ms for run in runs if not run.error]
    failed = [run for run in runs if run.error]
    with_artifacts = [run for run in runs if run.generated_artifact_count > 0]
    with_tests = [run for run in runs if run.test_file_count > 0]

    # Keys are assigned in the order they appear in the saved JSON.
    summary: Dict[str, Any] = {}
    summary["query_count"] = len(runs)
    summary["hit_at_k"] = _mean([float(bool(run.hit_at_k)) for run in runs])
    summary["mrr_at_k"] = _mrr([run.first_hit_rank for run in runs])
    summary["avg_recall_at_k"] = _mean([run.recall_at_k for run in runs])
    summary["avg_latency_ms"] = _mean(ok_latencies)
    summary["p50_latency_ms"] = _percentile(ok_latencies, 0.50)
    summary["p95_latency_ms"] = _percentile(ok_latencies, 0.95)
    summary["avg_generated_artifact_count"] = _mean(
        [float(run.generated_artifact_count) for run in runs]
    )
    summary["avg_test_file_count"] = _mean([float(run.test_file_count) for run in runs])
    summary["runs_with_generated_artifacts"] = len(with_artifacts)
    summary["runs_with_test_files"] = len(with_tests)
    summary["effective_methods"] = method_counts
    summary["errors"] = len(failed)
    return summary
Sequence[StrategySpec], -) -> Dict[str, Any]: - return { - "mode": "dry-run" if args.dry_run else "self-check", - "local_only": True, - "source": str(source_root), - "queries_file": str(args.queries_file), - "query_count": len(dataset), - "query_match": args.query_match, - "k": int(args.k), - "coarse_k": int(args.coarse_k), - "baseline_methods": list(baseline_methods), - "stage2_modes": list(stage2_modes), - "strategy_keys": [spec.strategy_key for spec in strategy_specs], - "local_backends": { - "embedding_backend": args.embedding_backend, - "embedding_model": args.embedding_model, - "reranker_backend": args.reranker_backend, - "reranker_model": args.reranker_model, - "embedding_use_gpu": bool(args.embedding_use_gpu), - "reranker_use_gpu": bool(args.reranker_use_gpu), - }, - "output": str(args.output), - "progress_output": str(args.progress_output) if args.progress_output else None, - "dataset_preview": [ - { - "query": item.get("query"), - "intent": item.get("intent"), - "relevant_paths": item.get("relevant_paths"), - } - for item in list(dataset)[: min(3, len(dataset))] - ], - } - - -def build_parser() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument( - "--source", - type=Path, - default=DEFAULT_SOURCE, - help="Source root to benchmark. 
Defaults to the repository root so CCW and CodexLens paths resolve together.", - ) - parser.add_argument( - "--queries-file", - type=Path, - default=DEFAULT_QUERIES_FILE, - help="Labeled JSONL dataset of CCW smart_search queries", - ) - parser.add_argument("--query-limit", type=int, default=None, help="Optional query limit") - parser.add_argument( - "--query-match", - type=str, - default=None, - help="Optional case-insensitive substring filter for selecting specific benchmark queries.", - ) - parser.add_argument("--k", type=int, default=10, help="Top-k to evaluate") - parser.add_argument("--coarse-k", type=int, default=100, help="Stage-1 coarse_k") - parser.add_argument( - "--baseline-methods", - nargs="*", - default=list(VALID_BASELINE_METHODS), - help="Requested smart_search baselines to compare before staged modes (valid: auto, fts, hybrid).", - ) - parser.add_argument( - "--stage2-modes", - nargs="*", - default=list(VALID_STAGE2_MODES), - help="Stage-2 modes to compare", - ) - parser.add_argument("--warmup", type=int, default=0, help="Warmup iterations per strategy") - parser.add_argument( - "--embedding-backend", - default="fastembed", - help="Local embedding backend. This runner only accepts fastembed.", - ) - parser.add_argument( - "--embedding-model", - default="code", - help="Embedding model/profile for the local embedding backend", - ) - parser.add_argument( - "--embedding-use-gpu", - action="store_true", - help="Enable GPU acceleration for local embeddings. Off by default for stability.", - ) - parser.add_argument( - "--reranker-backend", - default="onnx", - help="Local reranker backend. Supported local values: onnx, fastembed, legacy.", - ) - parser.add_argument( - "--reranker-model", - default=DEFAULT_LOCAL_ONNX_RERANKER_MODEL, - help="Reranker model name for the local reranker backend", - ) - parser.add_argument( - "--reranker-use-gpu", - action="store_true", - help="Enable GPU acceleration for the local reranker. 
Off by default for stability.", - ) - parser.add_argument( - "--skip-dense-baseline", - action="store_true", - help="Only compare staged stage2 modes and skip the dense_rerank baseline.", - ) - parser.add_argument( - "--dry-run", - action="store_true", - help="Validate dataset/config and print the benchmark plan without running retrieval.", - ) - parser.add_argument( - "--self-check", - action="store_true", - help="Smoke-check the entrypoint by validating dataset, source paths, and stage matrix wiring.", - ) - parser.add_argument( - "--output", - type=Path, - default=DEFAULT_OUTPUT, - help="Output JSON path", - ) - parser.add_argument( - "--progress-output", - type=Path, - default=None, - help="Optional JSON path updated after each query with partial progress and completed runs.", - ) - return parser - - -def main() -> None: - parser = build_parser() - args = parser.parse_args() - - source_root = args.source.expanduser().resolve() - if not source_root.exists(): - raise SystemExit(f"Source path does not exist: {source_root}") - if int(args.k) <= 0: - raise SystemExit("--k must be > 0") - if int(args.coarse_k) <= 0: - raise SystemExit("--coarse-k must be > 0") - if int(args.coarse_k) < int(args.k): - raise SystemExit("--coarse-k must be >= --k") - if int(args.warmup) < 0: - raise SystemExit("--warmup must be >= 0") - - embedding_backend = str(args.embedding_backend).strip().lower() - reranker_backend = str(args.reranker_backend).strip().lower() - _validate_local_only_backends(embedding_backend, reranker_backend) - baseline_methods = _validate_baseline_methods(args.baseline_methods) - stage2_modes = _validate_stage2_modes(args.stage2_modes) - - dataset = _load_labeled_queries(args.queries_file, None) - dataset = _filter_dataset_by_query_match(dataset, args.query_match) - dataset = _apply_query_limit(dataset, args.query_limit) - if not dataset: - raise SystemExit("No queries to run") - - missing_paths: List[str] = [] - for item in dataset: - _, _, item_missing = 
_resolve_expected_paths(source_root, [str(path) for path in item["relevant_paths"]]) - missing_paths.extend(item_missing) - if missing_paths: - preview = ", ".join(missing_paths[:3]) - raise SystemExit( - "Dataset relevant_paths do not resolve under the selected source root. " - f"Examples: {preview}" - ) - - strategy_specs = _strategy_specs( - stage2_modes, - include_dense_baseline=not args.skip_dense_baseline, - baseline_methods=baseline_methods, - ) - - if args.dry_run or args.self_check: - payload = _make_plan_payload( - args=args, - source_root=source_root, - dataset=dataset, - baseline_methods=baseline_methods, - stage2_modes=stage2_modes, - strategy_specs=strategy_specs, - ) - if args.self_check: - payload["status"] = "ok" - payload["checks"] = { - "dataset_loaded": True, - "stage2_matrix_size": len(stage2_modes), - "local_only_validation": True, - "source_path_exists": True, - } - print(json.dumps(payload, ensure_ascii=False, indent=2)) - return - - config = Config.load() - config.cascade_strategy = "staged" - config.enable_staged_rerank = True - config.enable_cross_encoder_rerank = True - config.embedding_backend = embedding_backend - config.embedding_model = str(args.embedding_model).strip() - config.embedding_use_gpu = bool(args.embedding_use_gpu) - config.embedding_auto_embed_missing = False - config.reranker_backend = reranker_backend - config.reranker_model = str(args.reranker_model).strip() - config.reranker_use_gpu = bool(args.reranker_use_gpu) - - strategy_runtimes = { - spec.strategy_key: _build_strategy_runtime(config, spec) - for spec in strategy_specs - } - - evaluations: List[QueryEvaluation] = [] - total_queries = len(dataset) - total_runs = total_queries * len(strategy_specs) - completed_runs = 0 - - try: - if int(args.warmup) > 0: - warm_query = str(dataset[0]["query"]).strip() - warm_relevant_paths = [str(path) for path in dataset[0]["relevant_paths"]] - _, warm_relevant, _ = _resolve_expected_paths(source_root, warm_relevant_paths) - for 
spec in strategy_specs: - runtime = strategy_runtimes[spec.strategy_key] - for _ in range(int(args.warmup)): - _run_strategy( - runtime.engine, - runtime.config, - strategy_spec=spec, - query=warm_query, - source_path=source_root, - k=min(int(args.k), 5), - coarse_k=min(int(args.coarse_k), 50), - relevant=warm_relevant, - ) - - for index, item in enumerate(dataset, start=1): - query = str(item.get("query", "")).strip() - if not query: - continue - print(f"[query {index}/{total_queries}] {query}", flush=True) - relevant_paths, relevant, _ = _resolve_expected_paths( - source_root, - [str(path) for path in item["relevant_paths"]], - ) - runs: Dict[str, StrategyRun] = {} - for spec in strategy_specs: - if args.progress_output is not None: - _write_json_payload( - args.progress_output, - _make_progress_payload( - args=args, - source_root=source_root, - strategy_specs=strategy_specs, - evaluations=evaluations, - query_index=index - 1, - total_queries=total_queries, - run_index=completed_runs, - total_runs=total_runs, - current_query=query, - current_strategy_key=spec.strategy_key, - ), - ) - print( - f"[run {completed_runs + 1}/{total_runs}] " - f"strategy={spec.strategy_key} query={query}", - flush=True, - ) - runtime = strategy_runtimes[spec.strategy_key] - runs[spec.strategy_key] = _run_strategy( - runtime.engine, - runtime.config, - strategy_spec=spec, - query=query, - source_path=source_root, - k=int(args.k), - coarse_k=int(args.coarse_k), - relevant=relevant, - ) - completed_runs += 1 - run = runs[spec.strategy_key] - outcome = "error" if run.error else "ok" - print( - f"[done {completed_runs}/{total_runs}] " - f"strategy={spec.strategy_key} outcome={outcome} " - f"latency_ms={run.latency_ms:.2f} " - f"first_hit_rank={run.first_hit_rank}", - flush=True, - ) - evaluations.append( - QueryEvaluation( - query=query, - intent=str(item.get("intent")) if item.get("intent") is not None else None, - notes=str(item.get("notes")) if item.get("notes") is not None else None, - 
relevant_paths=relevant_paths, - runs=runs, - ) - ) - if args.progress_output is not None: - _write_json_payload( - args.progress_output, - _make_progress_payload( - args=args, - source_root=source_root, - strategy_specs=strategy_specs, - evaluations=evaluations, - query_index=index, - total_queries=total_queries, - run_index=completed_runs, - total_runs=total_runs, - current_query=query, - current_strategy_key="complete", - ), - ) - finally: - for runtime in strategy_runtimes.values(): - try: - runtime.engine.close() - except Exception: - pass - for runtime in strategy_runtimes.values(): - try: - runtime.registry.close() - except Exception: - pass - - strategy_summaries: Dict[str, Dict[str, Any]] = {} - for spec in strategy_specs: - spec_runs = [evaluation.runs[spec.strategy_key] for evaluation in evaluations if spec.strategy_key in evaluation.runs] - summary = _summarize_runs(spec_runs) - summary["strategy"] = spec.strategy - summary["stage2_mode"] = spec.stage2_mode - strategy_summaries[spec.strategy_key] = summary - - stage2_mode_matrix = { - mode: strategy_summaries[f"staged:{mode}"] - for mode in stage2_modes - if f"staged:{mode}" in strategy_summaries - } - pairwise_deltas = [asdict(item) for item in _build_pairwise_deltas(stage2_mode_matrix)] - - payload = { - "status": "completed", - "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"), - "source": str(source_root), - "queries_file": str(args.queries_file), - "query_count": len(evaluations), - "query_match": args.query_match, - "k": int(args.k), - "coarse_k": int(args.coarse_k), - "local_only": True, - "strategies": strategy_summaries, - "stage2_mode_matrix": stage2_mode_matrix, - "pairwise_stage2_deltas": pairwise_deltas, - "config": { - "embedding_backend": config.embedding_backend, - "embedding_model": config.embedding_model, - "embedding_use_gpu": bool(config.embedding_use_gpu), - "reranker_backend": config.reranker_backend, - "reranker_model": config.reranker_model, - "reranker_use_gpu": 
bool(config.reranker_use_gpu), - "enable_staged_rerank": bool(config.enable_staged_rerank), - "enable_cross_encoder_rerank": bool(config.enable_cross_encoder_rerank), - }, - "progress_output": str(args.progress_output) if args.progress_output else None, - "evaluations": [ - { - "query": evaluation.query, - "intent": evaluation.intent, - "notes": evaluation.notes, - "relevant_paths": evaluation.relevant_paths, - "runs": {key: asdict(run) for key, run in evaluation.runs.items()}, - } - for evaluation in evaluations - ], - } - - _write_final_outputs( - output_path=args.output, - progress_output=args.progress_output, - payload=payload, - ) - print(json.dumps(payload, ensure_ascii=False, indent=2)) - - -if __name__ == "__main__": - main() diff --git a/codex-lens/benchmarks/compare_semantic_methods.py b/codex-lens/benchmarks/compare_semantic_methods.py deleted file mode 100644 index da7b4873..00000000 --- a/codex-lens/benchmarks/compare_semantic_methods.py +++ /dev/null @@ -1,405 +0,0 @@ -"""Compare Binary Cascade and Vector semantic search methods. - -This script compares the two semantic retrieval approaches: -1. Binary Cascade: 256-bit binary vectors for coarse ranking -2. 
Vector Dense: Full semantic embeddings with cosine similarity -""" - -import sys -import time -from pathlib import Path - -# Add src to path -sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - -from codexlens.storage.dir_index import DirIndexStore -from codexlens.semantic.vector_store import VectorStore - - -def get_filename(path: str) -> str: - """Extract filename from path.""" - if "\\" in path: - return path.split("\\")[-1] - elif "/" in path: - return path.split("/")[-1] - return path - - -def find_binary_indexes(index_root: Path): - """Find all binary index files.""" - return list(index_root.rglob("_index_binary_vectors.bin")) - - -# Test queries for semantic search comparison -TEST_QUERIES = [ - "how to search code semantically", - "embedding generation for files", - "hybrid search with multiple backends", - "parse python source code", - "database storage for vectors", -] - -# Index paths -INDEX_ROOT = Path(r"C:\Users\dyw\.codexlens\indexes\D\Claude_dms3\codex-lens") - - -def test_vector_search(query: str, limit: int = 10): - """Test dense vector search.""" - try: - from codexlens.semantic.factory import get_embedder - - # Find an index with embeddings - all_results = [] - total_time = 0 - - for index_db in INDEX_ROOT.rglob("_index.db"): - vector_store = VectorStore(index_db) - - if vector_store.count_chunks() == 0: - continue - - # Get embedder based on stored config - model_config = vector_store.get_model_config() - if model_config: - backend = model_config.get("backend", "fastembed") - model_name = model_config["model_name"] - model_profile = model_config["model_profile"] - if backend == "litellm": - embedder = get_embedder(backend="litellm", model=model_name) - else: - embedder = get_embedder(backend="fastembed", profile=model_profile) - else: - embedder = get_embedder(backend="fastembed", profile="code") - - start = time.perf_counter() - query_embedding = embedder.embed_single(query) - results = vector_store.search_similar( - 
query_embedding=query_embedding, - top_k=limit, - min_score=0.0, - return_full_content=True, - ) - total_time += (time.perf_counter() - start) * 1000 - all_results.extend(results) - - # Only need one successful search to get embedder initialized - if results: - break - - # Sort by score and limit - all_results.sort(key=lambda x: x.score, reverse=True) - return all_results[:limit], total_time, None - except Exception as e: - return [], 0, str(e) - - - -def test_binary_cascade_search(query: str, limit: int = 10): - """Test binary cascade search (binary coarse + dense fine ranking).""" - try: - from codexlens.semantic.ann_index import BinaryANNIndex - from codexlens.indexing.embedding import CascadeEmbeddingBackend - import numpy as np - import sqlite3 - - # Find binary indexes - binary_indexes = find_binary_indexes(INDEX_ROOT) - if not binary_indexes: - return [], 0, "No binary indexes found. Run 'codexlens cascade-index' first." - - start = time.perf_counter() - - # Initialize cascade backend for query encoding - cascade_backend = CascadeEmbeddingBackend() - - # Encode query to binary and dense - binary_embeddings, dense_embeddings = cascade_backend.encode_cascade([query], batch_size=1) - query_binary = binary_embeddings[0] - query_dense = dense_embeddings[0] - - all_results = [] - - for binary_index_path in binary_indexes: - # Find corresponding index.db - index_db = binary_index_path.parent / "_index.db" - if not index_db.exists(): - continue - - # Check if cascade embeddings exist - conn = sqlite3.connect(index_db) - conn.row_factory = sqlite3.Row - try: - cursor = conn.execute( - "SELECT COUNT(*) FROM semantic_chunks WHERE embedding_binary IS NOT NULL" - ) - binary_count = cursor.fetchone()[0] - if binary_count == 0: - conn.close() - continue - except Exception: - conn.close() - continue - - # Stage 1: Binary coarse search - binary_index = BinaryANNIndex(index_db, dim=256) - try: - binary_index.load() - except Exception: - conn.close() - continue - - # Pack 
query for binary search - from codexlens.indexing.embedding import pack_binary_embedding - query_binary_packed = pack_binary_embedding(query_binary) - - # Get top candidates - coarse_limit = min(limit * 10, 100) - # search returns (ids, distances) tuple - coarse_ids, coarse_distances = binary_index.search(query_binary_packed, top_k=coarse_limit) - - if not coarse_ids: - conn.close() - continue - - # Stage 2: Dense reranking - chunk_ids = coarse_ids - placeholders = ",".join("?" * len(chunk_ids)) - - cursor = conn.execute( - f""" - SELECT id, file_path, content, embedding_dense - FROM semantic_chunks - WHERE id IN ({placeholders}) AND embedding_dense IS NOT NULL - """, - chunk_ids - ) - rows = cursor.fetchall() - - # Compute dense scores - for row in rows: - chunk_id = row["id"] - file_path = row["file_path"] - content = row["content"] - dense_blob = row["embedding_dense"] - - if dense_blob: - dense_vec = np.frombuffer(dense_blob, dtype=np.float32) - # Cosine similarity - score = float(np.dot(query_dense, dense_vec) / ( - np.linalg.norm(query_dense) * np.linalg.norm(dense_vec) + 1e-8 - )) - else: - score = 0.0 - - all_results.append({ - "path": file_path, - "score": score, - "content": content[:200] + "..." 
if len(content) > 200 else content, - }) - - conn.close() - - # Sort by dense score and limit - all_results.sort(key=lambda x: x["score"], reverse=True) - final_results = all_results[:limit] - - elapsed = (time.perf_counter() - start) * 1000 - - return final_results, elapsed, None - except ImportError as e: - return [], 0, f"Import error: {e}" - except Exception as e: - import traceback - return [], 0, f"{str(e)}\n{traceback.format_exc()}" - - -def print_results(method_name: str, results, elapsed: float, error: str = None): - """Print search results in a formatted way.""" - print(f"\n{'='*60}") - print(f"Method: {method_name}") - print(f"{'='*60}") - - if error: - print(f"ERROR: {error}") - return - - print(f"Results: {len(results)}, Time: {elapsed:.1f}ms") - print("-" * 60) - - for i, r in enumerate(results[:5], 1): - if isinstance(r, dict): - path = r.get("path", "?") - score = r.get("score", 0) - content = r.get("content", "")[:80] - else: - path = getattr(r, "path", "?") - score = getattr(r, "score", 0) - content = getattr(r, "content", "")[:80] if hasattr(r, "content") else "" - - filename = get_filename(path) - print(f" {i}. 
[{score:.4f}] {filename}") - if content: - # Sanitize content for console output - safe_content = content.encode('ascii', 'replace').decode('ascii') - print(f" {safe_content}...") - - -def compare_overlap(results1, results2, name1: str, name2: str): - """Compare result overlap between two methods.""" - def get_paths(results): - paths = set() - for r in results[:10]: - if isinstance(r, dict): - paths.add(r.get("path", "")) - else: - paths.add(getattr(r, "path", "")) - return paths - - paths1 = get_paths(results1) - paths2 = get_paths(results2) - - if not paths1 or not paths2: - return 0.0 - - overlap = len(paths1 & paths2) - union = len(paths1 | paths2) - jaccard = overlap / union if union > 0 else 0.0 - - print(f" {name1} vs {name2}: {overlap} common files (Jaccard: {jaccard:.2f})") - return jaccard - - -def main(): - print("=" * 70) - print("SEMANTIC SEARCH METHODS COMPARISON") - print("Binary Cascade vs Vector Dense") - print("=" * 70) - - # Check prerequisites - print("\n[Prerequisites Check]") - print(f" Index Root: {INDEX_ROOT}") - - binary_indexes = find_binary_indexes(INDEX_ROOT) - print(f" Binary Indexes: {len(binary_indexes)} found") - for bi in binary_indexes[:3]: - print(f" - {bi.parent.name}/{bi.name}") - if len(binary_indexes) > 3: - print(f" ... 
and {len(binary_indexes) - 3} more") - - # Aggregate statistics - all_results = { - "binary": {"total_results": 0, "total_time": 0, "queries": 0, "errors": []}, - "vector": {"total_results": 0, "total_time": 0, "queries": 0, "errors": []}, - } - - overlap_scores = {"binary_vector": []} - - for query in TEST_QUERIES: - print(f"\n{'#'*70}") - print(f"QUERY: \"{query}\"") - print("#" * 70) - - # Test each method - binary_results, binary_time, binary_err = test_binary_cascade_search(query) - vector_results, vector_time, vector_err = test_vector_search(query) - - # Print results - print_results("Binary Cascade (256-bit + Dense Rerank)", binary_results, binary_time, binary_err) - print_results("Vector Dense (Semantic Embeddings)", vector_results, vector_time, vector_err) - - # Update statistics - if not binary_err: - all_results["binary"]["total_results"] += len(binary_results) - all_results["binary"]["total_time"] += binary_time - all_results["binary"]["queries"] += 1 - else: - all_results["binary"]["errors"].append(binary_err) - - if not vector_err: - all_results["vector"]["total_results"] += len(vector_results) - all_results["vector"]["total_time"] += vector_time - all_results["vector"]["queries"] += 1 - else: - all_results["vector"]["errors"].append(vector_err) - - # Compare overlap - print("\n[Result Overlap Analysis]") - if binary_results and vector_results: - j = compare_overlap(binary_results, vector_results, "Binary", "Vector") - overlap_scores["binary_vector"].append(j) - - # Print summary - print("\n" + "=" * 70) - print("SUMMARY STATISTICS") - print("=" * 70) - - for method, stats in all_results.items(): - queries = stats["queries"] - if queries > 0: - avg_results = stats["total_results"] / queries - avg_time = stats["total_time"] / queries - print(f"\n{method.upper()}:") - print(f" Successful queries: {queries}/{len(TEST_QUERIES)}") - print(f" Avg results: {avg_results:.1f}") - print(f" Avg time: {avg_time:.1f}ms") - else: - print(f"\n{method.upper()}: No 
successful queries") - if stats["errors"]: - # Show truncated error - err = stats["errors"][0] - if len(err) > 200: - err = err[:200] + "..." - print(f" Error: {err}") - - print("\n[Average Overlap Scores]") - for pair, scores in overlap_scores.items(): - if scores: - avg = sum(scores) / len(scores) - print(f" {pair}: {avg:.3f}") - - print("\n" + "=" * 70) - print("ANALYSIS") - print("=" * 70) - - # Analyze working methods - working_methods = [m for m, s in all_results.items() if s["queries"] > 0] - - if len(working_methods) == 2: - # All methods working - compare quality - print("\nBoth methods working. Quality comparison:") - - # Compare avg results - print("\n Result Coverage (higher = more recall):") - for m in ["vector", "binary"]: - stats = all_results[m] - if stats["queries"] > 0: - avg = stats["total_results"] / stats["queries"] - print(f" {m.upper()}: {avg:.1f} results/query") - - # Compare speed - print("\n Speed (lower = faster):") - for m in ["binary", "vector"]: - stats = all_results[m] - if stats["queries"] > 0: - avg = stats["total_time"] / stats["queries"] - print(f" {m.upper()}: {avg:.1f}ms") - - # Recommend fusion strategy - print("\n Recommended Fusion Strategy:") - print(" For quality-focused hybrid search:") - print(" 1. Run both methods in parallel") - print(" 2. Use RRF fusion with weights:") - print(" - Vector: 0.6 (best semantic understanding)") - print(" - Binary: 0.4 (fast coarse filtering)") - print(" 3. 
Apply CrossEncoder reranking on top-50") - - elif len(working_methods) >= 2: - print(f"\n{len(working_methods)} methods working: {', '.join(working_methods)}") - print("Consider fixing missing method for complete hybrid search.") - else: - print(f"\nOnly {working_methods[0] if working_methods else 'no'} method(s) working.") - print("Check your index setup.") - - -if __name__ == "__main__": - main() diff --git a/codex-lens/benchmarks/compare_staged_realtime_vs_dense_rerank.py b/codex-lens/benchmarks/compare_staged_realtime_vs_dense_rerank.py deleted file mode 100644 index fb6b26a1..00000000 --- a/codex-lens/benchmarks/compare_staged_realtime_vs_dense_rerank.py +++ /dev/null @@ -1,393 +0,0 @@ -#!/usr/bin/env python -"""Compare staged realtime LSP pipeline vs direct dense->rerank cascade. - -This benchmark compares two retrieval pipelines: -1) staged+realtime: coarse (binary or dense fallback) -> realtime LSP graph expand -> clustering -> rerank -2) dense_rerank: dense ANN coarse -> cross-encoder rerank - -Because most repos do not have ground-truth labels, this script reports: -- latency statistics -- top-k overlap metrics (Jaccard + RBO) -- diversity proxies (unique files/dirs) -- staged pipeline stage stats (if present) - -Usage: - python benchmarks/compare_staged_realtime_vs_dense_rerank.py --source ./src - python benchmarks/compare_staged_realtime_vs_dense_rerank.py --queries-file benchmarks/queries.txt -""" - -from __future__ import annotations - -import argparse -import gc -import json -import os -import re -import statistics -import sys -import time -from dataclasses import asdict, dataclass -from pathlib import Path -from typing import Any, Dict, Iterable, List, Optional, Tuple - -# Add src to path (match other benchmark scripts) -sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - -from codexlens.config import Config -from codexlens.search.chain_search import ChainSearchEngine, SearchOptions -from codexlens.storage.path_mapper import PathMapper 
-from codexlens.storage.registry import RegistryStore - - -DEFAULT_QUERIES = [ - "class Config", - "def search", - "LspBridge", - "graph expansion", - "clustering strategy", - "error handling", - "how to parse json", -] - - -def _now_ms() -> float: - return time.perf_counter() * 1000.0 - - -def _safe_relpath(path: str, root: Path) -> str: - try: - return str(Path(path).resolve().relative_to(root.resolve())) - except Exception: - return path - - -def _normalize_path_key(path: str) -> str: - """Normalize file paths for overlap/dedup metrics (Windows-safe).""" - try: - p = Path(path) - # Don't explode on non-files like "". - if str(p) and (p.is_absolute() or re.match(r"^[A-Za-z]:", str(p))): - norm = str(p.resolve()) - else: - norm = str(p) - except Exception: - norm = path - norm = norm.replace("/", "\\") - if os.name == "nt": - norm = norm.lower() - return norm - - -def _extract_stage_stats(errors: List[str]) -> Optional[Dict[str, Any]]: - """Extract STAGE_STATS JSON blob from SearchStats.errors.""" - for item in errors or []: - if not isinstance(item, str): - continue - if not item.startswith("STAGE_STATS:"): - continue - payload = item[len("STAGE_STATS:") :] - try: - return json.loads(payload) - except Exception: - return None - return None - - -def jaccard_topk(a: List[str], b: List[str]) -> float: - sa, sb = set(a), set(b) - if not sa and not sb: - return 1.0 - if not sa or not sb: - return 0.0 - return len(sa & sb) / len(sa | sb) - - -def rbo(a: List[str], b: List[str], p: float = 0.9) -> float: - """Rank-biased overlap for two ranked lists.""" - if p <= 0.0 or p >= 1.0: - raise ValueError("p must be in (0, 1)") - if not a and not b: - return 1.0 - - depth = max(len(a), len(b)) - seen_a: set[str] = set() - seen_b: set[str] = set() - - score = 0.0 - for d in range(1, depth + 1): - if d <= len(a): - seen_a.add(a[d - 1]) - if d <= len(b): - seen_b.add(b[d - 1]) - overlap = len(seen_a & seen_b) - score += (overlap / d) * ((1.0 - p) * (p ** (d - 1))) - return score 
- - -def _unique_parent_dirs(paths: Iterable[str]) -> int: - dirs = set() - for p in paths: - try: - dirs.add(str(Path(p).parent)) - except Exception: - continue - return len(dirs) - - -@dataclass -class RunDetail: - strategy: str - query: str - latency_ms: float - num_results: int - topk_paths: List[str] - stage_stats: Optional[Dict[str, Any]] = None - error: Optional[str] = None - - -@dataclass -class CompareDetail: - query: str - staged: RunDetail - dense_rerank: RunDetail - jaccard_topk: float - rbo_topk: float - staged_unique_files_topk: int - dense_unique_files_topk: int - staged_unique_dirs_topk: int - dense_unique_dirs_topk: int - - -def _run_once( - engine: ChainSearchEngine, - query: str, - source_path: Path, - *, - strategy: str, - k: int, - coarse_k: int, - options: Optional[SearchOptions] = None, -) -> RunDetail: - gc.collect() - start_ms = _now_ms() - try: - result = engine.cascade_search( - query=query, - source_path=source_path, - k=k, - coarse_k=coarse_k, - options=options, - strategy=strategy, - ) - latency_ms = _now_ms() - start_ms - paths_raw = [r.path for r in (result.results or []) if getattr(r, "path", None)] - paths = [_normalize_path_key(p) for p in paths_raw] - topk: List[str] = [] - seen: set[str] = set() - for p in paths: - if p in seen: - continue - seen.add(p) - topk.append(p) - if len(topk) >= k: - break - stage_stats = _extract_stage_stats(getattr(result.stats, "errors", [])) - return RunDetail( - strategy=strategy, - query=query, - latency_ms=latency_ms, - num_results=len(paths), - topk_paths=topk, - stage_stats=stage_stats, - ) - except Exception as exc: - latency_ms = _now_ms() - start_ms - return RunDetail( - strategy=strategy, - query=query, - latency_ms=latency_ms, - num_results=0, - topk_paths=[], - stage_stats=None, - error=repr(exc), - ) - - -def _load_queries(path: Optional[Path], limit: Optional[int]) -> List[str]: - if path is None: - queries = list(DEFAULT_QUERIES) - else: - raw = path.read_text(encoding="utf-8", 
errors="ignore").splitlines() - queries = [] - for line in raw: - line = line.strip() - if not line or line.startswith("#"): - continue - queries.append(line) - if limit is not None: - return queries[:limit] - return queries - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Compare staged realtime LSP pipeline vs direct dense_rerank cascade" - ) - parser.add_argument( - "--source", - type=Path, - default=Path(__file__).parent.parent / "src", - help="Source directory to search (default: ./src)", - ) - parser.add_argument( - "--queries-file", - type=Path, - default=None, - help="Optional file with one query per line (# comments supported)", - ) - parser.add_argument("--queries", type=int, default=None, help="Limit number of queries") - parser.add_argument("--k", type=int, default=10, help="Final result count (default 10)") - parser.add_argument("--coarse-k", type=int, default=100, help="Coarse candidates (default 100)") - parser.add_argument("--warmup", type=int, default=1, help="Warmup runs per strategy (default 1)") - parser.add_argument( - "--staged-cluster-strategy", - type=str, - default=None, - help="Override Config.staged_clustering_strategy for staged pipeline (e.g. auto, dir_rr, score, path)", - ) - parser.add_argument( - "--output", - type=Path, - default=Path(__file__).parent / "results" / "staged_realtime_vs_dense_rerank.json", - help="Output JSON path", - ) - args = parser.parse_args() - - if not args.source.exists(): - raise SystemExit(f"Source path does not exist: {args.source}") - - queries = _load_queries(args.queries_file, args.queries) - if not queries: - raise SystemExit("No queries to run") - - # Match CLI behavior: load settings + apply global/workspace .env overrides. - # This is important on Windows where ONNX/DirectML can sometimes crash under load; - # many users pin EMBEDDING_BACKEND=litellm in ~/.codexlens/.env for stability. 
- config = Config.load() - config.cascade_strategy = "staged" - config.staged_stage2_mode = "realtime" - config.enable_staged_rerank = True - if args.staged_cluster_strategy: - config.staged_clustering_strategy = str(args.staged_cluster_strategy) - # Stability: on some Windows setups, fastembed + DirectML can crash under load. - # Force local embeddings and reranking onto CPU for reproducible benchmark runs. - config.embedding_use_gpu = False - config.reranker_use_gpu = False - registry = RegistryStore() - registry.initialize() - mapper = PathMapper() - engine = ChainSearchEngine(registry=registry, mapper=mapper, config=config) - - try: - strategies = ["staged", "dense_rerank"] - - # Warmup - if args.warmup > 0: - warm_query = queries[0] - for s in strategies: - for _ in range(args.warmup): - try: - _run_once( - engine, - warm_query, - args.source, - strategy=s, - k=min(args.k, 5), - coarse_k=min(args.coarse_k, 50), - ) - except Exception: - pass - - comparisons: List[CompareDetail] = [] - - for i, query in enumerate(queries, start=1): - print(f"[{i}/{len(queries)}] {query}") - - staged = _run_once( - engine, - query, - args.source, - strategy="staged", - k=args.k, - coarse_k=args.coarse_k, - ) - dense = _run_once( - engine, - query, - args.source, - strategy="dense_rerank", - k=args.k, - coarse_k=args.coarse_k, - ) - - staged_paths = staged.topk_paths - dense_paths = dense.topk_paths - - comparisons.append( - CompareDetail( - query=query, - staged=staged, - dense_rerank=dense, - jaccard_topk=jaccard_topk(staged_paths, dense_paths), - rbo_topk=rbo(staged_paths, dense_paths, p=0.9), - staged_unique_files_topk=len(set(staged_paths)), - dense_unique_files_topk=len(set(dense_paths)), - staged_unique_dirs_topk=_unique_parent_dirs(staged_paths), - dense_unique_dirs_topk=_unique_parent_dirs(dense_paths), - ) - ) - - def _latencies(details: List[RunDetail]) -> List[float]: - return [d.latency_ms for d in details if not d.error] - - staged_runs = [c.staged for c in 
comparisons] - dense_runs = [c.dense_rerank for c in comparisons] - - summary = { - "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"), - "source": str(args.source), - "k": args.k, - "coarse_k": args.coarse_k, - "query_count": len(comparisons), - "avg_jaccard_topk": statistics.mean([c.jaccard_topk for c in comparisons]) if comparisons else 0.0, - "avg_rbo_topk": statistics.mean([c.rbo_topk for c in comparisons]) if comparisons else 0.0, - "staged": { - "success": sum(1 for r in staged_runs if not r.error), - "avg_latency_ms": statistics.mean(_latencies(staged_runs)) if _latencies(staged_runs) else 0.0, - }, - "dense_rerank": { - "success": sum(1 for r in dense_runs if not r.error), - "avg_latency_ms": statistics.mean(_latencies(dense_runs)) if _latencies(dense_runs) else 0.0, - }, - } - - args.output.parent.mkdir(parents=True, exist_ok=True) - payload = { - "summary": summary, - "comparisons": [asdict(c) for c in comparisons], - } - args.output.write_text(json.dumps(payload, indent=2), encoding="utf-8") - print(f"\nSaved: {args.output}") - finally: - try: - engine.close() - except Exception as exc: - print(f"WARNING engine.close() failed: {exc!r}", file=sys.stderr) - try: - registry.close() - except Exception as exc: - print(f"WARNING registry.close() failed: {exc!r}", file=sys.stderr) - - -if __name__ == "__main__": - main() diff --git a/codex-lens/benchmarks/compare_staged_stage2_modes.py b/codex-lens/benchmarks/compare_staged_stage2_modes.py deleted file mode 100644 index 893b988c..00000000 --- a/codex-lens/benchmarks/compare_staged_stage2_modes.py +++ /dev/null @@ -1,391 +0,0 @@ -#!/usr/bin/env python -"""Compare staged cascade Stage-2 modes (precomputed vs realtime vs static graph). 
- -This benchmark compares the *same* staged cascade strategy with different Stage-2 -expansion sources: - -1) precomputed: per-dir `graph_neighbors` expansion (fast, index-local) -2) realtime: live LSP graph expansion (contextual, requires LSP availability) -3) static_global_graph: global_relationships expansion (project-wide, requires static graph indexing) - -Because most repos do not have ground-truth labels, this script reports: -- latency statistics per mode -- top-k overlap metrics (Jaccard + RBO) between modes -- diversity proxies (unique files/dirs) -- staged pipeline stage stats (when present) - -Usage: - python benchmarks/compare_staged_stage2_modes.py --source ./src - python benchmarks/compare_staged_stage2_modes.py --queries-file benchmarks/queries.txt -""" - -from __future__ import annotations - -import argparse -import gc -import json -import os -import re -import statistics -import sys -import time -from dataclasses import asdict, dataclass -from pathlib import Path -from typing import Any, Dict, Iterable, List, Optional, Tuple - -# Add src to path (match other benchmark scripts) -sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - -from codexlens.config import Config -from codexlens.search.chain_search import ChainSearchEngine -from codexlens.storage.path_mapper import PathMapper -from codexlens.storage.registry import RegistryStore - - -DEFAULT_QUERIES = [ - "class Config", - "def search", - "LspBridge", - "graph expansion", - "static graph relationships", - "clustering strategy", - "error handling", -] - - -VALID_STAGE2_MODES = ("precomputed", "realtime", "static_global_graph") - - -def _now_ms() -> float: - return time.perf_counter() * 1000.0 - - -def _normalize_path_key(path: str) -> str: - """Normalize file paths for overlap/dedup metrics (Windows-safe).""" - try: - p = Path(path) - if str(p) and (p.is_absolute() or re.match(r"^[A-Za-z]:", str(p))): - norm = str(p.resolve()) - else: - norm = str(p) - except Exception: - norm = path 
- norm = norm.replace("/", "\\") - if os.name == "nt": - norm = norm.lower() - return norm - - -def _extract_stage_stats(errors: List[str]) -> Optional[Dict[str, Any]]: - """Extract STAGE_STATS JSON blob from SearchStats.errors.""" - for item in errors or []: - if not isinstance(item, str): - continue - if not item.startswith("STAGE_STATS:"): - continue - payload = item[len("STAGE_STATS:") :] - try: - return json.loads(payload) - except Exception: - return None - return None - - -def jaccard_topk(a: List[str], b: List[str]) -> float: - sa, sb = set(a), set(b) - if not sa and not sb: - return 1.0 - if not sa or not sb: - return 0.0 - return len(sa & sb) / len(sa | sb) - - -def rbo(a: List[str], b: List[str], p: float = 0.9) -> float: - """Rank-biased overlap for two ranked lists.""" - if p <= 0.0 or p >= 1.0: - raise ValueError("p must be in (0, 1)") - if not a and not b: - return 1.0 - - depth = max(len(a), len(b)) - seen_a: set[str] = set() - seen_b: set[str] = set() - - score = 0.0 - for d in range(1, depth + 1): - if d <= len(a): - seen_a.add(a[d - 1]) - if d <= len(b): - seen_b.add(b[d - 1]) - overlap = len(seen_a & seen_b) - score += (overlap / d) * ((1.0 - p) * (p ** (d - 1))) - return score - - -def _unique_parent_dirs(paths: Iterable[str]) -> int: - dirs = set() - for p in paths: - try: - dirs.add(str(Path(p).parent)) - except Exception: - continue - return len(dirs) - - -def _load_queries(path: Optional[Path], inline: Optional[List[str]]) -> List[str]: - if inline: - return [q.strip() for q in inline if isinstance(q, str) and q.strip()] - if path: - if not path.exists(): - raise SystemExit(f"Queries file does not exist: {path}") - raw = path.read_text(encoding="utf-8", errors="ignore") - queries = [line.strip() for line in raw.splitlines() if line.strip() and not line.strip().startswith("#")] - return queries - return list(DEFAULT_QUERIES) - - -@dataclass -class RunDetail: - stage2_mode: str - query: str - latency_ms: float - num_results: int - topk_paths: 
List[str] - stage_stats: Optional[Dict[str, Any]] = None - error: Optional[str] = None - - -@dataclass -class PairwiseCompare: - query: str - mode_a: str - mode_b: str - jaccard_topk: float - rbo_topk: float - a_unique_files_topk: int - b_unique_files_topk: int - a_unique_dirs_topk: int - b_unique_dirs_topk: int - - -def _run_once( - engine: ChainSearchEngine, - config: Config, - query: str, - source_path: Path, - *, - stage2_mode: str, - k: int, - coarse_k: int, -) -> RunDetail: - if stage2_mode not in VALID_STAGE2_MODES: - raise ValueError(f"Invalid stage2_mode: {stage2_mode}") - - # Mutate config for this run; ChainSearchEngine reads config fields per-call. - config.staged_stage2_mode = stage2_mode - - gc.collect() - start_ms = _now_ms() - try: - result = engine.cascade_search( - query=query, - source_path=source_path, - k=k, - coarse_k=coarse_k, - strategy="staged", - ) - latency_ms = _now_ms() - start_ms - paths_raw = [r.path for r in (result.results or []) if getattr(r, "path", None)] - paths = [_normalize_path_key(p) for p in paths_raw] - - topk: List[str] = [] - seen: set[str] = set() - for p in paths: - if p in seen: - continue - seen.add(p) - topk.append(p) - if len(topk) >= k: - break - - stage_stats = None - try: - stage_stats = _extract_stage_stats(getattr(result.stats, "errors", []) or []) - except Exception: - stage_stats = None - - return RunDetail( - stage2_mode=stage2_mode, - query=query, - latency_ms=latency_ms, - num_results=len(result.results or []), - topk_paths=topk, - stage_stats=stage_stats, - error=None, - ) - except Exception as exc: - return RunDetail( - stage2_mode=stage2_mode, - query=query, - latency_ms=_now_ms() - start_ms, - num_results=0, - topk_paths=[], - stage_stats=None, - error=str(exc), - ) - - -def main() -> None: - parser = argparse.ArgumentParser(description="Compare staged Stage-2 expansion modes.") - parser.add_argument("--source", type=Path, default=Path.cwd(), help="Project path to search") - 
parser.add_argument("--queries-file", type=Path, default=None, help="Optional newline-delimited queries file") - parser.add_argument("--queries", nargs="*", default=None, help="Inline queries (overrides queries-file)") - parser.add_argument("--k", type=int, default=20, help="Top-k to evaluate") - parser.add_argument("--coarse-k", type=int, default=100, help="Stage-1 coarse_k") - parser.add_argument( - "--stage2-modes", - nargs="*", - default=list(VALID_STAGE2_MODES), - help="Stage-2 modes to compare", - ) - parser.add_argument("--warmup", type=int, default=0, help="Warmup iterations per mode") - parser.add_argument( - "--output", - type=Path, - default=Path(__file__).parent / "results" / "staged_stage2_modes.json", - help="Output JSON path", - ) - args = parser.parse_args() - - if not args.source.exists(): - raise SystemExit(f"Source path does not exist: {args.source}") - - stage2_modes = [str(m).strip().lower() for m in (args.stage2_modes or []) if str(m).strip()] - for m in stage2_modes: - if m not in VALID_STAGE2_MODES: - raise SystemExit(f"Invalid --stage2-modes entry: {m} (valid: {', '.join(VALID_STAGE2_MODES)})") - - queries = _load_queries(args.queries_file, args.queries) - if not queries: - raise SystemExit("No queries to run") - - # Match CLI behavior: load settings + apply global/workspace .env overrides. 
- config = Config.load() - config.cascade_strategy = "staged" - config.enable_staged_rerank = True - config.embedding_use_gpu = False # stability on some Windows setups - - registry = RegistryStore() - registry.initialize() - mapper = PathMapper() - engine = ChainSearchEngine(registry=registry, mapper=mapper, config=config) - - try: - # Warmup - if args.warmup > 0: - warm_query = queries[0] - for mode in stage2_modes: - for _ in range(args.warmup): - try: - _run_once( - engine, - config, - warm_query, - args.source, - stage2_mode=mode, - k=min(args.k, 5), - coarse_k=min(args.coarse_k, 50), - ) - except Exception: - pass - - per_query: Dict[str, Dict[str, RunDetail]] = {} - runs: List[RunDetail] = [] - comparisons: List[PairwiseCompare] = [] - - for i, query in enumerate(queries, start=1): - print(f"[{i}/{len(queries)}] {query}") - per_query[query] = {} - - for mode in stage2_modes: - detail = _run_once( - engine, - config, - query, - args.source, - stage2_mode=mode, - k=args.k, - coarse_k=args.coarse_k, - ) - per_query[query][mode] = detail - runs.append(detail) - - # Pairwise overlaps for this query - for a_idx in range(len(stage2_modes)): - for b_idx in range(a_idx + 1, len(stage2_modes)): - mode_a = stage2_modes[a_idx] - mode_b = stage2_modes[b_idx] - a = per_query[query][mode_a] - b = per_query[query][mode_b] - comparisons.append( - PairwiseCompare( - query=query, - mode_a=mode_a, - mode_b=mode_b, - jaccard_topk=jaccard_topk(a.topk_paths, b.topk_paths), - rbo_topk=rbo(a.topk_paths, b.topk_paths, p=0.9), - a_unique_files_topk=len(set(a.topk_paths)), - b_unique_files_topk=len(set(b.topk_paths)), - a_unique_dirs_topk=_unique_parent_dirs(a.topk_paths), - b_unique_dirs_topk=_unique_parent_dirs(b.topk_paths), - ) - ) - - def _latencies(details: List[RunDetail]) -> List[float]: - return [d.latency_ms for d in details if not d.error] - - mode_summaries: Dict[str, Dict[str, Any]] = {} - for mode in stage2_modes: - mode_runs = [r for r in runs if r.stage2_mode == mode] - 
lat = _latencies(mode_runs) - mode_summaries[mode] = { - "success": sum(1 for r in mode_runs if not r.error), - "avg_latency_ms": statistics.mean(lat) if lat else 0.0, - "p50_latency_ms": statistics.median(lat) if lat else 0.0, - "p95_latency_ms": statistics.quantiles(lat, n=20)[18] if len(lat) >= 2 else (lat[0] if lat else 0.0), - } - - summary = { - "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"), - "source": str(args.source), - "k": args.k, - "coarse_k": args.coarse_k, - "query_count": len(queries), - "stage2_modes": stage2_modes, - "modes": mode_summaries, - "avg_pairwise_jaccard_topk": statistics.mean([c.jaccard_topk for c in comparisons]) if comparisons else 0.0, - "avg_pairwise_rbo_topk": statistics.mean([c.rbo_topk for c in comparisons]) if comparisons else 0.0, - } - - args.output.parent.mkdir(parents=True, exist_ok=True) - payload = { - "summary": summary, - "runs": [asdict(r) for r in runs], - "comparisons": [asdict(c) for c in comparisons], - } - args.output.write_text(json.dumps(payload, indent=2), encoding="utf-8") - print(f"\nSaved: {args.output}") - finally: - try: - engine.close() - except Exception as exc: - print(f"WARNING engine.close() failed: {exc!r}", file=sys.stderr) - try: - registry.close() - except Exception as exc: - print(f"WARNING registry.close() failed: {exc!r}", file=sys.stderr) - - -if __name__ == "__main__": - main() - diff --git a/codex-lens/benchmarks/method_contribution_analysis.py b/codex-lens/benchmarks/method_contribution_analysis.py deleted file mode 100644 index e16abe6a..00000000 --- a/codex-lens/benchmarks/method_contribution_analysis.py +++ /dev/null @@ -1,527 +0,0 @@ -"""Analysis script for hybrid search method contribution and storage architecture. - -This script analyzes: -1. Individual method contribution in hybrid search (FTS/Vector) -2. Storage architecture conflicts between different retrieval methods -3. 
FTS + Rerank fusion experiment -""" - -import json -import sqlite3 -import time -from pathlib import Path -from typing import Dict, List, Tuple, Any -from collections import defaultdict - -# Add project root to path -import sys -sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - -from codexlens.storage.registry import RegistryStore -from codexlens.storage.path_mapper import PathMapper -from codexlens.search.hybrid_search import HybridSearchEngine -from codexlens.search.ranking import ( - reciprocal_rank_fusion, - cross_encoder_rerank, - DEFAULT_WEIGHTS, -) -from codexlens.entities import SearchResult - - -def find_project_index(source_path: Path) -> Path: - """Find the index database for a project.""" - registry = RegistryStore() - registry.initialize() - - mapper = PathMapper() - index_path = mapper.source_to_index_db(source_path) - - if not index_path.exists(): - nearest = registry.find_nearest_index(source_path) - if nearest: - index_path = nearest.index_path - - registry.close() - return index_path - - -def analyze_storage_architecture(index_path: Path) -> Dict[str, Any]: - """Analyze storage tables and check for conflicts. - - Returns: - Dictionary with table analysis and conflict detection. - """ - results = { - "tables": {}, - "conflicts": [], - "recommendations": [] - } - - with sqlite3.connect(index_path) as conn: - # Get all tables - cursor = conn.execute( - "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name" - ) - tables = [row[0] for row in cursor.fetchall()] - - for table in tables: - # Get row count and columns - try: - count = conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0] - cols = conn.execute(f"PRAGMA table_info({table})").fetchall() - col_names = [c[1] for c in cols] - - results["tables"][table] = { - "row_count": count, - "columns": col_names - } - except Exception as e: - results["tables"][table] = {"error": str(e)} - - # Check for data overlap/conflicts - # 1. 
Check if chunks and semantic_chunks have different data - if "chunks" in tables and "semantic_chunks" in tables: - chunks_count = results["tables"]["chunks"]["row_count"] - semantic_count = results["tables"]["semantic_chunks"]["row_count"] - - if chunks_count > 0 and semantic_count > 0: - # Check for ID overlap - overlap = conn.execute(""" - SELECT COUNT(*) FROM chunks c - JOIN semantic_chunks sc ON c.id = sc.id - """).fetchone()[0] - - results["conflicts"].append({ - "type": "table_overlap", - "tables": ["chunks", "semantic_chunks"], - "chunks_count": chunks_count, - "semantic_count": semantic_count, - "id_overlap": overlap, - "description": ( - f"Both chunks ({chunks_count}) and semantic_chunks ({semantic_count}) " - f"have data. ID overlap: {overlap}. " - "This can cause confusion - binary_cascade reads from semantic_chunks " - "but SQLiteStore reads from chunks." - ) - }) - elif chunks_count == 0 and semantic_count > 0: - results["recommendations"].append( - "chunks table is empty but semantic_chunks has data. " - "Use cascade-index (semantic_chunks) for better semantic search." - ) - elif chunks_count > 0 and semantic_count == 0: - results["recommendations"].append( - "semantic_chunks is empty. Run 'codexlens cascade-index' to enable " - "binary cascade search." - ) - - # 2. Check FTS tables - fts_tables = [t for t in tables if t.startswith("files_fts")] - if len(fts_tables) >= 2: - results["recommendations"].append( - f"Found {len(fts_tables)} FTS tables: {fts_tables}. " - "Dual FTS (exact + fuzzy) is properly configured." - ) - - return results - - -def analyze_method_contributions( - index_path: Path, - queries: List[str], - limit: int = 20 -) -> Dict[str, Any]: - """Analyze contribution of each retrieval method. 
- - Runs each method independently and measures: - - Result count - - Latency - - Score distribution - - Overlap with other methods - """ - results = { - "per_query": [], - "summary": {} - } - - for query in queries: - query_result = { - "query": query, - "methods": {}, - "fusion_analysis": {} - } - - # Run each method independently - methods = { - "fts_exact": {"fuzzy": False, "vector": False}, - "fts_fuzzy": {"fuzzy": True, "vector": False}, - "vector": {"fuzzy": False, "vector": True}, - } - - method_results: Dict[str, List[SearchResult]] = {} - - for method_name, config in methods.items(): - try: - engine = HybridSearchEngine() - - # Set config to disable/enable specific backends - engine._config = type('obj', (object,), { - 'use_fts_fallback': method_name.startswith("fts"), - 'embedding_use_gpu': True, - })() - - start = time.perf_counter() - - if method_name == "fts_exact": - # Force FTS fallback mode with fuzzy disabled - engine.weights = DEFAULT_WEIGHTS.copy() - results_list = engine.search( - index_path, query, limit=limit, - enable_fuzzy=False, enable_vector=False, pure_vector=False - ) - elif method_name == "fts_fuzzy": - engine.weights = DEFAULT_WEIGHTS.copy() - results_list = engine.search( - index_path, query, limit=limit, - enable_fuzzy=True, enable_vector=False, pure_vector=False - ) - elif method_name == "vector": - results_list = engine.search( - index_path, query, limit=limit, - enable_fuzzy=False, enable_vector=True, pure_vector=True - ) - else: - results_list = [] - - latency = (time.perf_counter() - start) * 1000 - - method_results[method_name] = results_list - - scores = [r.score for r in results_list] - query_result["methods"][method_name] = { - "count": len(results_list), - "latency_ms": latency, - "avg_score": sum(scores) / len(scores) if scores else 0, - "max_score": max(scores) if scores else 0, - "min_score": min(scores) if scores else 0, - "top_3_files": [r.path.split("\\")[-1] for r in results_list[:3]] - } - - except Exception as e: 
- query_result["methods"][method_name] = { - "error": str(e), - "count": 0 - } - - # Compute overlap between methods - method_paths = { - name: set(r.path for r in results) - for name, results in method_results.items() - if results - } - - overlaps = {} - method_names = list(method_paths.keys()) - for i, m1 in enumerate(method_names): - for m2 in method_names[i+1:]: - overlap = len(method_paths[m1] & method_paths[m2]) - union = len(method_paths[m1] | method_paths[m2]) - jaccard = overlap / union if union > 0 else 0 - overlaps[f"{m1}_vs_{m2}"] = { - "overlap_count": overlap, - "jaccard": jaccard, - f"{m1}_unique": len(method_paths[m1] - method_paths[m2]), - f"{m2}_unique": len(method_paths[m2] - method_paths[m1]), - } - - query_result["overlaps"] = overlaps - - # Analyze RRF fusion contribution - if len(method_results) >= 2: - # Compute RRF with each method's contribution - rrf_map = {} - for name, results in method_results.items(): - if results and name in ["fts_exact", "vector"]: - # Rename for RRF - rrf_name = name.replace("fts_exact", "exact") - rrf_map[rrf_name] = results - - if rrf_map: - fused = reciprocal_rank_fusion(rrf_map, k=60) - - # Analyze which methods contributed to top results - source_contributions = defaultdict(int) - for r in fused[:10]: - source_ranks = r.metadata.get("source_ranks", {}) - for source in source_ranks: - source_contributions[source] += 1 - - query_result["fusion_analysis"] = { - "total_fused": len(fused), - "top_10_source_distribution": dict(source_contributions) - } - - results["per_query"].append(query_result) - - # Compute summary statistics - method_stats = defaultdict(lambda: {"counts": [], "latencies": []}) - for qr in results["per_query"]: - for method, data in qr["methods"].items(): - if "count" in data: - method_stats[method]["counts"].append(data["count"]) - if "latency_ms" in data: - method_stats[method]["latencies"].append(data["latency_ms"]) - - results["summary"] = { - method: { - "avg_count": sum(s["counts"]) / 
len(s["counts"]) if s["counts"] else 0, - "avg_latency_ms": sum(s["latencies"]) / len(s["latencies"]) if s["latencies"] else 0, - } - for method, s in method_stats.items() - } - - return results - - -def experiment_fts_rerank_fusion( - index_path: Path, - queries: List[str], - limit: int = 10, - coarse_k: int = 50 -) -> Dict[str, Any]: - """Experiment: FTS + Rerank fusion vs standard hybrid. - - Compares: - 1. Standard Hybrid (FTS + Vector RRF) - 2. FTS + CrossEncoder Rerank -> then fuse with Vector - """ - results = { - "per_query": [], - "summary": {} - } - - # Initialize reranker - try: - from codexlens.semantic.reranker import get_reranker, check_reranker_available - ok, _ = check_reranker_available("onnx") - if ok: - reranker = get_reranker(backend="onnx", use_gpu=True) - else: - reranker = None - except Exception as e: - print(f"Reranker unavailable: {e}") - reranker = None - - for query in queries: - query_result = { - "query": query, - "strategies": {} - } - - # Strategy 1: Standard Hybrid (FTS + Vector) - try: - engine = HybridSearchEngine(weights=DEFAULT_WEIGHTS) - engine._config = type('obj', (object,), { - 'use_fts_fallback': False, - 'embedding_use_gpu': True, - })() - - start = time.perf_counter() - standard_results = engine.search( - index_path, query, limit=limit, - enable_vector=True - ) - standard_latency = (time.perf_counter() - start) * 1000 - - query_result["strategies"]["standard_hybrid"] = { - "count": len(standard_results), - "latency_ms": standard_latency, - "top_5": [r.path.split("\\")[-1] for r in standard_results[:5]], - "scores": [r.score for r in standard_results[:5]] - } - except Exception as e: - query_result["strategies"]["standard_hybrid"] = {"error": str(e)} - - # Strategy 2: FTS + Rerank -> Fuse with Vector - try: - # Step 1: Get FTS results (coarse) - fts_engine = HybridSearchEngine(weights=DEFAULT_WEIGHTS) - fts_engine._config = type('obj', (object,), { - 'use_fts_fallback': True, - 'embedding_use_gpu': True, - })() - - start = 
time.perf_counter() - fts_results = fts_engine.search( - index_path, query, limit=coarse_k, - enable_fuzzy=True, enable_vector=False - ) - fts_latency = (time.perf_counter() - start) * 1000 - - # Step 2: Rerank FTS results with CrossEncoder - if reranker and fts_results: - rerank_start = time.perf_counter() - reranked_fts = cross_encoder_rerank( - query, fts_results, reranker, top_k=20 - ) - rerank_latency = (time.perf_counter() - rerank_start) * 1000 - else: - reranked_fts = fts_results[:20] - rerank_latency = 0 - - # Step 3: Get Vector results - vector_engine = HybridSearchEngine() - vector_results = vector_engine.search( - index_path, query, limit=20, - enable_vector=True, pure_vector=True - ) - - # Step 4: Fuse reranked FTS with Vector - if reranked_fts and vector_results: - fusion_map = { - "fts_reranked": reranked_fts, - "vector": vector_results - } - fused_results = reciprocal_rank_fusion( - fusion_map, - weights={"fts_reranked": 0.5, "vector": 0.5}, - k=60 - ) - else: - fused_results = reranked_fts or vector_results or [] - - total_latency = fts_latency + rerank_latency + (time.perf_counter() - start) * 1000 - - query_result["strategies"]["fts_rerank_fusion"] = { - "count": len(fused_results), - "total_latency_ms": fts_latency + rerank_latency, - "fts_latency_ms": fts_latency, - "rerank_latency_ms": rerank_latency, - "top_5": [r.path.split("\\")[-1] for r in fused_results[:5]], - "scores": [r.score for r in fused_results[:5]] - } - except Exception as e: - query_result["strategies"]["fts_rerank_fusion"] = {"error": str(e)} - - # Compute overlap between strategies - if ( - "error" not in query_result["strategies"].get("standard_hybrid", {}) - and "error" not in query_result["strategies"].get("fts_rerank_fusion", {}) - ): - standard_paths = set(r.path.split("\\")[-1] for r in standard_results[:10]) - fts_rerank_paths = set(r.path.split("\\")[-1] for r in fused_results[:10]) - - overlap = len(standard_paths & fts_rerank_paths) - query_result["comparison"] = { 
- "top_10_overlap": overlap, - "standard_unique": list(standard_paths - fts_rerank_paths)[:3], - "fts_rerank_unique": list(fts_rerank_paths - standard_paths)[:3] - } - - results["per_query"].append(query_result) - - return results - - -def main(): - """Run all analyses.""" - source_path = Path("D:/Claude_dms3/codex-lens/src") - index_path = find_project_index(source_path) - - print(f"Using index: {index_path}") - print(f"Index exists: {index_path.exists()}") - print() - - # Test queries - queries = [ - "binary quantization", - "hamming distance search", - "embeddings generation", - "reranking algorithm", - "database connection handling", - ] - - # 1. Storage Architecture Analysis - print("=" * 60) - print("1. STORAGE ARCHITECTURE ANALYSIS") - print("=" * 60) - - storage_analysis = analyze_storage_architecture(index_path) - - print("\nTable Overview:") - for table, info in sorted(storage_analysis["tables"].items()): - if "row_count" in info: - print(f" {table}: {info['row_count']} rows") - - print("\nConflicts Detected:") - for conflict in storage_analysis["conflicts"]: - print(f" - {conflict['description']}") - - print("\nRecommendations:") - for rec in storage_analysis["recommendations"]: - print(f" - {rec}") - - # 2. Method Contribution Analysis - print("\n" + "=" * 60) - print("2. 
METHOD CONTRIBUTION ANALYSIS") - print("=" * 60) - - contribution_analysis = analyze_method_contributions(index_path, queries) - - print("\nPer-Query Results:") - for qr in contribution_analysis["per_query"]: - print(f"\n Query: '{qr['query']}'") - for method, data in qr["methods"].items(): - if "error" not in data: - print(f" {method}: {data['count']} results, {data['latency_ms']:.1f}ms") - if data.get("top_3_files"): - print(f" Top 3: {', '.join(data['top_3_files'])}") - - if qr.get("overlaps"): - print(" Overlaps:") - for pair, info in qr["overlaps"].items(): - print(f" {pair}: {info['overlap_count']} common (Jaccard: {info['jaccard']:.2f})") - - print("\nSummary:") - for method, stats in contribution_analysis["summary"].items(): - print(f" {method}: avg {stats['avg_count']:.1f} results, {stats['avg_latency_ms']:.1f}ms") - - # 3. FTS + Rerank Fusion Experiment - print("\n" + "=" * 60) - print("3. FTS + RERANK FUSION EXPERIMENT") - print("=" * 60) - - fusion_experiment = experiment_fts_rerank_fusion(index_path, queries) - - print("\nPer-Query Comparison:") - for qr in fusion_experiment["per_query"]: - print(f"\n Query: '{qr['query']}'") - for strategy, data in qr["strategies"].items(): - if "error" not in data: - latency = data.get("total_latency_ms") or data.get("latency_ms", 0) - print(f" {strategy}: {data['count']} results, {latency:.1f}ms") - if data.get("top_5"): - print(f" Top 5: {', '.join(data['top_5'][:3])}...") - - if qr.get("comparison"): - comp = qr["comparison"] - print(f" Top-10 Overlap: {comp['top_10_overlap']}/10") - - # Save full results - output_path = Path(__file__).parent / "results" / "method_contribution_analysis.json" - output_path.parent.mkdir(exist_ok=True) - - full_results = { - "storage_analysis": storage_analysis, - "contribution_analysis": contribution_analysis, - "fusion_experiment": fusion_experiment - } - - with open(output_path, "w", encoding="utf-8") as f: - json.dump(full_results, f, indent=2, default=str) - - print(f"\n\nFull 
results saved to: {output_path}") - - -if __name__ == "__main__": - main() diff --git a/codex-lens/benchmarks/results/accuracy_2026-02-11_codexlens.json b/codex-lens/benchmarks/results/accuracy_2026-02-11_codexlens.json deleted file mode 100644 index 3d2fa958..00000000 --- a/codex-lens/benchmarks/results/accuracy_2026-02-11_codexlens.json +++ /dev/null @@ -1,1308 +0,0 @@ -{ - "summary": { - "timestamp": "2026-02-11 16:33:19", - "source": "D:\\Claude_dms3\\codex-lens\\src", - "queries_file": "codex-lens\\benchmarks\\accuracy_queries_codexlens.jsonl", - "query_count": 33, - "k": 10, - "coarse_k": 100, - "staged": { - "hit_at_k": 0.7575757575757576, - "mrr_at_k": 0.5833333333333334, - "avg_recall_at_k": 0.7424242424242424, - "avg_latency_ms": 4635.525263649045, - "errors": 0 - }, - "dense_rerank": { - "hit_at_k": 0.21212121212121213, - "mrr_at_k": 0.06227753727753728, - "avg_recall_at_k": 0.21212121212121213, - "avg_latency_ms": 2597.3116121219864, - "errors": 0 - }, - "config": { - "staged_stage2_mode": "realtime", - "staged_clustering_strategy": "path", - "enable_staged_rerank": true, - "reranker_backend": "api", - "reranker_model": "Qwen/Qwen3-Reranker-8B", - "embedding_backend": "litellm", - "embedding_model": "qwen3-embedding-sf" - } - }, - "evaluations": [ - { - "query": "class StandaloneLspManager", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 25897.209399938583, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\association_tree\\builder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py" - ], - "first_hit_rank": 2, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 3059.8712000250816, - 
"topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "def _open_document", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 8377.355100035667, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py" - ], - "first_hit_rank": 1, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2697.353200018406, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\output.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\splade_index.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "def _read_message", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 4349.869300067425, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py" - ], - "first_hit_rank": 1, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2418.672600030899, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\splade_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "how does textDocument/didOpen work", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 321.56859999895096, - "topk_paths": [], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2452.267899990082, - "topk_paths": [ - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "class LspBridge", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 4634.055300056934, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\keepalive_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py" - ], - "first_hit_rank": 3, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2672.7246000170708, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\splade_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "def get_document_symbols", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 4454.471000015736, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py" - ], - "first_hit_rank": 1, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2581.881399989128, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" 
- ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "class KeepAliveLspBridge", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\keepalive_bridge.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 346.4588000178337, - "topk_paths": [], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2581.1541000008583, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\splade_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "LSP keepalive bridge", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\keepalive_bridge.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 319.3557000756264, - "topk_paths": [], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2587.464199960232, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "class LspGraphBuilder", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 5038.322200000286, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py" - ], - "first_hit_rank": 2, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 3435.6180000305176, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\splade_encoder.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "def build_from_seeds", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 4622.321400046349, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py" - ], - "first_hit_rank": 2, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2633.6710000038147, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "def _stage2_realtime_lsp_expand", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 333.375500023365, - "topk_paths": [], - "first_hit_rank": null, - "hit_at_k": 
false, - "recall_at_k": 0.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2626.7274000048637, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\merkle_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py" - ], - "first_hit_rank": 3, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - } - }, - { - "query": "def _stage3_cluster_prune", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 4407.406300008297, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py" - ], - "first_hit_rank": 1, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2711.8762999773026, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py" - ], - "first_hit_rank": 7, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - } - }, - { - "query": "def _cross_encoder_rerank", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 4716.0983999967575, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py" - ], - "first_hit_rank": 3, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2612.2980999946594, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py" - ], - "first_hit_rank": 9, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - } - }, - { - "query": "def dense_rerank_cascade_search", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 4556.352999985218, - 
"topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py" - ], - "first_hit_rank": 1, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2435.9282999634743, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "def cascade_search", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 4779.43700003624, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py" - ], - "first_hit_rank": 1, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2462.476100027561, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "def _find_nearest_binary_mmap_root", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 384.8026000261307, - "topk_paths": [], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2374.5640999674797, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\litellm_embedder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py" - ], - "first_hit_rank": 5, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - } - }, - { - "query": "class BinarySearcher", - 
"relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\binary_searcher.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 4964.564999938011, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\binary_searcher.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py" - ], - "first_hit_rank": 1, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2740.684500038624, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "class GraphExpander", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 4982.367900013924, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py" - ], - "first_hit_rank": 1, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - 
"dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2916.1848999857903, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\splade_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "def cross_encoder_rerank", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 4503.571500003338, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2504.1979999542236, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py", 
- "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "def group_similar_results", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 4660.934600055218, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py" - ], - "first_hit_rank": 2, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2379.2526000142097, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py" - ], - "first_hit_rank": 7, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - } - }, - { - "query": "class ConfigError", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\errors.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 4616.049799978733, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\errors.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\__init__.py" - ], - "first_hit_rank": 3, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2418.3816999793053, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "def load_settings", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 4625.254700064659, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py" - ], - "first_hit_rank": 1, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2702.3474999070168, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "BINARY_VECTORS_MMAP_NAME", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 4285.477600038052, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py" - ], - "first_hit_rank": 2, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2406.369700014591, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - 
"query": "STAGED_CLUSTERING_STRATEGY", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\env_config.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 4307.972999989986, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py" - ], - "first_hit_rank": 2, - "hit_at_k": true, - "recall_at_k": 0.5, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2634.202399969101, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\merkle_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "def apply_workspace_env", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\env_config.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 4182.440200030804, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2486.3993000388145, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\litellm_embedder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "def generate_env_example", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\env_config.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 4413.619400024414, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\env_config.py" - ], - "first_hit_rank": 1, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2556.517999947071, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py" - ], - "first_hit_rank": null, - 
"hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "def get_reranker", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\factory.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 4670.021300017834, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\factory.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\model_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py" - ], - "first_hit_rank": 1, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2631.054200053215, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\merkle_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "class APIReranker", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\api_reranker.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 4536.27840000391, - 
"topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\api_reranker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\factory.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py" - ], - "first_hit_rank": 1, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2502.5143000483513, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "class RegistryStore", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 6543.249599993229, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\mcp\\provider.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\providers.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\hover.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py" - ], - "first_hit_rank": 4, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2509.7423000335693, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\splade_index.py" - ], - "first_hit_rank": 8, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - } - }, - { - "query": "class PathMapper", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 4577.398099958897, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py" - ], - "first_hit_rank": 1, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2493.4598000645638, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "def lsp_status", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 352.2480999827385, - "topk_paths": [], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2516.1266999840736, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py" - ], - "first_hit_rank": 1, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - } - }, - { - "query": "graph_neighbors migration", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 4542.888000011444, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py" - ], - "first_hit_rank": 1, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2324.4544000029564, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py" - ], - "first_hit_rank": null, - 
"hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "def get_model_config", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 4669.536899983883, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py" - ], - "first_hit_rank": 1, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2644.8443999886513, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\output.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\merkle_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - } - ] -} \ No newline at end of file diff --git a/codex-lens/benchmarks/results/accuracy_2026-02-11_codexlens_precomputed.json b/codex-lens/benchmarks/results/accuracy_2026-02-11_codexlens_precomputed.json deleted file mode 100644 index 3c6d9472..00000000 --- a/codex-lens/benchmarks/results/accuracy_2026-02-11_codexlens_precomputed.json +++ /dev/null @@ -1,1335 +0,0 @@ -{ - 
"summary": { - "timestamp": "2026-02-11 17:39:54", - "source": "D:\\Claude_dms3\\codex-lens\\src", - "queries_file": "codex-lens\\benchmarks\\accuracy_queries_codexlens.jsonl", - "query_count": 33, - "k": 10, - "coarse_k": 100, - "staged": { - "hit_at_k": 0.7575757575757576, - "mrr_at_k": 0.5883838383838383, - "avg_recall_at_k": 0.7424242424242424, - "avg_latency_ms": 2331.3277969649344, - "errors": 0 - }, - "dense_rerank": { - "hit_at_k": 0.21212121212121213, - "mrr_at_k": 0.06227753727753728, - "avg_recall_at_k": 0.21212121212121213, - "avg_latency_ms": 2182.33056061015, - "errors": 0 - }, - "config": { - "staged_stage2_mode": "precomputed", - "staged_clustering_strategy": "path", - "enable_staged_rerank": true, - "reranker_backend": "api", - "reranker_model": "Qwen/Qwen3-Reranker-8B", - "embedding_backend": "litellm", - "embedding_model": "qwen3-embedding-sf" - } - }, - "evaluations": [ - { - "query": "class StandaloneLspManager", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 19341.994099974632, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\association_tree\\builder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py" - ], - "first_hit_rank": 2, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2999.929000020027, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "def _open_document", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 2847.462099969387, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py" - ], - "first_hit_rank": 1, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2615.54029995203, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\output.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\splade_index.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "def _read_message", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 2385.6554000377655, - "topk_paths": [ - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py" - ], - "first_hit_rank": 1, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2581.8080000281334, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\splade_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "how does textDocument/didOpen work", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 361.7903000116348, - "topk_paths": [], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2404.24530005455, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "class LspBridge", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 2457.195499956608, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py" - ], - "first_hit_rank": 3, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2546.2164999842644, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\splade_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "def get_document_symbols", - 
"relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 2532.4168999791145, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py" - ], - "first_hit_rank": 1, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2429.6208000183105, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "class KeepAliveLspBridge", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\keepalive_bridge.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 398.90080004930496, - "topk_paths": [], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2454.2164999842644, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\splade_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "LSP keepalive bridge", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\keepalive_bridge.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 330.90090000629425, - "topk_paths": [], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2490.4245000481606, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - 
"recall_at_k": 0.0, - "error": null - } - }, - { - "query": "class LspGraphBuilder", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 2399.8781000375748, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py" - ], - "first_hit_rank": 2, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2402.9406000375748, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\splade_encoder.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "def build_from_seeds", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 3348.9842999577522, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py" - ], - "first_hit_rank": 2, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - 
"dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2525.5670999884605, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "def _stage2_realtime_lsp_expand", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 329.77999997138977, - "topk_paths": [], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2464.8422999978065, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\merkle_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py" - ], - "first_hit_rank": 3, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - } - }, - { - "query": "def _stage3_cluster_prune", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 2466.0647000670433, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\binary_searcher.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py" - ], - "first_hit_rank": 1, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2633.537499964237, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py" - ], - "first_hit_rank": 7, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - } - }, - { - "query": "def _cross_encoder_rerank", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 
2565.2637000083923, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\binary_searcher.py" - ], - "first_hit_rank": 3, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2483.7863000035286, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py" - ], - "first_hit_rank": 9, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - } - }, - { - "query": "def dense_rerank_cascade_search", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 1953.4079999923706, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\binary_searcher.py" - ], - "first_hit_rank": 1, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 1758.5974999666214, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "def cascade_search", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 2054.1276000142097, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\utils.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\models.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\symbols.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py" 
- ], - "first_hit_rank": 1, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 1729.1329000592232, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "def _find_nearest_binary_mmap_root", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 209.5627999305725, - "topk_paths": [], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 1902.3523000478745, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\litellm_embedder.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py" - ], - "first_hit_rank": 5, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - } - }, - { - "query": "class BinarySearcher", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\binary_searcher.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 2291.7905999422073, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\binary_searcher.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py" - ], - "first_hit_rank": 1, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 1719.2722999453545, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "class GraphExpander", - "relevant_paths": [ - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 1672.2199999690056, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py" - ], - "first_hit_rank": 1, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 1739.1129999756813, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\splade_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "def cross_encoder_rerank", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 1548.37600004673, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 1962.3666999936104, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "def group_similar_results", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 1733.5452999472618, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\binary_searcher.py" - ], - "first_hit_rank": 2, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 1727.5000000596046, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py" - ], - "first_hit_rank": 7, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - } - }, - { - "query": "class ConfigError", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\errors.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 1806.7660999894142, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\errors.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py" - ], - "first_hit_rank": 2, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 1788.8945000171661, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "def load_settings", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 2616.400499999523, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", 
- "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\output.py" - ], - "first_hit_rank": 1, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 1680.113300025463, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "BINARY_VECTORS_MMAP_NAME", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 1801.7208999991417, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\output.py" - ], - "first_hit_rank": 2, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 1675.2271999716759, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "STAGED_CLUSTERING_STRATEGY", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\env_config.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 1470.9057000279427, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py" - ], - "first_hit_rank": 2, - "hit_at_k": true, - "recall_at_k": 0.5, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 1803.0420999526978, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\merkle_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "def apply_workspace_env", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\env_config.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 1371.6070999503136, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 1817.1355000138283, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\litellm_embedder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "def generate_env_example", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\env_config.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 1465.9499000310898, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\env_config.py" - ], - "first_hit_rank": 1, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - 
"latency_ms": 1701.9165999889374, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "def get_reranker", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\factory.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 1902.2649999856949, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\factory.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\model_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\litellm_reranker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\fastembed_reranker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\legacy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\api_reranker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\binary_searcher.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py" - ], - "first_hit_rank": 1, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - 
"dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 1746.6025000214577, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\merkle_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "class APIReranker", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\api_reranker.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 2245.715800046921, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\api_reranker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\base.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\factory.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\legacy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\fastembed_reranker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\litellm_reranker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\onnx_reranker.py" - ], - "first_hit_rank": 1, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 1659.7105000019073, - "topk_paths": [ - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "class RegistryStore", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 1702.458899974823, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\mcp\\provider.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\providers.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\hover.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\incremental_indexer.py" - ], - "first_hit_rank": 4, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 3514.6511999964714, - "topk_paths": [ - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\splade_index.py" - ], - "first_hit_rank": 8, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - } - }, - { - "query": "class PathMapper", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 1793.6620999574661, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\incremental_indexer.py" - ], - "first_hit_rank": 1, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 1708.0654000639915, - "topk_paths": [ - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "def lsp_status", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 180.50510001182556, - "topk_paths": [], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 1799.0735999941826, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py" - ], - "first_hit_rank": 1, - 
"hit_at_k": true, - "recall_at_k": 1.0, - "error": null - } - }, - { - "query": "graph_neighbors migration", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 2524.9900000095367, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py" - ], - "first_hit_rank": 1, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2674.021600008011, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - }, - { - "query": "def get_model_config", - "relevant_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py" - ], - "staged": { - "strategy": "staged", - "latency_ms": 2821.553099989891, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\output.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\binary_searcher.py" - ], - "first_hit_rank": 1, - "hit_at_k": true, - "recall_at_k": 1.0, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "latency_ms": 2877.4450999498367, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\output.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\merkle_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - } - ] -} \ No newline at end of file diff --git a/codex-lens/benchmarks/results/cascade_benchmark.json b/codex-lens/benchmarks/results/cascade_benchmark.json deleted file mode 100644 index e8178395..00000000 --- a/codex-lens/benchmarks/results/cascade_benchmark.json +++ /dev/null @@ -1,277 +0,0 @@ -{ - "timestamp": "2026-01-02 11:48:33", - "summaries": { - "binary": { - "strategy": "binary", - "total_queries": 15, - "successful_queries": 15, - "avg_latency_ms": 
1133.4008666667312, - "min_latency_ms": 959.5361000028788, - "max_latency_ms": 1330.8978999993997, - "p50_latency_ms": 1125.8439999946859, - "p95_latency_ms": 1330.0081999987015, - "p99_latency_ms": 1330.71995999926, - "avg_results": 10, - "errors": [] - }, - "hybrid": { - "strategy": "hybrid", - "total_queries": 15, - "successful_queries": 15, - "avg_latency_ms": 1111.1401133336283, - "min_latency_ms": 857.0021999985329, - "max_latency_ms": 1278.8890000010724, - "p50_latency_ms": 1130.696000000171, - "p95_latency_ms": 1254.2417899981956, - "p99_latency_ms": 1273.959558000497, - "avg_results": 10, - "errors": [] - } - }, - "details": { - "binary": [ - { - "strategy": "binary", - "query": "def search", - "latency_ms": 1044.525999997859, - "num_results": 10, - "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py:0", - "error": null - }, - { - "strategy": "binary", - "query": "class Engine", - "latency_ms": 1052.5979999947594, - "num_results": 10, - "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py:0", - "error": null - }, - { - "strategy": "binary", - "query": "import numpy", - "latency_ms": 1217.217100005655, - "num_results": 10, - "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\__main__.py:0", - "error": null - }, - { - "strategy": "binary", - "query": "async def", - "latency_ms": 1276.9802000038908, - "num_results": 10, - "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\api_reranker.py:0", - "error": null - }, - { - "strategy": "binary", - "query": "raise ValueError", - "latency_ms": 1005.9053000004496, - "num_results": 10, - "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py:0", - "error": null - }, - { - "strategy": "binary", - "query": "how to parse json", - "latency_ms": 1330.8978999993997, - "num_results": 10, - "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py:0", - "error": null - }, - { - "strategy": 
"binary", - "query": "database connection", - "latency_ms": 1041.6685000018333, - "num_results": 10, - "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py:0", - "error": null - }, - { - "strategy": "binary", - "query": "error handling", - "latency_ms": 959.5361000028788, - "num_results": 10, - "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_004_dual_fts.py:0", - "error": null - }, - { - "strategy": "binary", - "query": "authentication logic", - "latency_ms": 1060.9395999999833, - "num_results": 10, - "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py:0", - "error": null - }, - { - "strategy": "binary", - "query": "file read write", - "latency_ms": 971.8680000005406, - "num_results": 10, - "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\watcher\\incremental_indexer.py:0", - "error": null - }, - { - "strategy": "binary", - "query": "embedding vector", - "latency_ms": 1135.879900000873, - "num_results": 10, - "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\embedder.py:0", - "error": null - }, - { - "strategy": "binary", - "query": "cosine similarity", - "latency_ms": 1188.1732000038028, - "num_results": 10, - "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py:0", - "error": null - }, - { - "strategy": "binary", - "query": "binary quantization", - "latency_ms": 1259.3522999959532, - "num_results": 10, - "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py:0", - "error": null - }, - { - "strategy": "binary", - "query": "hamming distance", - "latency_ms": 1329.6268999984022, - "num_results": 10, - "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py:0", - "error": null - }, - { - "strategy": "binary", - "query": "reranking", - "latency_ms": 1125.8439999946859, - "num_results": 10, - "top_result": 
"D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py:0", - "error": null - } - ], - "hybrid": [ - { - "strategy": "hybrid", - "query": "def search", - "latency_ms": 1117.0937999995658, - "num_results": 10, - "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py:0", - "error": null - }, - { - "strategy": "hybrid", - "query": "class Engine", - "latency_ms": 1039.3984000038472, - "num_results": 10, - "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py:0", - "error": null - }, - { - "strategy": "hybrid", - "query": "import numpy", - "latency_ms": 1144.7916999968584, - "num_results": 10, - "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\__main__.py:0", - "error": null - }, - { - "strategy": "hybrid", - "query": "async def", - "latency_ms": 857.0021999985329, - "num_results": 10, - "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\api_reranker.py:0", - "error": null - }, - { - "strategy": "hybrid", - "query": "raise ValueError", - "latency_ms": 957.5578000003588, - "num_results": 10, - "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py:0", - "error": null - }, - { - "strategy": "hybrid", - "query": "how to parse json", - "latency_ms": 1216.5708000029554, - "num_results": 10, - "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py:0", - "error": null - }, - { - "strategy": "hybrid", - "query": "database connection", - "latency_ms": 1154.8929000055068, - "num_results": 10, - "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py:0", - "error": null - }, - { - "strategy": "hybrid", - "query": "error handling", - "latency_ms": 1130.696000000171, - "num_results": 10, - "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_004_dual_fts.py:0", - "error": null - }, - { - "strategy": "hybrid", - "query": "authentication logic", - "latency_ms": 
1112.8943000003346, - "num_results": 10, - "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py:0", - "error": null - }, - { - "strategy": "hybrid", - "query": "file read write", - "latency_ms": 1172.5986000019475, - "num_results": 10, - "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\watcher\\incremental_indexer.py:0", - "error": null - }, - { - "strategy": "hybrid", - "query": "embedding vector", - "latency_ms": 1278.8890000010724, - "num_results": 10, - "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\embedder.py:0", - "error": null - }, - { - "strategy": "hybrid", - "query": "cosine similarity", - "latency_ms": 1024.2393000007723, - "num_results": 10, - "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py:0", - "error": null - }, - { - "strategy": "hybrid", - "query": "binary quantization", - "latency_ms": 1243.6786999969627, - "num_results": 10, - "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py:0", - "error": null - }, - { - "strategy": "hybrid", - "query": "hamming distance", - "latency_ms": 1081.3100999948801, - "num_results": 10, - "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py:0", - "error": null - }, - { - "strategy": "hybrid", - "query": "reranking", - "latency_ms": 1135.4881000006571, - "num_results": 10, - "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py:0", - "error": null - } - ] - } -} \ No newline at end of file diff --git a/codex-lens/benchmarks/results/ccw_smart_search_stage2.json b/codex-lens/benchmarks/results/ccw_smart_search_stage2.json deleted file mode 100644 index 418bac3e..00000000 --- a/codex-lens/benchmarks/results/ccw_smart_search_stage2.json +++ /dev/null @@ -1,1704 +0,0 @@ -{ - "timestamp": "2026-03-12 15:52:13", - "source": "D:\\Claude_dms3", - "queries_file": 
"D:\\Claude_dms3\\codex-lens\\benchmarks\\accuracy_queries_ccw_smart_search.jsonl", - "query_count": 16, - "k": 10, - "coarse_k": 100, - "local_only": true, - "strategies": { - "dense_rerank": { - "query_count": 16, - "hit_at_k": 0.0, - "mrr_at_k": 0.0, - "avg_recall_at_k": 0.0, - "avg_latency_ms": 2493.8517937501892, - "p50_latency_ms": 2304.0422499999404, - "p95_latency_ms": 4031.03429999575, - "errors": 0, - "strategy": "dense_rerank", - "stage2_mode": null - }, - "staged:precomputed": { - "query_count": 16, - "hit_at_k": 0.0, - "mrr_at_k": 0.0, - "avg_recall_at_k": 0.0, - "avg_latency_ms": 2238.0576249985024, - "p50_latency_ms": 1962.1620500013232, - "p95_latency_ms": 3110.8512249961495, - "errors": 0, - "strategy": "staged", - "stage2_mode": "precomputed" - }, - "staged:realtime": { - "query_count": 16, - "hit_at_k": 0.0, - "mrr_at_k": 0.0, - "avg_recall_at_k": 0.0, - "avg_latency_ms": 10686.986462499015, - "p50_latency_ms": 7027.59129999578, - "p95_latency_ms": 28732.387600000948, - "errors": 0, - "strategy": "staged", - "stage2_mode": "realtime" - }, - "staged:static_global_graph": { - "query_count": 16, - "hit_at_k": 0.0, - "mrr_at_k": 0.0, - "avg_recall_at_k": 0.0, - "avg_latency_ms": 2284.2186249988154, - "p50_latency_ms": 2174.274800002575, - "p95_latency_ms": 3254.683274999261, - "errors": 0, - "strategy": "staged", - "stage2_mode": "static_global_graph" - } - }, - "stage2_mode_matrix": { - "precomputed": { - "query_count": 16, - "hit_at_k": 0.0, - "mrr_at_k": 0.0, - "avg_recall_at_k": 0.0, - "avg_latency_ms": 2238.0576249985024, - "p50_latency_ms": 1962.1620500013232, - "p95_latency_ms": 3110.8512249961495, - "errors": 0, - "strategy": "staged", - "stage2_mode": "precomputed" - }, - "realtime": { - "query_count": 16, - "hit_at_k": 0.0, - "mrr_at_k": 0.0, - "avg_recall_at_k": 0.0, - "avg_latency_ms": 10686.986462499015, - "p50_latency_ms": 7027.59129999578, - "p95_latency_ms": 28732.387600000948, - "errors": 0, - "strategy": "staged", - "stage2_mode": 
"realtime" - }, - "static_global_graph": { - "query_count": 16, - "hit_at_k": 0.0, - "mrr_at_k": 0.0, - "avg_recall_at_k": 0.0, - "avg_latency_ms": 2284.2186249988154, - "p50_latency_ms": 2174.274800002575, - "p95_latency_ms": 3254.683274999261, - "errors": 0, - "strategy": "staged", - "stage2_mode": "static_global_graph" - } - }, - "pairwise_stage2_deltas": [ - { - "mode_a": "precomputed", - "mode_b": "realtime", - "hit_at_k_delta": 0.0, - "mrr_at_k_delta": 0.0, - "avg_recall_at_k_delta": 0.0, - "avg_latency_ms_delta": -8448.928837500513 - }, - { - "mode_a": "precomputed", - "mode_b": "static_global_graph", - "hit_at_k_delta": 0.0, - "mrr_at_k_delta": 0.0, - "avg_recall_at_k_delta": 0.0, - "avg_latency_ms_delta": -46.161000000312924 - }, - { - "mode_a": "realtime", - "mode_b": "static_global_graph", - "hit_at_k_delta": 0.0, - "mrr_at_k_delta": 0.0, - "avg_recall_at_k_delta": 0.0, - "avg_latency_ms_delta": 8402.7678375002 - } - ], - "config": { - "embedding_backend": "fastembed", - "embedding_model": "code", - "embedding_use_gpu": false, - "reranker_backend": "onnx", - "reranker_model": "D:/Claude_dms3/codex-lens/.cache/huggingface/models/Xenova--ms-marco-MiniLM-L-6-v2", - "enable_staged_rerank": true, - "enable_cross_encoder_rerank": true - }, - "evaluations": [ - { - "query": "executeHybridMode dense_rerank semantic smart_search", - "intent": "ccw-semantic-routing", - "notes": "CCW semantic mode delegates to CodexLens dense_rerank.", - "relevant_paths": [ - "D:\\Claude_dms3\\ccw\\src\\tools\\smart-search.ts" - ], - "runs": { - "dense_rerank": { - "strategy_key": "dense_rerank", - "strategy": "dense_rerank", - "stage2_mode": null, - "latency_ms": 5607.933899998665, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\discovery-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\path-validator.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\list.d.ts", - 
"d:\\claude_dms3\\ccw\\dist\\utils\\secret-redactor.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\health-check-service.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\graph-routes.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\view.d.ts", - "d:\\claude_dms3\\ccw\\dist\\commands\\upgrade.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\api-key-tester.d.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:precomputed": { - "strategy_key": "staged:precomputed", - "strategy": "staged", - "stage2_mode": "precomputed", - "latency_ms": 1853.0870999991894, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\graph-routes.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\upgrade.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\api-key-tester.d.ts", - "d:\\claude_dms3\\ccw\\dist\\utils\\shell-escape.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\rate-limiter.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\commands-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\discovery-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\core-memory-store.js", - "d:\\claude_dms3\\ccw\\dist\\core\\core-memory-store.d.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:realtime": { - "strategy_key": "staged:realtime", - "strategy": "staged", - "stage2_mode": "realtime", - "latency_ms": 10468.899399995804, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\graph-routes.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\upgrade.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\api-key-tester.d.ts", - "d:\\claude_dms3\\ccw\\dist\\utils\\shell-escape.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\rate-limiter.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\commands-routes.js", - 
"d:\\claude_dms3\\ccw\\dist\\core\\routes\\discovery-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\core-memory-store.js", - "d:\\claude_dms3\\ccw\\dist\\core\\core-memory-store.d.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:static_global_graph": { - "strategy_key": "staged:static_global_graph", - "strategy": "staged", - "stage2_mode": "static_global_graph", - "latency_ms": 1445.837599992752, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\graph-routes.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\upgrade.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\api-key-tester.d.ts", - "d:\\claude_dms3\\ccw\\dist\\utils\\shell-escape.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\rate-limiter.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\commands-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\discovery-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\core-memory-store.js", - "d:\\claude_dms3\\ccw\\dist\\core\\core-memory-store.d.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - } - }, - { - "query": "parse CodexLens JSON output strip ANSI smart_search", - "intent": "ccw-json-fallback", - "notes": "Covers JSON/plain-text fallback handling for CodexLens output.", - "relevant_paths": [ - "D:\\Claude_dms3\\ccw\\src\\tools\\smart-search.ts" - ], - "runs": { - "dense_rerank": { - "strategy_key": "dense_rerank", - "strategy": "dense_rerank", - "stage2_mode": null, - "latency_ms": 1518.7583000063896, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\dist\\utils\\secret-redactor.js", - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\rules-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\unified-context-builder.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\secret-redactor.d.ts", - 
"d:\\claude_dms3\\ccw\\dist\\core\\routes\\cli-routes.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\cli.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\claude-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\outline-queries.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\mcp-templates-db.js" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:precomputed": { - "strategy_key": "staged:precomputed", - "strategy": "staged", - "stage2_mode": "precomputed", - "latency_ms": 1467.957000002265, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\claude-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\outline-queries.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\loop.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\flow-executor.js", - "d:\\claude_dms3\\ccw\\dist\\core\\memory-store.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\remote-notification-service.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\install.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\secret-redactor.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\rules-routes.js" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:realtime": { - "strategy_key": "staged:realtime", - "strategy": "staged", - "stage2_mode": "realtime", - "latency_ms": 35793.74619999528, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\claude-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\outline-queries.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\loop.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\flow-executor.js", - "d:\\claude_dms3\\ccw\\dist\\core\\memory-store.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\remote-notification-service.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\install.js", - 
"d:\\claude_dms3\\ccw\\dist\\utils\\secret-redactor.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\rules-routes.js" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:static_global_graph": { - "strategy_key": "staged:static_global_graph", - "strategy": "staged", - "stage2_mode": "static_global_graph", - "latency_ms": 2019.9724999964237, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\claude-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\outline-queries.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\loop.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\flow-executor.js", - "d:\\claude_dms3\\ccw\\dist\\core\\memory-store.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\remote-notification-service.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\install.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\secret-redactor.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\rules-routes.js" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - } - }, - { - "query": "smart_search init embed search action schema", - "intent": "ccw-action-schema", - "notes": "Find the Zod schema that defines init/embed/search actions.", - "relevant_paths": [ - "D:\\Claude_dms3\\ccw\\src\\tools\\smart-search.ts" - ], - "runs": { - "dense_rerank": { - "strategy_key": "dense_rerank", - "strategy": "dense_rerank", - "stage2_mode": null, - "latency_ms": 2091.47919999063, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\discovery-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\outline-parser.d.ts", - "d:\\claude_dms3\\ccw\\dist\\utils\\shell-escape.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\rate-limiter.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\unified-memory-service.js", - 
"d:\\claude_dms3\\ccw\\dist\\utils\\secret-redactor.d.ts", - "d:\\claude_dms3\\ccw\\dist\\utils\\file-reader.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\rules-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\secret-redactor.js" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:precomputed": { - "strategy_key": "staged:precomputed", - "strategy": "staged", - "stage2_mode": "precomputed", - "latency_ms": 2017.3953999876976, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\dist\\utils\\secret-redactor.d.ts", - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\claude-routes.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\upgrade.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\team-routes.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\serve.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\api-key-tester.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\unified-memory-service.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\discovery-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\outline-parser.d.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:realtime": { - "strategy_key": "staged:realtime", - "strategy": "staged", - "stage2_mode": "realtime", - "latency_ms": 2941.078400015831, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\dist\\utils\\secret-redactor.d.ts", - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\claude-routes.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\upgrade.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\team-routes.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\serve.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\api-key-tester.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\unified-memory-service.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\discovery-routes.js", - 
"d:\\claude_dms3\\ccw\\dist\\utils\\outline-parser.d.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:static_global_graph": { - "strategy_key": "staged:static_global_graph", - "strategy": "staged", - "stage2_mode": "static_global_graph", - "latency_ms": 1921.6328999996185, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\dist\\utils\\secret-redactor.d.ts", - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\claude-routes.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\upgrade.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\team-routes.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\serve.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\api-key-tester.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\unified-memory-service.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\discovery-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\outline-parser.d.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - } - }, - { - "query": "auto init missing job dedupe smart_search", - "intent": "ccw-auto-init", - "notes": "Targets background init/embed warmup and dedupe state.", - "relevant_paths": [ - "D:\\Claude_dms3\\ccw\\src\\tools\\smart-search.ts" - ], - "runs": { - "dense_rerank": { - "strategy_key": "dense_rerank", - "strategy": "dense_rerank", - "stage2_mode": null, - "latency_ms": 1662.2750000059605, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\discovery-routes.js", - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\claude-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\docs-frontend.js", - "d:\\claude_dms3\\ccw\\dist\\core\\unified-memory-service.js", - "d:\\claude_dms3\\ccw\\dist\\core\\cache-manager.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\dashboard-launcher.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\rules-routes.js", 
- "d:\\claude_dms3\\ccw\\dist\\commands\\upgrade.d.ts", - "d:\\claude_dms3\\ccw\\dist\\utils\\react-frontend.js" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:precomputed": { - "strategy_key": "staged:precomputed", - "strategy": "staged", - "stage2_mode": "precomputed", - "latency_ms": 1746.6091000139713, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\rules-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\react-frontend.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\discovery-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\team-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\server.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\flow-executor.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\cli-session-mux.d.ts", - "d:\\claude_dms3\\ccw\\dist\\commands\\cli.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\loop.d.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:realtime": { - "strategy_key": "staged:realtime", - "strategy": "staged", - "stage2_mode": "realtime", - "latency_ms": 6291.47570002079, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\rules-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\react-frontend.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\discovery-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\team-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\server.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\flow-executor.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\cli-session-mux.d.ts", - "d:\\claude_dms3\\ccw\\dist\\commands\\cli.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\loop.d.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:static_global_graph": { - 
"strategy_key": "staged:static_global_graph", - "strategy": "staged", - "stage2_mode": "static_global_graph", - "latency_ms": 1718.0125000029802, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\rules-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\react-frontend.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\discovery-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\team-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\server.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\flow-executor.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\cli-session-mux.d.ts", - "d:\\claude_dms3\\ccw\\dist\\commands\\cli.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\loop.d.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - } - }, - { - "query": "smart_search exact mode fallback to CodexLens fts", - "intent": "ccw-exact-fallback", - "notes": "Tracks the exact-mode fallback path into CodexLens FTS.", - "relevant_paths": [ - "D:\\Claude_dms3\\ccw\\src\\tools\\smart-search.ts" - ], - "runs": { - "dense_rerank": { - "strategy_key": "dense_rerank", - "strategy": "dense_rerank", - "stage2_mode": null, - "latency_ms": 1511.011400014162, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\graph-routes.js", - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\upgrade.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\files-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\codexlens-path.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\provider-routes.d.ts", - "d:\\claude_dms3\\ccw\\dist\\utils\\uv-manager.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\path-validator.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\outline-queries.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\secret-redactor.d.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - 
}, - "staged:precomputed": { - "strategy_key": "staged:precomputed", - "strategy": "staged", - "stage2_mode": "precomputed", - "latency_ms": 1897.7800999879837, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\claude-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\files-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\path-validator.js", - "d:\\claude_dms3\\ccw\\dist\\core\\unified-context-builder.js", - "d:\\claude_dms3\\ccw\\dist\\core\\server.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\graph-routes.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\install.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\codexlens-routes.d.ts", - "d:\\claude_dms3\\ccw\\dist\\commands\\upgrade.d.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:realtime": { - "strategy_key": "staged:realtime", - "strategy": "staged", - "stage2_mode": "realtime", - "latency_ms": 6647.179499998689, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\claude-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\files-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\path-validator.js", - "d:\\claude_dms3\\ccw\\dist\\core\\unified-context-builder.js", - "d:\\claude_dms3\\ccw\\dist\\core\\server.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\graph-routes.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\install.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\codexlens-routes.d.ts", - "d:\\claude_dms3\\ccw\\dist\\commands\\upgrade.d.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:static_global_graph": { - "strategy_key": "staged:static_global_graph", - "strategy": "staged", - "stage2_mode": "static_global_graph", - "latency_ms": 2328.577100008726, - "topk_paths": [ - 
"d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\claude-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\files-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\path-validator.js", - "d:\\claude_dms3\\ccw\\dist\\core\\unified-context-builder.js", - "d:\\claude_dms3\\ccw\\dist\\core\\server.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\graph-routes.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\install.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\codexlens-routes.d.ts", - "d:\\claude_dms3\\ccw\\dist\\commands\\upgrade.d.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - } - }, - { - "query": "smart_search settings snapshot embedding backend reranker backend staged stage2 mode", - "intent": "ccw-config-snapshot", - "notes": "Reads local config snapshot for embedding/reranker/staged pipeline settings.", - "relevant_paths": [ - "D:\\Claude_dms3\\ccw\\src\\tools\\smart-search.ts" - ], - "runs": { - "dense_rerank": { - "strategy_key": "dense_rerank", - "strategy": "dense_rerank", - "stage2_mode": null, - "latency_ms": 2516.6053000092506, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\discovery-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\unified-memory-service.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\rate-limiter.d.ts", - "d:\\claude_dms3\\ccw\\dist\\utils\\file-reader.d.ts", - "d:\\claude_dms3\\ccw\\dist\\utils\\docs-frontend.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\help-routes.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\workflow.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\core-memory.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\claude-routes.js" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:precomputed": { - "strategy_key": "staged:precomputed", - "strategy": "staged", 
- "stage2_mode": "precomputed", - "latency_ms": 2778.8519999980927, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\docs-frontend.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\core-memory.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\claude-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\file-reader.js", - "d:\\claude_dms3\\ccw\\dist\\core\\websocket.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\config-backup.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\discovery-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\unified-memory-service.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\serve.d.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:realtime": { - "strategy_key": "staged:realtime", - "strategy": "staged", - "stage2_mode": "realtime", - "latency_ms": 4940.330799981952, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\docs-frontend.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\core-memory.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\claude-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\file-reader.js", - "d:\\claude_dms3\\ccw\\dist\\core\\websocket.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\config-backup.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\discovery-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\unified-memory-service.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\serve.d.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:static_global_graph": { - "strategy_key": "staged:static_global_graph", - "strategy": "staged", - "stage2_mode": "static_global_graph", - "latency_ms": 3191.194299995899, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\docs-frontend.js", - 
"d:\\claude_dms3\\ccw\\dist\\commands\\core-memory.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\claude-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\file-reader.js", - "d:\\claude_dms3\\ccw\\dist\\core\\websocket.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\config-backup.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\discovery-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\unified-memory-service.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\serve.d.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - } - }, - { - "query": "embedding backend fastembed local litellm api config", - "intent": "codexlens-embedding-config", - "notes": "Local-only benchmark should resolve to fastembed defaults.", - "relevant_paths": [ - "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\config.py" - ], - "runs": { - "dense_rerank": { - "strategy_key": "dense_rerank", - "strategy": "dense_rerank", - "stage2_mode": null, - "latency_ms": 2773.382699996233, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\dist\\utils\\file-reader.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\claude-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\unified-context-builder.js", - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\cli.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\remote-notification-service.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\files-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\outline-parser.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\path-resolver.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\flow-executor.js" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:precomputed": { - "strategy_key": "staged:precomputed", - "strategy": "staged", - "stage2_mode": "precomputed", - "latency_ms": 2465.842600002885, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - 
"d:\\claude_dms3\\ccw\\dist\\core\\routes\\files-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\file-reader.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\flow-executor.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\path-resolver.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\claude-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\unified-context-builder.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\cli.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\remote-notification-service.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\outline-parser.js" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:realtime": { - "strategy_key": "staged:realtime", - "strategy": "staged", - "stage2_mode": "realtime", - "latency_ms": 17898.587700009346, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\.venv\\lib\\site-packages\\sympy\\plotting\\backends\\base_backend.py", - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\files-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\file-reader.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\flow-executor.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\path-resolver.js", - "d:\\claude_dms3\\ccw\\dist\\core\\pattern-detector.js", - "d:\\claude_dms3\\ccw\\dist\\core\\unified-context-builder.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\cli.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\remote-notification-service.js" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:static_global_graph": { - "strategy_key": "staged:static_global_graph", - "strategy": "staged", - "stage2_mode": "static_global_graph", - "latency_ms": 3331.694400012493, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\files-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\file-reader.js", - 
"d:\\claude_dms3\\ccw\\dist\\core\\services\\flow-executor.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\path-resolver.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\claude-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\unified-context-builder.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\cli.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\remote-notification-service.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\outline-parser.js" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - } - }, - { - "query": "reranker backend onnx api legacy configuration", - "intent": "codexlens-reranker-config", - "notes": "Covers both config dataclass fields and env overrides.", - "relevant_paths": [ - "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\env_config.py" - ], - "runs": { - "dense_rerank": { - "strategy_key": "dense_rerank", - "strategy": "dense_rerank", - "stage2_mode": null, - "latency_ms": 3433.85640001297, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\dist\\core\\services\\remote-notification-service.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\commands-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\react-frontend.js", - "d:\\claude_dms3\\ccw\\dist\\core\\unified-context-builder.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\claude-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\docs-frontend.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\uv-manager.d.ts", - "d:\\claude_dms3\\ccw\\dist\\commands\\issue.d.ts", - "d:\\claude_dms3\\ccw\\dist\\commands\\cli.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\discovery-routes.js" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:precomputed": { - "strategy_key": "staged:precomputed", - "strategy": "staged", - "stage2_mode": "precomputed", - "latency_ms": 2722.7298999875784, - "topk_paths": [ - 
"d:\\claude_dms3\\ccw\\dist\\core\\services\\remote-notification-service.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\uv-manager.d.ts", - "d:\\claude_dms3\\ccw\\dist\\commands\\cli.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\discovery-routes.js", - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\commands-routes.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\install.js", - "d:\\claude_dms3\\ccw\\dist\\core\\memory-store.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\flow-executor.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\react-frontend.js" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:realtime": { - "strategy_key": "staged:realtime", - "strategy": "staged", - "stage2_mode": "realtime", - "latency_ms": 6998.953399986029, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\.venv\\lib\\site-packages\\sympy\\plotting\\backends\\base_backend.py", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\remote-notification-service.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\uv-manager.d.ts", - "d:\\claude_dms3\\ccw\\dist\\commands\\cli.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\discovery-routes.js", - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\data-aggregator.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\install.js", - "d:\\claude_dms3\\ccw\\dist\\core\\memory-store.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\flow-executor.js" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:static_global_graph": { - "strategy_key": "staged:static_global_graph", - "strategy": "staged", - "stage2_mode": "static_global_graph", - "latency_ms": 2707.838899999857, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\dist\\core\\services\\remote-notification-service.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\uv-manager.d.ts", - 
"d:\\claude_dms3\\ccw\\dist\\commands\\cli.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\discovery-routes.js", - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\commands-routes.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\install.js", - "d:\\claude_dms3\\ccw\\dist\\core\\memory-store.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\flow-executor.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\react-frontend.js" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - } - }, - { - "query": "staged stage2 mode precomputed realtime static_global_graph", - "intent": "codexlens-stage2-config", - "notes": "Benchmark matrix should exercise the three supported stage2 modes.", - "relevant_paths": [ - "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\env_config.py" - ], - "runs": { - "dense_rerank": { - "strategy_key": "dense_rerank", - "strategy": "dense_rerank", - "stage2_mode": null, - "latency_ms": 2557.460299998522, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\dist\\commands\\upgrade.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\health-check-service.d.ts", - "d:\\claude_dms3\\ccw\\dist\\utils\\python-utils.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\flow-executor.d.ts", - "d:\\claude_dms3\\ccw\\dist\\utils\\outline-queries.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\claude-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\codexlens-path.d.ts", - "d:\\claude_dms3\\ccw\\dist\\utils\\path-resolver.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\graph-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\graph-routes.d.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:precomputed": { - "strategy_key": "staged:precomputed", - "strategy": "staged", - "stage2_mode": "precomputed", - "latency_ms": 2611.47199998796, - "topk_paths": [ 
- "d:\\claude_dms3\\ccw\\dist\\utils\\codexlens-path.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\graph-routes.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\api-key-tester.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\dashboard-generator.d.ts", - "d:\\claude_dms3\\ccw\\dist\\commands\\team.d.ts", - "d:\\claude_dms3\\ccw\\dist\\commands\\install.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\pending-question-service.d.ts", - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\unified-memory-service.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\upgrade.d.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:realtime": { - "strategy_key": "staged:realtime", - "strategy": "staged", - "stage2_mode": "realtime", - "latency_ms": 9986.3125, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\dist\\utils\\codexlens-path.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\graph-routes.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\api-key-tester.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\dashboard-generator.d.ts", - "d:\\claude_dms3\\ccw\\dist\\commands\\team.d.ts", - "d:\\claude_dms3\\ccw\\dist\\commands\\install.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\pending-question-service.d.ts", - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\unified-memory-service.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\upgrade.d.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:static_global_graph": { - "strategy_key": "staged:static_global_graph", - "strategy": "staged", - "stage2_mode": "static_global_graph", - "latency_ms": 2705.1958999931812, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\dist\\utils\\codexlens-path.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\graph-routes.d.ts", - 
"d:\\claude_dms3\\ccw\\dist\\core\\services\\api-key-tester.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\dashboard-generator.d.ts", - "d:\\claude_dms3\\ccw\\dist\\commands\\team.d.ts", - "d:\\claude_dms3\\ccw\\dist\\commands\\install.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\pending-question-service.d.ts", - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\unified-memory-service.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\upgrade.d.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - } - }, - { - "query": "enable staged rerank stage 4 config", - "intent": "codexlens-stage4-rerank", - "notes": "Stage 4 rerank flag needs to stay enabled for local benchmarks.", - "relevant_paths": [ - "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\config.py" - ], - "runs": { - "dense_rerank": { - "strategy_key": "dense_rerank", - "strategy": "dense_rerank", - "stage2_mode": null, - "latency_ms": 2839.552300006151, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\dist\\utils\\path-resolver.d.ts", - "d:\\claude_dms3\\ccw\\dist\\utils\\python-utils.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\health-check-service.d.ts", - "d:\\claude_dms3\\ccw\\dist\\utils\\package-discovery.d.ts", - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\session-path-resolver.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\orchestrator-routes.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\graph-routes.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\discovery-routes.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\data-aggregator.d.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:precomputed": { - "strategy_key": "staged:precomputed", - "strategy": "staged", - "stage2_mode": "precomputed", - "latency_ms": 3044.0294999927282, - "topk_paths": [ - 
"d:\\claude_dms3\\ccw\\dist\\utils\\package-discovery.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\graph-routes.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\ccw-routes.d.ts", - "d:\\claude_dms3\\ccw\\dist\\utils\\uv-manager.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\remote-notification-service.js", - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\session-path-resolver.d.ts", - "d:\\claude_dms3\\ccw\\dist\\commands\\workflow.js", - "d:\\claude_dms3\\ccw\\dist\\core\\websocket.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\unified-memory-service.js" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:realtime": { - "strategy_key": "staged:realtime", - "strategy": "staged", - "stage2_mode": "realtime", - "latency_ms": 12196.75379998982, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\build\\lib\\codexlens\\semantic\\reranker\\fastembed_reranker.py", - "d:\\claude_dms3\\ccw\\dist\\utils\\package-discovery.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\graph-routes.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\ccw-routes.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\remote-notification-service.js", - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\session-path-resolver.d.ts", - "d:\\claude_dms3\\ccw\\dist\\commands\\workflow.js", - "d:\\claude_dms3\\ccw\\dist\\core\\websocket.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\unified-memory-service.js" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:static_global_graph": { - "strategy_key": "staged:static_global_graph", - "strategy": "staged", - "stage2_mode": "static_global_graph", - "latency_ms": 2919.969099998474, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\dist\\utils\\package-discovery.d.ts", - 
"d:\\claude_dms3\\ccw\\dist\\core\\routes\\graph-routes.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\ccw-routes.d.ts", - "d:\\claude_dms3\\ccw\\dist\\utils\\uv-manager.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\remote-notification-service.js", - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\session-path-resolver.d.ts", - "d:\\claude_dms3\\ccw\\dist\\commands\\workflow.js", - "d:\\claude_dms3\\ccw\\dist\\core\\websocket.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\unified-memory-service.js" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - } - }, - { - "query": "cascade_search dense_rerank staged pipeline ChainSearchEngine", - "intent": "chain-search-cascade", - "notes": "Baseline query for the central retrieval engine.", - "relevant_paths": [ - "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py" - ], - "runs": { - "dense_rerank": { - "strategy_key": "dense_rerank", - "strategy": "dense_rerank", - "stage2_mode": null, - "latency_ms": 3082.173699989915, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\discovery-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\claude-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\unified-memory-service.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\package-discovery.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\commands-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\health-check-service.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\graph-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\dashboard-generator.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\rate-limiter.d.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:precomputed": { - "strategy_key": "staged:precomputed", - "strategy": "staged", - "stage2_mode": 
"precomputed", - "latency_ms": 3012.5525999963284, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\claude-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\commands-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\rate-limiter.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\core-memory-store.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\config-backup.d.ts", - "d:\\claude_dms3\\ccw\\dist\\utils\\path-validator.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\discovery-routes.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\workflow.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\memory.js" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:realtime": { - "strategy_key": "staged:realtime", - "strategy": "staged", - "stage2_mode": "realtime", - "latency_ms": 10854.694199994206, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\claude-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\commands-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\rate-limiter.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\core-memory-store.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\config-backup.d.ts", - "d:\\claude_dms3\\ccw\\dist\\utils\\path-validator.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\discovery-routes.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\workflow.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\memory.js" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:static_global_graph": { - "strategy_key": "staged:static_global_graph", - "strategy": "staged", - "stage2_mode": "static_global_graph", - "latency_ms": 3229.01289999485, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - 
"d:\\claude_dms3\\ccw\\dist\\core\\routes\\claude-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\commands-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\rate-limiter.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\core-memory-store.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\config-backup.d.ts", - "d:\\claude_dms3\\ccw\\dist\\utils\\path-validator.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\discovery-routes.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\workflow.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\memory.js" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - } - }, - { - "query": "realtime LSP expand stage2 search pipeline", - "intent": "chain-search-stage2-realtime", - "notes": "Targets realtime stage2 expansion logic.", - "relevant_paths": [ - "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py" - ], - "runs": { - "dense_rerank": { - "strategy_key": "dense_rerank", - "strategy": "dense_rerank", - "stage2_mode": null, - "latency_ms": 3505.4010999947786, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\claude-routes.js", - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\unified-memory-service.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\rules-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\flow-executor.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\graph-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\outline-queries.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\path-resolver.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\files-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\memory-extraction-pipeline.js" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:precomputed": { - "strategy_key": "staged:precomputed", - "strategy": "staged", - "stage2_mode": "precomputed", - "latency_ms": 3311.3164000064135, - "topk_paths": 
[ - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\claude-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\graph-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\path-resolver.d.ts", - "d:\\claude_dms3\\ccw\\dist\\utils\\outline-parser.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\api-key-tester.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\dashboard-generator.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\unified-memory-service.d.ts", - "d:\\claude_dms3\\ccw\\dist\\commands\\core-memory.js", - "d:\\claude_dms3\\ccw\\dist\\core\\unified-memory-service.js" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:realtime": { - "strategy_key": "staged:realtime", - "strategy": "staged", - "stage2_mode": "realtime", - "latency_ms": 26378.601400002837, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\.venv\\lib\\site-packages\\optimum\\onnxruntime\\pipelines.py", - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\claude-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\graph-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\path-resolver.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\health-check-service.d.ts", - "d:\\claude_dms3\\ccw\\dist\\utils\\outline-parser.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\api-key-tester.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\dashboard-generator.d.ts", - "d:\\claude_dms3\\ccw\\dist\\commands\\core-memory.js" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:static_global_graph": { - "strategy_key": "staged:static_global_graph", - "strategy": "staged", - "stage2_mode": "static_global_graph", - "latency_ms": 2472.5419999957085, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - 
"d:\\claude_dms3\\ccw\\dist\\core\\routes\\claude-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\graph-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\path-resolver.d.ts", - "d:\\claude_dms3\\ccw\\dist\\utils\\outline-parser.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\api-key-tester.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\dashboard-generator.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\unified-memory-service.d.ts", - "d:\\claude_dms3\\ccw\\dist\\commands\\core-memory.js", - "d:\\claude_dms3\\ccw\\dist\\core\\unified-memory-service.js" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - } - }, - { - "query": "static global graph stage2 expansion implementation", - "intent": "chain-search-stage2-static", - "notes": "Targets static_global_graph stage2 expansion logic.", - "relevant_paths": [ - "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py" - ], - "runs": { - "dense_rerank": { - "strategy_key": "dense_rerank", - "strategy": "dense_rerank", - "stage2_mode": null, - "latency_ms": 1676.1588000059128, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\dist\\core\\services\\health-check-service.d.ts", - "d:\\claude_dms3\\ccw\\dist\\utils\\path-resolver.d.ts", - "d:\\claude_dms3\\ccw\\dist\\commands\\upgrade.d.ts", - "d:\\claude_dms3\\ccw\\dist\\commands\\loop.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\system-routes.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\claude-routes.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\serve.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\commands-routes.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\install.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\team.d.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:precomputed": { - "strategy_key": "staged:precomputed", - "strategy": "staged", - "stage2_mode": "precomputed", - "latency_ms": 1614.9786999970675, - "topk_paths": [ - 
"d:\\claude_dms3\\ccw\\dist\\core\\routes\\claude-routes.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\serve.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\commands-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\config-backup.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\health-check-service.d.ts", - "d:\\claude_dms3\\ccw\\dist\\utils\\docs-frontend.d.ts", - "d:\\claude_dms3\\ccw\\dist\\utils\\codexlens-path.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\memory-extraction-pipeline.js", - "d:\\claude_dms3\\ccw\\dist\\core\\pattern-detector.js", - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:realtime": { - "strategy_key": "staged:realtime", - "strategy": "staged", - "stage2_mode": "realtime", - "latency_ms": 2153.07349999249, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\claude-routes.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\serve.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\commands-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\config-backup.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\health-check-service.d.ts", - "d:\\claude_dms3\\ccw\\dist\\utils\\docs-frontend.d.ts", - "d:\\claude_dms3\\ccw\\dist\\utils\\codexlens-path.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\memory-extraction-pipeline.js", - "d:\\claude_dms3\\ccw\\dist\\core\\pattern-detector.js", - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:static_global_graph": { - "strategy_key": "staged:static_global_graph", - "strategy": "staged", - "stage2_mode": "static_global_graph", - "latency_ms": 1658.4901999980211, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\claude-routes.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\serve.d.ts", - 
"d:\\claude_dms3\\ccw\\dist\\core\\routes\\commands-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\config-backup.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\health-check-service.d.ts", - "d:\\claude_dms3\\ccw\\dist\\utils\\docs-frontend.d.ts", - "d:\\claude_dms3\\ccw\\dist\\utils\\codexlens-path.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\memory-extraction-pipeline.js", - "d:\\claude_dms3\\ccw\\dist\\core\\pattern-detector.js", - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - } - }, - { - "query": "cross encoder rerank stage 4 implementation", - "intent": "chain-search-rerank", - "notes": "Relevant for dense_rerank and staged rerank latency comparisons.", - "relevant_paths": [ - "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py" - ], - "runs": { - "dense_rerank": { - "strategy_key": "dense_rerank", - "strategy": "dense_rerank", - "stage2_mode": null, - "latency_ms": 1556.9279999881983, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\claude-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\docs-frontend.js", - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\discovery-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\uv-manager.d.ts", - "d:\\claude_dms3\\ccw\\dist\\utils\\package-discovery.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\graph-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\commands-routes.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\workflow.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\files-routes.js" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:precomputed": { - "strategy_key": "staged:precomputed", - "strategy": "staged", - "stage2_mode": "precomputed", - "latency_ms": 1772.8751000016928, - "topk_paths": [ - 
"d:\\claude_dms3\\ccw\\dist\\commands\\install.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\discovery-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\package-discovery.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\graph-routes.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\workflow.js", - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\config-backup.js", - "d:\\claude_dms3\\ccw\\dist\\core\\server.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\flow-executor.js", - "d:\\claude_dms3\\ccw\\dist\\core\\cache-manager.js" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:realtime": { - "strategy_key": "staged:realtime", - "strategy": "staged", - "stage2_mode": "realtime", - "latency_ms": 7056.229200005531, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\.venv\\lib\\site-packages\\fastembed\\rerank\\cross_encoder\\onnx_text_cross_encoder.py", - "d:\\claude_dms3\\ccw\\dist\\commands\\install.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\discovery-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\package-discovery.js", - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\workflow.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\config-backup.js", - "d:\\claude_dms3\\ccw\\dist\\core\\server.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\flow-executor.js", - "d:\\claude_dms3\\ccw\\dist\\core\\cache-manager.js" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:static_global_graph": { - "strategy_key": "staged:static_global_graph", - "strategy": "staged", - "stage2_mode": "static_global_graph", - "latency_ms": 1721.4015000015497, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\dist\\commands\\install.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\discovery-routes.js", - 
"d:\\claude_dms3\\ccw\\dist\\utils\\package-discovery.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\graph-routes.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\workflow.js", - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\config-backup.js", - "d:\\claude_dms3\\ccw\\dist\\core\\server.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\flow-executor.js", - "d:\\claude_dms3\\ccw\\dist\\core\\cache-manager.js" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - } - }, - { - "query": "get_reranker factory onnx backend selection", - "intent": "reranker-factory", - "notes": "Keeps the benchmark aligned with local ONNX reranker selection.", - "relevant_paths": [ - "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\factory.py" - ], - "runs": { - "dense_rerank": { - "strategy_key": "dense_rerank", - "strategy": "dense_rerank", - "stage2_mode": null, - "latency_ms": 2038.9054999947548, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\discovery-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\flow-executor.js", - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\graph-routes.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\uninstall.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\docs-frontend.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\upgrade.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\data-aggregator.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\remote-notification-service.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\uv-manager.d.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:precomputed": { - "strategy_key": "staged:precomputed", - "strategy": "staged", - "stage2_mode": "precomputed", - "latency_ms": 1906.9287000149488, - "topk_paths": [ - 
"d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\discovery-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\uv-manager.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\remote-notification-service.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\files-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\docs-frontend.js", - "d:\\claude_dms3\\ccw\\dist\\core\\websocket.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\loop.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\flow-executor.js", - "d:\\claude_dms3\\ccw\\dist\\core\\websocket.d.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:realtime": { - "strategy_key": "staged:realtime", - "strategy": "staged", - "stage2_mode": "realtime", - "latency_ms": 4809.299199998379, - "topk_paths": [ - "d:\\claude_dms3\\.workflow\\.bench\\ccw-smart-search-mini-20260312\\codex-lens\\src\\codexlens\\semantic\\reranker\\factory.py", - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\upgrade.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\discovery-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\uv-manager.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\remote-notification-service.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\docs-frontend.js", - "d:\\claude_dms3\\ccw\\dist\\core\\websocket.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\loop.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\flow-executor.js" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:static_global_graph": { - "strategy_key": "staged:static_global_graph", - "strategy": "staged", - "stage2_mode": "static_global_graph", - "latency_ms": 1549.4464999884367, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\discovery-routes.js", - 
"d:\\claude_dms3\\ccw\\dist\\utils\\uv-manager.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\remote-notification-service.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\files-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\docs-frontend.js", - "d:\\claude_dms3\\ccw\\dist\\core\\websocket.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\loop.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\flow-executor.js", - "d:\\claude_dms3\\ccw\\dist\\core\\websocket.d.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - } - }, - { - "query": "EMBEDDING_BACKEND and RERANKER_BACKEND environment variables", - "intent": "env-overrides", - "notes": "Covers CCW/CodexLens local-only environment overrides.", - "relevant_paths": [ - "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\env_config.py" - ], - "runs": { - "dense_rerank": { - "strategy_key": "dense_rerank", - "strategy": "dense_rerank", - "stage2_mode": null, - "latency_ms": 1529.7467999905348, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\graph-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\shell-escape.d.ts", - "d:\\claude_dms3\\ccw\\dist\\commands\\upgrade.d.ts", - "d:\\claude_dms3\\ccw\\dist\\utils\\react-frontend.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\path-resolver.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\files-routes.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\docs-frontend.d.ts", - "d:\\claude_dms3\\ccw\\dist\\utils\\codexlens-path.d.ts", - "d:\\claude_dms3\\ccw\\dist\\utils\\python-utils.d.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:precomputed": { - "strategy_key": "staged:precomputed", - "strategy": "staged", - "stage2_mode": "precomputed", - "latency_ms": 1584.515799999237, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - 
"d:\\claude_dms3\\ccw\\dist\\utils\\path-resolver.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\files-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\graph-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\remote-notification-service.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\flow-executor.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\workflow.js", - "d:\\claude_dms3\\ccw\\dist\\core\\unified-memory-service.js", - "d:\\claude_dms3\\ccw\\dist\\core\\memory-job-scheduler.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\shell-escape.d.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:realtime": { - "strategy_key": "staged:realtime", - "strategy": "staged", - "stage2_mode": "realtime", - "latency_ms": 5576.568499997258, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\path-resolver.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\files-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\graph-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\remote-notification-service.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\flow-executor.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\workflow.js", - "d:\\claude_dms3\\ccw\\dist\\core\\unified-memory-service.js", - "d:\\claude_dms3\\ccw\\dist\\core\\memory-job-scheduler.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\shell-escape.d.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:static_global_graph": { - "strategy_key": "staged:static_global_graph", - "strategy": "staged", - "stage2_mode": "static_global_graph", - "latency_ms": 1626.6797000020742, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\frontend\\dist\\assets\\index-b4psv8bd.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\path-resolver.d.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\files-routes.js", - 
"d:\\claude_dms3\\ccw\\dist\\core\\routes\\graph-routes.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\remote-notification-service.js", - "d:\\claude_dms3\\ccw\\dist\\core\\services\\flow-executor.js", - "d:\\claude_dms3\\ccw\\dist\\commands\\workflow.js", - "d:\\claude_dms3\\ccw\\dist\\core\\unified-memory-service.js", - "d:\\claude_dms3\\ccw\\dist\\core\\memory-job-scheduler.js", - "d:\\claude_dms3\\ccw\\dist\\utils\\shell-escape.d.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - } - } - ] -} \ No newline at end of file diff --git a/codex-lens/benchmarks/results/ccw_smart_search_stage2_sample4_20260314.json b/codex-lens/benchmarks/results/ccw_smart_search_stage2_sample4_20260314.json deleted file mode 100644 index cb40f339..00000000 --- a/codex-lens/benchmarks/results/ccw_smart_search_stage2_sample4_20260314.json +++ /dev/null @@ -1,526 +0,0 @@ -{ - "timestamp": "2026-03-14 23:16:55", - "source": "D:\\Claude_dms3", - "queries_file": "D:\\Claude_dms3\\codex-lens\\benchmarks\\accuracy_queries_ccw_smart_search.jsonl", - "query_count": 4, - "k": 10, - "coarse_k": 100, - "local_only": true, - "strategies": { - "dense_rerank": { - "query_count": 4, - "hit_at_k": 0.0, - "mrr_at_k": 0.0, - "avg_recall_at_k": 0.0, - "avg_latency_ms": 20171.940174996853, - "p50_latency_ms": 14222.247749984264, - "p95_latency_ms": 35222.31535999476, - "errors": 0, - "strategy": "dense_rerank", - "stage2_mode": null - }, - "staged:precomputed": { - "query_count": 4, - "hit_at_k": 0.0, - "mrr_at_k": 0.0, - "avg_recall_at_k": 0.0, - "avg_latency_ms": 13679.793299987912, - "p50_latency_ms": 12918.63379997015, - "p95_latency_ms": 16434.964765003322, - "errors": 0, - "strategy": "staged", - "stage2_mode": "precomputed" - }, - "staged:realtime": { - "query_count": 4, - "hit_at_k": 0.0, - "mrr_at_k": 0.0, - "avg_recall_at_k": 0.0, - "avg_latency_ms": 13885.101849973202, - "p50_latency_ms": 13826.323699980974, - "p95_latency_ms": 
14867.712269958853, - "errors": 0, - "strategy": "staged", - "stage2_mode": "realtime" - }, - "staged:static_global_graph": { - "query_count": 4, - "hit_at_k": 0.0, - "mrr_at_k": 0.0, - "avg_recall_at_k": 0.0, - "avg_latency_ms": 13336.124025002122, - "p50_latency_ms": 13415.476950019598, - "p95_latency_ms": 13514.329230004549, - "errors": 0, - "strategy": "staged", - "stage2_mode": "static_global_graph" - } - }, - "stage2_mode_matrix": { - "precomputed": { - "query_count": 4, - "hit_at_k": 0.0, - "mrr_at_k": 0.0, - "avg_recall_at_k": 0.0, - "avg_latency_ms": 13679.793299987912, - "p50_latency_ms": 12918.63379997015, - "p95_latency_ms": 16434.964765003322, - "errors": 0, - "strategy": "staged", - "stage2_mode": "precomputed" - }, - "realtime": { - "query_count": 4, - "hit_at_k": 0.0, - "mrr_at_k": 0.0, - "avg_recall_at_k": 0.0, - "avg_latency_ms": 13885.101849973202, - "p50_latency_ms": 13826.323699980974, - "p95_latency_ms": 14867.712269958853, - "errors": 0, - "strategy": "staged", - "stage2_mode": "realtime" - }, - "static_global_graph": { - "query_count": 4, - "hit_at_k": 0.0, - "mrr_at_k": 0.0, - "avg_recall_at_k": 0.0, - "avg_latency_ms": 13336.124025002122, - "p50_latency_ms": 13415.476950019598, - "p95_latency_ms": 13514.329230004549, - "errors": 0, - "strategy": "staged", - "stage2_mode": "static_global_graph" - } - }, - "pairwise_stage2_deltas": [ - { - "mode_a": "precomputed", - "mode_b": "realtime", - "hit_at_k_delta": 0.0, - "mrr_at_k_delta": 0.0, - "avg_recall_at_k_delta": 0.0, - "avg_latency_ms_delta": -205.30854998528957 - }, - { - "mode_a": "precomputed", - "mode_b": "static_global_graph", - "hit_at_k_delta": 0.0, - "mrr_at_k_delta": 0.0, - "avg_recall_at_k_delta": 0.0, - "avg_latency_ms_delta": 343.66927498579025 - }, - { - "mode_a": "realtime", - "mode_b": "static_global_graph", - "hit_at_k_delta": 0.0, - "mrr_at_k_delta": 0.0, - "avg_recall_at_k_delta": 0.0, - "avg_latency_ms_delta": 548.9778249710798 - } - ], - "config": { - 
"embedding_backend": "fastembed", - "embedding_model": "code", - "embedding_use_gpu": false, - "reranker_backend": "onnx", - "reranker_model": "cross-encoder/ms-marco-MiniLM-L-6-v2", - "enable_staged_rerank": true, - "enable_cross_encoder_rerank": true - }, - "evaluations": [ - { - "query": "executeHybridMode dense_rerank semantic smart_search", - "intent": "ccw-semantic-routing", - "notes": "CCW semantic mode delegates to CodexLens dense_rerank.", - "relevant_paths": [ - "D:\\Claude_dms3\\ccw\\src\\tools\\smart-search.ts" - ], - "runs": { - "dense_rerank": { - "strategy_key": "dense_rerank", - "strategy": "dense_rerank", - "stage2_mode": null, - "latency_ms": 38829.27079999447, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\src\\core\\routes\\issue-routes.ts", - "d:\\claude_dms3\\ccw\\src\\tools\\session-manager.ts", - "d:\\claude_dms3\\ccw\\src\\types\\queue-types.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\nativesessionpanel.tsx", - "d:\\claude_dms3\\ccw\\src\\core\\history-importer.ts", - "d:\\claude_dms3\\ccw\\src\\core\\memory-extraction-pipeline.ts", - "d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\skills-page.spec.ts", - "d:\\claude_dms3\\ccw\\dist\\tools\\discover-design-files.js", - "d:\\claude_dms3\\ccw\\frontend\\src\\components\\api-settings\\clisettingsmodal.tsx", - "d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\api-settings.spec.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:precomputed": { - "strategy_key": "staged:precomputed", - "strategy": "staged", - "stage2_mode": "precomputed", - "latency_ms": 16915.833400011063, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts", - "d:\\claude_dms3\\ccw\\src\\commands\\memory.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\useissues.test.tsx", - "d:\\claude_dms3\\ccw\\src\\core\\routes\\cli-sessions-routes.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts", - 
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\filepreview.tsx", - "d:\\claude_dms3\\ccw\\src\\core\\hooks\\hook-templates.ts", - "d:\\claude_dms3\\ccw\\src\\utils\\file-reader.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\cli-sessions-routes.js", - "d:\\claude_dms3\\ccw\\src\\core\\history-importer.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:realtime": { - "strategy_key": "staged:realtime", - "strategy": "staged", - "stage2_mode": "realtime", - "latency_ms": 13961.2567999959, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts", - "d:\\claude_dms3\\ccw\\src\\commands\\memory.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\useissues.test.tsx", - "d:\\claude_dms3\\ccw\\src\\core\\routes\\cli-sessions-routes.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\filepreview.tsx", - "d:\\claude_dms3\\ccw\\src\\core\\hooks\\hook-templates.ts", - "d:\\claude_dms3\\ccw\\src\\utils\\file-reader.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\cli-sessions-routes.js", - "d:\\claude_dms3\\ccw\\src\\core\\history-importer.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:static_global_graph": { - "strategy_key": "staged:static_global_graph", - "strategy": "staged", - "stage2_mode": "static_global_graph", - "latency_ms": 12986.330999970436, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts", - "d:\\claude_dms3\\ccw\\src\\commands\\memory.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\useissues.test.tsx", - "d:\\claude_dms3\\ccw\\src\\core\\routes\\cli-sessions-routes.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\filepreview.tsx", - "d:\\claude_dms3\\ccw\\src\\core\\hooks\\hook-templates.ts", - 
"d:\\claude_dms3\\ccw\\src\\utils\\file-reader.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\cli-sessions-routes.js", - "d:\\claude_dms3\\ccw\\src\\core\\history-importer.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - } - }, - { - "query": "parse CodexLens JSON output strip ANSI smart_search", - "intent": "ccw-json-fallback", - "notes": "Covers JSON/plain-text fallback handling for CodexLens output.", - "relevant_paths": [ - "D:\\Claude_dms3\\ccw\\src\\tools\\smart-search.ts" - ], - "runs": { - "dense_rerank": { - "strategy_key": "dense_rerank", - "strategy": "dense_rerank", - "stage2_mode": null, - "latency_ms": 14782.901199996471, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\src\\tools\\codex-lens-lsp.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\components\\issue\\queue\\queueexecuteinsession.tsx", - "d:\\claude_dms3\\ccw\\frontend\\src\\components\\terminal-dashboard\\queuepanel.tsx", - "d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\usewebsocket.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\useflows.ts", - "d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\api-error-monitoring.spec.ts", - "d:\\claude_dms3\\ccw\\tests\\native-session-discovery.test.ts", - "d:\\claude_dms3\\ccw\\src\\core\\services\\checkpoint-service.ts", - "d:\\claude_dms3\\ccw\\tests\\integration\\system-routes.test.ts", - "d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:precomputed": { - "strategy_key": "staged:precomputed", - "strategy": "staged", - "stage2_mode": "precomputed", - "latency_ms": 13710.042499959469, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\userealtimeupdates.ts", - "d:\\claude_dms3\\ccw\\src\\core\\routes\\cli-routes.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\stores\\queueexecutionstore.ts", - "d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts", - 
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\themeshare.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\clistreampanel.tsx", - "d:\\claude_dms3\\ccw\\frontend\\src\\components\\terminal-panel\\queueexecutionlistview.tsx", - "d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\api-settings.spec.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\test\\i18n.tsx", - "d:\\claude_dms3\\ccw\\dist\\core\\history-importer.js" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:realtime": { - "strategy_key": "staged:realtime", - "strategy": "staged", - "stage2_mode": "realtime", - "latency_ms": 15027.674999952316, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\userealtimeupdates.ts", - "d:\\claude_dms3\\ccw\\src\\core\\routes\\cli-routes.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\stores\\queueexecutionstore.ts", - "d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\lib\\themeshare.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\clistreampanel.tsx", - "d:\\claude_dms3\\ccw\\frontend\\src\\components\\terminal-panel\\queueexecutionlistview.tsx", - "d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\api-settings.spec.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\test\\i18n.tsx", - "d:\\claude_dms3\\ccw\\dist\\core\\history-importer.js" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:static_global_graph": { - "strategy_key": "staged:static_global_graph", - "strategy": "staged", - "stage2_mode": "static_global_graph", - "latency_ms": 13389.622500002384, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\userealtimeupdates.ts", - "d:\\claude_dms3\\ccw\\src\\core\\routes\\cli-routes.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\stores\\queueexecutionstore.ts", - "d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts", - 
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\themeshare.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\clistreampanel.tsx", - "d:\\claude_dms3\\ccw\\frontend\\src\\components\\terminal-panel\\queueexecutionlistview.tsx", - "d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\api-settings.spec.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\test\\i18n.tsx", - "d:\\claude_dms3\\ccw\\dist\\core\\history-importer.js" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - } - }, - { - "query": "smart_search init embed search action schema", - "intent": "ccw-action-schema", - "notes": "Find the Zod schema that defines init/embed/search actions.", - "relevant_paths": [ - "D:\\Claude_dms3\\ccw\\src\\tools\\smart-search.ts" - ], - "runs": { - "dense_rerank": { - "strategy_key": "dense_rerank", - "strategy": "dense_rerank", - "stage2_mode": null, - "latency_ms": 13661.594299972057, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\src\\tools\\ask-question.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\components\\a2ui\\a2uipopupcard.tsx", - "d:\\claude_dms3\\ccw\\src\\core\\routes\\discovery-routes.ts", - "d:\\claude_dms3\\ccw\\src\\core\\a2ui\\a2uiwebsockethandler.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\useissues.test.tsx", - "d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\discovery.spec.ts", - "d:\\claude_dms3\\ccw\\src\\tools\\__tests__\\ask-question.test.ts", - "d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\api-settings.spec.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\a2ui\\a2uiwebsockethandler.js", - "d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\dashboard.spec.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:precomputed": { - "strategy_key": "staged:precomputed", - "strategy": "staged", - "stage2_mode": "precomputed", - "latency_ms": 12127.225099980831, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts", - 
"d:\\claude_dms3\\ccw\\src\\core\\lite-scanner-complete.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\themeselector.tsx", - "d:\\claude_dms3\\ccw\\frontend\\src\\components\\team\\teamheader.tsx", - "d:\\claude_dms3\\ccw\\src\\tools\\ask-question.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\components\\a2ui\\a2uipopupcard.tsx", - "d:\\claude_dms3\\ccw\\frontend\\src\\components\\issue\\discovery\\findinglist.tsx", - "d:\\claude_dms3\\ccw\\frontend\\src\\components\\api-settings\\clisettingsmodal.tsx", - "d:\\claude_dms3\\ccw\\src\\core\\routes\\discovery-routes.ts", - "d:\\claude_dms3\\ccw\\src\\core\\a2ui\\a2uiwebsockethandler.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:realtime": { - "strategy_key": "staged:realtime", - "strategy": "staged", - "stage2_mode": "realtime", - "latency_ms": 12860.084999978542, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts", - "d:\\claude_dms3\\ccw\\src\\core\\lite-scanner-complete.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\themeselector.tsx", - "d:\\claude_dms3\\ccw\\frontend\\src\\components\\team\\teamheader.tsx", - "d:\\claude_dms3\\ccw\\src\\tools\\ask-question.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\components\\a2ui\\a2uipopupcard.tsx", - "d:\\claude_dms3\\ccw\\frontend\\src\\components\\issue\\discovery\\findinglist.tsx", - "d:\\claude_dms3\\ccw\\frontend\\src\\components\\api-settings\\clisettingsmodal.tsx", - "d:\\claude_dms3\\ccw\\src\\core\\routes\\discovery-routes.ts", - "d:\\claude_dms3\\ccw\\src\\core\\a2ui\\a2uiwebsockethandler.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:static_global_graph": { - "strategy_key": "staged:static_global_graph", - "strategy": "staged", - "stage2_mode": "static_global_graph", - "latency_ms": 13441.331400036812, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts", - 
"d:\\claude_dms3\\ccw\\src\\core\\lite-scanner-complete.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\themeselector.tsx", - "d:\\claude_dms3\\ccw\\frontend\\src\\components\\team\\teamheader.tsx", - "d:\\claude_dms3\\ccw\\src\\tools\\ask-question.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\components\\a2ui\\a2uipopupcard.tsx", - "d:\\claude_dms3\\ccw\\frontend\\src\\components\\issue\\discovery\\findinglist.tsx", - "d:\\claude_dms3\\ccw\\frontend\\src\\components\\api-settings\\clisettingsmodal.tsx", - "d:\\claude_dms3\\ccw\\src\\core\\routes\\discovery-routes.ts", - "d:\\claude_dms3\\ccw\\src\\core\\a2ui\\a2uiwebsockethandler.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - } - }, - { - "query": "auto init missing job dedupe smart_search", - "intent": "ccw-auto-init", - "notes": "Targets background init/embed warmup and dedupe state.", - "relevant_paths": [ - "D:\\Claude_dms3\\ccw\\src\\tools\\smart-search.ts" - ], - "runs": { - "dense_rerank": { - "strategy_key": "dense_rerank", - "strategy": "dense_rerank", - "stage2_mode": null, - "latency_ms": 13413.994400024414, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\frontend\\src\\pages\\memorypage.tsx", - "d:\\claude_dms3\\ccw\\src\\core\\routes\\memory-routes.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\usememory.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\batchoperationtoolbar.tsx", - "d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\memory.spec.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\useprompthistory.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\stores\\flowstore.ts", - "d:\\claude_dms3\\ccw\\src\\services\\deepwiki-service.ts", - "d:\\claude_dms3\\ccw\\src\\core\\routes\\claude-routes.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:precomputed": { - "strategy_key": "staged:precomputed", - "strategy": 
"staged", - "stage2_mode": "precomputed", - "latency_ms": 11966.072200000286, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\src\\commands\\memory.ts", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\handlers.py", - "d:\\claude_dms3\\ccw\\frontend\\src\\components\\ui\\commandcombobox.tsx", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\global_graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\definition.py", - "d:\\claude_dms3\\ccw\\frontend\\src\\orchestrator\\orchestrationplanbuilder.ts", - "d:\\claude_dms3\\codex-lens\\build\\lib\\codexlens\\lsp\\handlers.py", - "d:\\claude_dms3\\codex-lens\\build\\lib\\codexlens\\search\\global_graph_expander.py", - "d:\\claude_dms3\\codex-lens\\build\\lib\\codexlens\\api\\definition.py", - "d:\\claude_dms3\\ccw\\frontend\\src\\pages\\memorypage.tsx" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:realtime": { - "strategy_key": "staged:realtime", - "strategy": "staged", - "stage2_mode": "realtime", - "latency_ms": 13691.39059996605, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\src\\commands\\memory.ts", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\handlers.py", - "d:\\claude_dms3\\ccw\\frontend\\src\\components\\ui\\commandcombobox.tsx", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\global_graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\definition.py", - "d:\\claude_dms3\\ccw\\frontend\\src\\orchestrator\\orchestrationplanbuilder.ts", - "d:\\claude_dms3\\codex-lens\\build\\lib\\codexlens\\lsp\\handlers.py", - "d:\\claude_dms3\\codex-lens\\build\\lib\\codexlens\\search\\global_graph_expander.py", - "d:\\claude_dms3\\codex-lens\\build\\lib\\codexlens\\api\\definition.py", - "d:\\claude_dms3\\ccw\\frontend\\src\\pages\\memorypage.tsx" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - }, - "staged:static_global_graph": { - "strategy_key": 
"staged:static_global_graph", - "strategy": "staged", - "stage2_mode": "static_global_graph", - "latency_ms": 13527.211199998856, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\src\\commands\\memory.ts", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\handlers.py", - "d:\\claude_dms3\\ccw\\frontend\\src\\components\\ui\\commandcombobox.tsx", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\global_graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\definition.py", - "d:\\claude_dms3\\ccw\\frontend\\src\\orchestrator\\orchestrationplanbuilder.ts", - "d:\\claude_dms3\\codex-lens\\build\\lib\\codexlens\\lsp\\handlers.py", - "d:\\claude_dms3\\codex-lens\\build\\lib\\codexlens\\search\\global_graph_expander.py", - "d:\\claude_dms3\\codex-lens\\build\\lib\\codexlens\\api\\definition.py", - "d:\\claude_dms3\\ccw\\frontend\\src\\pages\\memorypage.tsx" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "error": null - } - } - } - ] -} \ No newline at end of file diff --git a/codex-lens/benchmarks/results/ccw_smart_search_stage2_smoke1_cpu_reranker_20260314.json b/codex-lens/benchmarks/results/ccw_smart_search_stage2_smoke1_cpu_reranker_20260314.json deleted file mode 100644 index a6f5dc8d..00000000 --- a/codex-lens/benchmarks/results/ccw_smart_search_stage2_smoke1_cpu_reranker_20260314.json +++ /dev/null @@ -1,415 +0,0 @@ -{ - "timestamp": "2026-03-15 00:19:16", - "source": "D:\\Claude_dms3", - "queries_file": "D:\\Claude_dms3\\codex-lens\\benchmarks\\accuracy_queries_ccw_smart_search.jsonl", - "query_count": 1, - "k": 10, - "coarse_k": 100, - "local_only": true, - "strategies": { - "auto": { - "query_count": 1, - "hit_at_k": 1.0, - "mrr_at_k": 1.0, - "avg_recall_at_k": 1.0, - "avg_latency_ms": 1377.3565999865532, - "p50_latency_ms": 1377.3565999865532, - "p95_latency_ms": 1377.3565999865532, - "avg_generated_artifact_count": 0.0, - "avg_test_file_count": 0.0, - "runs_with_generated_artifacts": 0, - 
"runs_with_test_files": 0, - "effective_methods": { - "fts": 1 - }, - "errors": 0, - "strategy": "auto", - "stage2_mode": null - }, - "fts": { - "query_count": 1, - "hit_at_k": 1.0, - "mrr_at_k": 1.0, - "avg_recall_at_k": 1.0, - "avg_latency_ms": 1460.0819000601768, - "p50_latency_ms": 1460.0819000601768, - "p95_latency_ms": 1460.0819000601768, - "avg_generated_artifact_count": 0.0, - "avg_test_file_count": 0.0, - "runs_with_generated_artifacts": 0, - "runs_with_test_files": 0, - "effective_methods": { - "fts": 1 - }, - "errors": 0, - "strategy": "fts", - "stage2_mode": null - }, - "hybrid": { - "query_count": 1, - "hit_at_k": 0.0, - "mrr_at_k": 0.0, - "avg_recall_at_k": 0.0, - "avg_latency_ms": 45991.74140000343, - "p50_latency_ms": 45991.74140000343, - "p95_latency_ms": 45991.74140000343, - "avg_generated_artifact_count": 0.0, - "avg_test_file_count": 0.0, - "runs_with_generated_artifacts": 0, - "runs_with_test_files": 0, - "effective_methods": { - "hybrid": 1 - }, - "errors": 0, - "strategy": "hybrid", - "stage2_mode": null - }, - "dense_rerank": { - "query_count": 1, - "hit_at_k": 0.0, - "mrr_at_k": 0.0, - "avg_recall_at_k": 0.0, - "avg_latency_ms": 22739.62610000372, - "p50_latency_ms": 22739.62610000372, - "p95_latency_ms": 22739.62610000372, - "avg_generated_artifact_count": 1.0, - "avg_test_file_count": 2.0, - "runs_with_generated_artifacts": 1, - "runs_with_test_files": 1, - "effective_methods": { - "dense_rerank": 1 - }, - "errors": 0, - "strategy": "dense_rerank", - "stage2_mode": null - }, - "staged:precomputed": { - "query_count": 1, - "hit_at_k": 0.0, - "mrr_at_k": 0.0, - "avg_recall_at_k": 0.0, - "avg_latency_ms": 14900.017599999905, - "p50_latency_ms": 14900.017599999905, - "p95_latency_ms": 14900.017599999905, - "avg_generated_artifact_count": 1.0, - "avg_test_file_count": 0.0, - "runs_with_generated_artifacts": 1, - "runs_with_test_files": 0, - "effective_methods": { - "staged": 1 - }, - "errors": 0, - "strategy": "staged", - "stage2_mode": 
"precomputed" - }, - "staged:realtime": { - "query_count": 1, - "hit_at_k": 0.0, - "mrr_at_k": 0.0, - "avg_recall_at_k": 0.0, - "avg_latency_ms": 14104.314599990845, - "p50_latency_ms": 14104.314599990845, - "p95_latency_ms": 14104.314599990845, - "avg_generated_artifact_count": 1.0, - "avg_test_file_count": 0.0, - "runs_with_generated_artifacts": 1, - "runs_with_test_files": 0, - "effective_methods": { - "staged": 1 - }, - "errors": 0, - "strategy": "staged", - "stage2_mode": "realtime" - }, - "staged:static_global_graph": { - "query_count": 1, - "hit_at_k": 0.0, - "mrr_at_k": 0.0, - "avg_recall_at_k": 0.0, - "avg_latency_ms": 11906.852500021458, - "p50_latency_ms": 11906.852500021458, - "p95_latency_ms": 11906.852500021458, - "avg_generated_artifact_count": 1.0, - "avg_test_file_count": 0.0, - "runs_with_generated_artifacts": 1, - "runs_with_test_files": 0, - "effective_methods": { - "staged": 1 - }, - "errors": 0, - "strategy": "staged", - "stage2_mode": "static_global_graph" - } - }, - "stage2_mode_matrix": { - "precomputed": { - "query_count": 1, - "hit_at_k": 0.0, - "mrr_at_k": 0.0, - "avg_recall_at_k": 0.0, - "avg_latency_ms": 14900.017599999905, - "p50_latency_ms": 14900.017599999905, - "p95_latency_ms": 14900.017599999905, - "avg_generated_artifact_count": 1.0, - "avg_test_file_count": 0.0, - "runs_with_generated_artifacts": 1, - "runs_with_test_files": 0, - "effective_methods": { - "staged": 1 - }, - "errors": 0, - "strategy": "staged", - "stage2_mode": "precomputed" - }, - "realtime": { - "query_count": 1, - "hit_at_k": 0.0, - "mrr_at_k": 0.0, - "avg_recall_at_k": 0.0, - "avg_latency_ms": 14104.314599990845, - "p50_latency_ms": 14104.314599990845, - "p95_latency_ms": 14104.314599990845, - "avg_generated_artifact_count": 1.0, - "avg_test_file_count": 0.0, - "runs_with_generated_artifacts": 1, - "runs_with_test_files": 0, - "effective_methods": { - "staged": 1 - }, - "errors": 0, - "strategy": "staged", - "stage2_mode": "realtime" - }, - 
"static_global_graph": { - "query_count": 1, - "hit_at_k": 0.0, - "mrr_at_k": 0.0, - "avg_recall_at_k": 0.0, - "avg_latency_ms": 11906.852500021458, - "p50_latency_ms": 11906.852500021458, - "p95_latency_ms": 11906.852500021458, - "avg_generated_artifact_count": 1.0, - "avg_test_file_count": 0.0, - "runs_with_generated_artifacts": 1, - "runs_with_test_files": 0, - "effective_methods": { - "staged": 1 - }, - "errors": 0, - "strategy": "staged", - "stage2_mode": "static_global_graph" - } - }, - "pairwise_stage2_deltas": [ - { - "mode_a": "precomputed", - "mode_b": "realtime", - "hit_at_k_delta": 0.0, - "mrr_at_k_delta": 0.0, - "avg_recall_at_k_delta": 0.0, - "avg_latency_ms_delta": 795.7030000090599 - }, - { - "mode_a": "precomputed", - "mode_b": "static_global_graph", - "hit_at_k_delta": 0.0, - "mrr_at_k_delta": 0.0, - "avg_recall_at_k_delta": 0.0, - "avg_latency_ms_delta": 2993.165099978447 - }, - { - "mode_a": "realtime", - "mode_b": "static_global_graph", - "hit_at_k_delta": 0.0, - "mrr_at_k_delta": 0.0, - "avg_recall_at_k_delta": 0.0, - "avg_latency_ms_delta": 2197.462099969387 - } - ], - "config": { - "embedding_backend": "fastembed", - "embedding_model": "code", - "embedding_use_gpu": false, - "reranker_backend": "onnx", - "reranker_model": "cross-encoder/ms-marco-MiniLM-L-6-v2", - "reranker_use_gpu": false, - "enable_staged_rerank": true, - "enable_cross_encoder_rerank": true - }, - "evaluations": [ - { - "query": "executeHybridMode dense_rerank semantic smart_search", - "intent": "ccw-semantic-routing", - "notes": "CCW semantic mode delegates to CodexLens dense_rerank.", - "relevant_paths": [ - "D:\\Claude_dms3\\ccw\\src\\tools\\smart-search.ts" - ], - "runs": { - "auto": { - "strategy_key": "auto", - "strategy": "auto", - "stage2_mode": null, - "effective_method": "fts", - "execution_method": "fts", - "latency_ms": 1377.3565999865532, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\src\\tools\\smart-search.ts" - ], - "first_hit_rank": 1, - "hit_at_k": true, - 
"recall_at_k": 1.0, - "generated_artifact_count": 0, - "test_file_count": 0, - "error": null - }, - "fts": { - "strategy_key": "fts", - "strategy": "fts", - "stage2_mode": null, - "effective_method": "fts", - "execution_method": "fts", - "latency_ms": 1460.0819000601768, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\src\\tools\\smart-search.ts" - ], - "first_hit_rank": 1, - "hit_at_k": true, - "recall_at_k": 1.0, - "generated_artifact_count": 0, - "test_file_count": 0, - "error": null - }, - "hybrid": { - "strategy_key": "hybrid", - "strategy": "hybrid", - "stage2_mode": null, - "effective_method": "hybrid", - "execution_method": "hybrid", - "latency_ms": 45991.74140000343, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\src\\config\\litellm-api-config-manager.ts", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\api_reranker.py", - "d:\\claude_dms3\\ccw\\src\\commands\\core-memory.ts", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\scripts\\generate_embeddings.py", - "d:\\claude_dms3\\ccw\\src\\core\\routes\\notification-routes.ts", - "d:\\claude_dms3\\ccw\\src\\tools\\team-msg.ts", - "d:\\claude_dms3\\ccw\\src\\types\\remote-notification.ts", - "d:\\claude_dms3\\ccw\\src\\core\\memory-store.ts", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "generated_artifact_count": 0, - "test_file_count": 0, - "error": null - }, - "dense_rerank": { - "strategy_key": "dense_rerank", - "strategy": "dense_rerank", - "stage2_mode": null, - "effective_method": "dense_rerank", - "execution_method": "cascade", - "latency_ms": 22739.62610000372, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\src\\core\\routes\\issue-routes.ts", - "d:\\claude_dms3\\ccw\\src\\tools\\session-manager.ts", - "d:\\claude_dms3\\ccw\\src\\types\\queue-types.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\nativesessionpanel.tsx", - 
"d:\\claude_dms3\\ccw\\src\\core\\history-importer.ts", - "d:\\claude_dms3\\ccw\\src\\core\\memory-extraction-pipeline.ts", - "d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\skills-page.spec.ts", - "d:\\claude_dms3\\ccw\\dist\\tools\\discover-design-files.js", - "d:\\claude_dms3\\ccw\\frontend\\src\\components\\api-settings\\clisettingsmodal.tsx", - "d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\api-settings.spec.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "generated_artifact_count": 1, - "test_file_count": 2, - "error": null - }, - "staged:precomputed": { - "strategy_key": "staged:precomputed", - "strategy": "staged", - "stage2_mode": "precomputed", - "effective_method": "staged", - "execution_method": "cascade", - "latency_ms": 14900.017599999905, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts", - "d:\\claude_dms3\\ccw\\src\\commands\\memory.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\useissues.test.tsx", - "d:\\claude_dms3\\ccw\\src\\core\\routes\\cli-sessions-routes.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\filepreview.tsx", - "d:\\claude_dms3\\ccw\\src\\core\\hooks\\hook-templates.ts", - "d:\\claude_dms3\\ccw\\src\\utils\\file-reader.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\cli-sessions-routes.js", - "d:\\claude_dms3\\ccw\\src\\core\\history-importer.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "generated_artifact_count": 1, - "test_file_count": 0, - "error": null - }, - "staged:realtime": { - "strategy_key": "staged:realtime", - "strategy": "staged", - "stage2_mode": "realtime", - "effective_method": "staged", - "execution_method": "cascade", - "latency_ms": 14104.314599990845, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts", - "d:\\claude_dms3\\ccw\\src\\commands\\memory.ts", - 
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\useissues.test.tsx", - "d:\\claude_dms3\\ccw\\src\\core\\routes\\cli-sessions-routes.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\filepreview.tsx", - "d:\\claude_dms3\\ccw\\src\\core\\hooks\\hook-templates.ts", - "d:\\claude_dms3\\ccw\\src\\utils\\file-reader.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\cli-sessions-routes.js", - "d:\\claude_dms3\\ccw\\src\\core\\history-importer.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "generated_artifact_count": 1, - "test_file_count": 0, - "error": null - }, - "staged:static_global_graph": { - "strategy_key": "staged:static_global_graph", - "strategy": "staged", - "stage2_mode": "static_global_graph", - "effective_method": "staged", - "execution_method": "cascade", - "latency_ms": 11906.852500021458, - "topk_paths": [ - "d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts", - "d:\\claude_dms3\\ccw\\src\\commands\\memory.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\useissues.test.tsx", - "d:\\claude_dms3\\ccw\\src\\core\\routes\\cli-sessions-routes.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts", - "d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\filepreview.tsx", - "d:\\claude_dms3\\ccw\\src\\core\\hooks\\hook-templates.ts", - "d:\\claude_dms3\\ccw\\src\\utils\\file-reader.ts", - "d:\\claude_dms3\\ccw\\dist\\core\\routes\\cli-sessions-routes.js", - "d:\\claude_dms3\\ccw\\src\\core\\history-importer.ts" - ], - "first_hit_rank": null, - "hit_at_k": false, - "recall_at_k": 0.0, - "generated_artifact_count": 1, - "test_file_count": 0, - "error": null - } - } - } - ] -} \ No newline at end of file diff --git a/codex-lens/benchmarks/results/compare_2026-02-09.json b/codex-lens/benchmarks/results/compare_2026-02-09.json deleted file mode 100644 index c9dfd28a..00000000 --- a/codex-lens/benchmarks/results/compare_2026-02-09.json +++ 
/dev/null @@ -1,453 +0,0 @@ -{ - "summary": { - "timestamp": "2026-02-09 11:08:47", - "source": "src", - "k": 10, - "coarse_k": 100, - "query_count": 7, - "avg_jaccard_topk": 0.41421235160730957, - "avg_rbo_topk": 0.22899068093857142, - "staged": { - "success": 7, - "avg_latency_ms": 32009.68328570468 - }, - "dense_rerank": { - "success": 7, - "avg_latency_ms": 2783.3305999977247 - } - }, - "comparisons": [ - { - "query": "class Config", - "staged": { - "strategy": "staged", - "query": "class Config", - "latency_ms": 40875.45489999652, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\file_watcher.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\__init__.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 10633.91399383545, - "stage2_expand_ms": 12487.980365753174, - "stage3_cluster_ms": 10781.587362289429, - "stage4_rerank_ms": 6914.837837219238 - }, - "stage_counts": { - "stage1_candidates": 100, - "stage2_expanded": 149, - "stage3_clustered": 20, - "stage4_reranked": 20 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "class Config", - "latency_ms": 3111.874899983406, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.1111111111111111, - "rbo_topk": 0.06741929885142856, - "staged_unique_files_topk": 10, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 8, - "dense_unique_dirs_topk": 4 - }, - { - "query": "def search", - "staged": { - "strategy": "staged", - "query": "def search", - "latency_ms": 38541.18510001898, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 548.8920211791992, - "stage2_expand_ms": 27176.724433898926, - "stage3_cluster_ms": 8352.917671203613, - "stage4_rerank_ms": 2392.6541805267334 - }, - "stage_counts": { - "stage1_candidates": 100, - "stage2_expanded": 101, - "stage3_clustered": 20, - 
"stage4_reranked": 20 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "def search", - "latency_ms": 2652.75, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.26666666666666666, - "rbo_topk": 0.2983708721671428, - "staged_unique_files_topk": 9, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 4, - "dense_unique_dirs_topk": 4 - }, - { - "query": "LspBridge", - "staged": { - "strategy": "staged", - "query": "LspBridge", - "latency_ms": 26319.983999997377, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\merkle_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 514.4834518432617, - "stage2_expand_ms": 14329.241514205933, - "stage3_cluster_ms": 9249.040842056274, - "stage4_rerank_ms": 2159.9059104919434 - }, - "stage_counts": { - "stage1_candidates": 100, - "stage2_expanded": 100, - "stage3_clustered": 20, - "stage4_reranked": 20 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "LspBridge", - "latency_ms": 2666.9745999872684, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.6666666666666666, - "rbo_topk": 0.3571430355128571, - "staged_unique_files_topk": 10, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 4, - "dense_unique_dirs_topk": 4 - }, - { - "query": "graph expansion", - "staged": { - "strategy": "staged", - "query": "graph expansion", - "latency_ms": 25696.087299972773, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 560.4684352874756, - "stage2_expand_ms": 13951.441526412964, - "stage3_cluster_ms": 8879.387140274048, - "stage4_rerank_ms": 2229.4514179229736 - }, - "stage_counts": { - "stage1_candidates": 100, - "stage2_expanded": 100, - "stage3_clustered": 20, - "stage4_reranked": 20 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "graph expansion", - "latency_ms": 2544.8630999922752, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.42857142857142855, - "rbo_topk": 0.13728894791142857, - 
"staged_unique_files_topk": 10, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 4, - "dense_unique_dirs_topk": 4 - }, - { - "query": "clustering strategy", - "staged": { - "strategy": "staged", - "query": "clustering strategy", - "latency_ms": 27387.41929998994, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 625.0262260437012, - "stage2_expand_ms": 14211.347103118896, - "stage3_cluster_ms": 10269.58680152893, - "stage4_rerank_ms": 2208.007335662842 - }, - "stage_counts": { - "stage1_candidates": 100, - "stage2_expanded": 100, - "stage3_clustered": 20, - "stage4_reranked": 20 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "clustering strategy", - "latency_ms": 2928.22389999032, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.17647058823529413, - "rbo_topk": 0.07116480920571429, - "staged_unique_files_topk": 10, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 4, - "dense_unique_dirs_topk": 4 - }, - { - "query": "error handling", - "staged": { - "strategy": "staged", - "query": "error handling", - "latency_ms": 23732.33979997039, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 504.0884017944336, - "stage2_expand_ms": 12899.415016174316, - "stage3_cluster_ms": 7881.027936935425, - "stage4_rerank_ms": 2372.1535205841064 - }, - "stage_counts": { - "stage1_candidates": 100, - "stage2_expanded": 100, - "stage3_clustered": 20, - "stage4_reranked": 20 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "error handling", - "latency_ms": 2946.439900010824, - "num_results": 10, - "topk_paths": [ - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.6666666666666666, - "rbo_topk": 0.19158624676285715, - "staged_unique_files_topk": 10, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 4, - "dense_unique_dirs_topk": 4 - }, - { - "query": "how to parse json", - "staged": { - "strategy": "staged", - "query": "how to parse json", - "latency_ms": 41515.31259998679, - "num_results": 9, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 601.7005443572998, - "stage2_expand_ms": 30052.319765090942, - "stage3_cluster_ms": 8409.791231155396, - "stage4_rerank_ms": 
2371.1729049682617 - }, - "stage_counts": { - "stage1_candidates": 100, - "stage2_expanded": 100, - "stage3_clustered": 20, - "stage4_reranked": 20 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "how to parse json", - "latency_ms": 2632.1878000199795, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.5833333333333334, - "rbo_topk": 0.4799615561585714, - "staged_unique_files_topk": 9, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 4, - "dense_unique_dirs_topk": 4 - } - ] -} \ No newline at end of file diff --git a/codex-lens/benchmarks/results/compare_2026-02-09_dir_rr_fast4.json b/codex-lens/benchmarks/results/compare_2026-02-09_dir_rr_fast4.json deleted file mode 100644 index 1b2aae3f..00000000 --- a/codex-lens/benchmarks/results/compare_2026-02-09_dir_rr_fast4.json +++ /dev/null @@ -1,356 +0,0 @@ -{ - "summary": { - "timestamp": "2026-02-09 20:37:28", - "source": "src", - "k": 10, - "coarse_k": 100, - "query_count": 7, - "avg_jaccard_topk": 0.12095811211246858, - "avg_rbo_topk": 0.09594444061244897, - "staged": { - "success": 7, - "avg_latency_ms": 2471.239057132176 - }, - "dense_rerank": { - "success": 7, - "avg_latency_ms": 
3087.217985710927 - } - }, - "comparisons": [ - { - "query": "class Config", - "staged": { - "strategy": "staged", - "query": "class Config", - "latency_ms": 312.2674999535084, - "num_results": 37, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\litellm_reranker.py" - ], - "stage_stats": null, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "class Config", - "latency_ms": 2672.6916999816895, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.05263157894736842, - "rbo_topk": 
0.045191399425714276, - "staged_unique_files_topk": 10, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 5, - "dense_unique_dirs_topk": 4 - }, - { - "query": "def search", - "staged": { - "strategy": "staged", - "query": "def search", - "latency_ms": 15344.861499994993, - "num_results": 3, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 81.70747756958008, - "stage2_expand_ms": 12762.907266616821, - "stage3_cluster_ms": 0.0021457672119140625, - "stage4_rerank_ms": 2422.7287769317627 - }, - "stage_counts": { - "stage1_candidates": 3, - "stage2_expanded": 4, - "stage2_unique_paths": 3, - "stage2_duplicate_paths": 1, - "stage3_clustered": 4, - "stage3_strategy": "dir_rr", - "stage4_reranked": 4 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "def search", - "latency_ms": 2908.5530000030994, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.09090909090909091, - "rbo_topk": 0.23541639942571424, - "staged_unique_files_topk": 2, - "dense_unique_files_topk": 10, - 
"staged_unique_dirs_topk": 2, - "dense_unique_dirs_topk": 4 - }, - { - "query": "LspBridge", - "staged": { - "strategy": "staged", - "query": "LspBridge", - "latency_ms": 328.4989999830723, - "num_results": 5, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py" - ], - "stage_stats": null, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "LspBridge", - "latency_ms": 3426.8526000082493, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.07142857142857142, - "rbo_topk": 0.045191399425714276, - "staged_unique_files_topk": 5, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 2, - "dense_unique_dirs_topk": 4 - }, - { - "query": "graph expansion", - "staged": { - "strategy": "staged", - "query": "graph expansion", - "latency_ms": 359.32230001688004, - "num_results": 11, - "topk_paths": [ - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\hybrid_search\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py" - ], - "stage_stats": null, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "graph expansion", - "latency_ms": 3472.025099992752, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.17647058823529413, - "rbo_topk": 0.06801300374142856, - "staged_unique_files_topk": 10, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 7, - "dense_unique_dirs_topk": 4 - }, - { - "query": "clustering strategy", - 
"staged": { - "strategy": "staged", - "query": "clustering strategy", - "latency_ms": 289.3139999806881, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py" - ], - "stage_stats": null, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "clustering strategy", - "latency_ms": 2859.5299999713898, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.1111111111111111, - "rbo_topk": 
0.04670528456571428, - "staged_unique_files_topk": 10, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 3, - "dense_unique_dirs_topk": 4 - }, - { - "query": "error handling", - "staged": { - "strategy": "staged", - "query": "error handling", - "latency_ms": 305.66699999570847, - "num_results": 5, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py" - ], - "stage_stats": null, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "error handling", - "latency_ms": 3101.3711999952793, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.07142857142857142, - "rbo_topk": 0.045191399425714276, - "staged_unique_files_topk": 5, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 4, - "dense_unique_dirs_topk": 4 - }, - { - "query": "how to parse json", - "staged": { - "strategy": "staged", - "query": "how to parse json", - 
"latency_ms": 358.74210000038147, - "num_results": 4, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py" - ], - "stage_stats": null, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "how to parse json", - "latency_ms": 3169.5023000240326, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.2727272727272727, - "rbo_topk": 0.18590219827714285, - "staged_unique_files_topk": 4, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 3, - "dense_unique_dirs_topk": 4 - } - ] -} \ No newline at end of file diff --git a/codex-lens/benchmarks/results/compare_2026-02-09_dir_rr_fast5.json b/codex-lens/benchmarks/results/compare_2026-02-09_dir_rr_fast5.json deleted file mode 100644 index 2d30d43e..00000000 --- a/codex-lens/benchmarks/results/compare_2026-02-09_dir_rr_fast5.json +++ /dev/null @@ -1,466 +0,0 @@ -{ - "summary": { - "timestamp": "2026-02-09 20:48:55", - "source": "src", - "k": 10, - "coarse_k": 100, - 
"query_count": 7, - "avg_jaccard_topk": 0.11418494830148965, - "avg_rbo_topk": 0.08910725003591835, - "staged": { - "success": 7, - "avg_latency_ms": 16443.109000005894 - }, - "dense_rerank": { - "success": 7, - "avg_latency_ms": 2919.481471432107 - } - }, - "comparisons": [ - { - "query": "class Config", - "staged": { - "strategy": "staged", - "query": "class Config", - "latency_ms": 6056.956700026989, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\file_watcher.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 113.12270164489746, - "stage1_fallback_search_ms": 262.55249977111816, - "stage2_expand_ms": 3022.8426456451416, - "stage3_cluster_ms": 1.155853271484375, - "stage4_rerank_ms": 2554.953098297119 - }, - "stage_counts": { - "stage1_candidates": 37, - "stage1_fallback_used": 1, - "stage2_expanded": 86, - "stage2_unique_paths": 53, - "stage2_duplicate_paths": 33, - "stage3_clustered": 20, - "stage3_strategy": "dir_rr", - "stage4_reranked": 20 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "class Config", - "latency_ms": 2788.0383999943733, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.05263157894736842, - "rbo_topk": 0.014635885139999999, - "staged_unique_files_topk": 10, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 8, - "dense_unique_dirs_topk": 4 - }, - { - "query": "def search", - "staged": { - "strategy": "staged", - "query": "def search", - "latency_ms": 12229.477500021458, - "num_results": 3, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 108.82282257080078, - "stage2_expand_ms": 9422.304153442383, - "stage3_cluster_ms": 0.001430511474609375, - "stage4_rerank_ms": 2611.234664916992 - }, - "stage_counts": { - "stage1_candidates": 3, - "stage2_expanded": 4, - "stage2_unique_paths": 3, - "stage2_duplicate_paths": 1, - "stage3_clustered": 4, - "stage3_strategy": "dir_rr", - "stage4_reranked": 4 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "def search", - "latency_ms": 2823.377499997616, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.09090909090909091, - "rbo_topk": 0.23541639942571424, - "staged_unique_files_topk": 2, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 2, - "dense_unique_dirs_topk": 4 - }, - { - "query": "LspBridge", - "staged": { - "strategy": "staged", - "query": "LspBridge", - "latency_ms": 33805.434699982405, - "num_results": 5, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 100.5556583404541, - "stage1_fallback_search_ms": 176.71489715576172, - "stage2_expand_ms": 31017.661809921265, - "stage3_cluster_ms": 0.001430511474609375, - "stage4_rerank_ms": 2403.3148288726807 - }, - "stage_counts": { - "stage1_candidates": 5, - "stage1_fallback_used": 1, - "stage2_expanded": 5, - "stage2_unique_paths": 5, - "stage2_duplicate_paths": 0, - "stage3_clustered": 5, - "stage3_strategy": "dir_rr", - "stage4_reranked": 5 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "LspBridge", - "latency_ms": 2906.127400010824, - "num_results": 10, - "topk_paths": [ - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.07142857142857142, - "rbo_topk": 0.045191399425714276, - "staged_unique_files_topk": 5, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 2, - "dense_unique_dirs_topk": 4 - }, - { - "query": "graph expansion", - "staged": { - "strategy": "staged", - "query": "graph expansion", - "latency_ms": 16790.213800013065, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\hybrid_search\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 110.00967025756836, - "stage1_fallback_search_ms": 176.9556999206543, - 
"stage2_expand_ms": 13929.782629013062, - "stage3_cluster_ms": 0.45800209045410156, - "stage4_rerank_ms": 2486.6883754730225 - }, - "stage_counts": { - "stage1_candidates": 11, - "stage1_fallback_used": 1, - "stage2_expanded": 29, - "stage2_unique_paths": 14, - "stage2_duplicate_paths": 15, - "stage3_clustered": 20, - "stage3_strategy": "dir_rr", - "stage4_reranked": 20 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "graph expansion", - "latency_ms": 2866.819000005722, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.1875, - "rbo_topk": 0.06893318399142857, - "staged_unique_files_topk": 9, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 8, - "dense_unique_dirs_topk": 4 - }, - { - "query": "clustering strategy", - "staged": { - "strategy": "staged", - "query": "clustering strategy", - "latency_ms": 9090.759900003672, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\__init__.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 85.28780937194824, - "stage1_fallback_search_ms": 183.7012767791748, - "stage2_expand_ms": 5557.527780532837, - "stage3_cluster_ms": 0.001430511474609375, - "stage4_rerank_ms": 3164.6268367767334 - }, - "stage_counts": { - "stage1_candidates": 10, - "stage1_fallback_used": 1, - "stage2_expanded": 10, - "stage2_unique_paths": 10, - "stage2_duplicate_paths": 0, - "stage3_clustered": 10, - "stage3_strategy": "dir_rr", - "stage4_reranked": 10 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "clustering strategy", - "latency_ms": 3062.4616000056267, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.1111111111111111, - "rbo_topk": 0.04670528456571428, - "staged_unique_files_topk": 10, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 3, - "dense_unique_dirs_topk": 4 - }, - { - "query": "error handling", - "staged": { - "strategy": "staged", - "query": "error handling", - "latency_ms": 19777.87659996748, - "num_results": 6, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 65.9482479095459, - "stage1_fallback_search_ms": 181.9770336151123, - "stage2_expand_ms": 16960.813760757446, - "stage3_cluster_ms": 0.00095367431640625, - "stage4_rerank_ms": 2472.1477031707764 - }, - "stage_counts": { - "stage1_candidates": 5, - "stage1_fallback_used": 1, - "stage2_expanded": 13, - "stage2_unique_paths": 6, - "stage2_duplicate_paths": 7, - "stage3_clustered": 13, - "stage3_strategy": "dir_rr", - "stage4_reranked": 13 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "error handling", - "latency_ms": 2854.169200003147, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.07142857142857142, - "rbo_topk": 0.045191399425714276, - "staged_unique_files_topk": 5, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 4, - "dense_unique_dirs_topk": 4 - }, - { - "query": "how to parse json", - "staged": { - "strategy": "staged", - "query": "how to parse json", - "latency_ms": 17351.04380002618, - "num_results": 7, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 119.1408634185791, - "stage1_fallback_search_ms": 246.2625503540039, - "stage2_expand_ms": 14137.234449386597, - "stage3_cluster_ms": 0.0011920928955078125, - "stage4_rerank_ms": 2750.417470932007 - }, - "stage_counts": { - "stage1_candidates": 4, - "stage1_fallback_used": 1, - "stage2_expanded": 11, - "stage2_unique_paths": 7, - "stage2_duplicate_paths": 4, - "stage3_clustered": 11, - "stage3_strategy": "dir_rr", - "stage4_reranked": 11 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "how to parse json", - "latency_ms": 3135.3772000074387, - "num_results": 10, - "topk_paths": [ - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.21428571428571427, - "rbo_topk": 0.16767719827714284, - "staged_unique_files_topk": 7, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 5, - "dense_unique_dirs_topk": 4 - } - ] -} \ No newline at end of file diff --git a/codex-lens/benchmarks/results/compare_2026-02-09_dir_rr_fast6.json b/codex-lens/benchmarks/results/compare_2026-02-09_dir_rr_fast6.json deleted file mode 100644 index bdc35197..00000000 --- a/codex-lens/benchmarks/results/compare_2026-02-09_dir_rr_fast6.json +++ /dev/null @@ -1,467 +0,0 @@ -{ - "summary": { - "timestamp": "2026-02-09 20:56:02", - "source": "src", - "k": 10, - "coarse_k": 100, - "query_count": 7, - "avg_jaccard_topk": 0.11350467619264612, - "avg_rbo_topk": 0.09062624799510204, - "staged": { - "success": 7, - "avg_latency_ms": 8679.35167142323 - }, - "dense_rerank": { - "success": 7, - "avg_latency_ms": 3097.294714289052 - } - }, - "comparisons": [ - { - "query": "class Config", - "staged": { - "strategy": "staged", - "query": "class Config", - "latency_ms": 6814.465099990368, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\file_watcher.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 85.55030822753906, - "stage1_fallback_search_ms": 197.95989990234375, - "stage2_expand_ms": 3032.4549674987793, - "stage3_cluster_ms": 1.1937618255615234, - "stage4_rerank_ms": 3402.9476642608643 - }, - "stage_counts": { - "stage1_candidates": 37, - "stage1_fallback_used": 1, - "stage2_expanded": 86, - "stage2_unique_paths": 53, - "stage2_duplicate_paths": 33, - "stage3_clustered": 20, - "stage3_strategy": "dir_rr", - "stage4_reranked": 20 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "class Config", - "latency_ms": 3175.0339000225067, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.05263157894736842, - "rbo_topk": 0.014635885139999999, - "staged_unique_files_topk": 10, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 8, - "dense_unique_dirs_topk": 4 - }, - { - "query": "def search", - "staged": { - "strategy": "staged", - "query": "def search", - "latency_ms": 8990.238099992275, - "num_results": 3, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 90.6367301940918, - "stage2_expand_ms": 6272.260665893555, - "stage3_cluster_ms": 0.00095367431640625, - "stage4_rerank_ms": 2531.4290523529053 - }, - "stage_counts": { - "stage1_candidates": 3, - "stage2_expanded": 4, - "stage2_unique_paths": 3, - "stage2_duplicate_paths": 1, - "stage3_clustered": 4, - "stage3_strategy": "dir_rr", - "stage4_reranked": 4 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "def search", - "latency_ms": 3434.4095999896526, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null 
- }, - "jaccard_topk": 0.09090909090909091, - "rbo_topk": 0.23541639942571424, - "staged_unique_files_topk": 2, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 2, - "dense_unique_dirs_topk": 4 - }, - { - "query": "LspBridge", - "staged": { - "strategy": "staged", - "query": "LspBridge", - "latency_ms": 9296.205000013113, - "num_results": 7, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\keepalive_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 86.64774894714355, - "stage1_fallback_search_ms": 163.8650894165039, - "stage2_expand_ms": 6144.1497802734375, - "stage3_cluster_ms": 0.4100799560546875, - "stage4_rerank_ms": 2807.274580001831 - }, - "stage_counts": { - "stage1_candidates": 5, - "stage1_fallback_used": 1, - "stage2_expanded": 31, - "stage2_unique_paths": 11, - "stage2_duplicate_paths": 20, - "stage3_clustered": 20, - "stage3_strategy": "dir_rr", - "stage4_reranked": 20 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "LspBridge", - "latency_ms": 3043.4417999982834, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.06666666666666667, - "rbo_topk": 0.045191399425714276, - "staged_unique_files_topk": 6, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 2, - "dense_unique_dirs_topk": 4 - }, - { - "query": "graph expansion", - "staged": { - "strategy": "staged", - "query": "graph expansion", - "latency_ms": 9086.15110000968, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 72.22437858581543, - "stage1_fallback_search_ms": 166.3804054260254, - "stage2_expand_ms": 6179.303169250488, - "stage3_cluster_ms": 0.00095367431640625, - "stage4_rerank_ms": 2575.9027004241943 - }, - "stage_counts": { - "stage1_candidates": 11, - "stage1_fallback_used": 1, - "stage2_expanded": 16, - "stage2_unique_paths": 13, - "stage2_duplicate_paths": 3, - "stage3_clustered": 16, - "stage3_strategy": "dir_rr", - "stage4_reranked": 16 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - 
"query": "graph expansion", - "latency_ms": 2793.8257000148296, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.1875, - "rbo_topk": 0.06134116970571428, - "staged_unique_files_topk": 9, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 7, - "dense_unique_dirs_topk": 4 - }, - { - "query": "clustering strategy", - "staged": { - "strategy": "staged", - "query": "clustering strategy", - "latency_ms": 8401.927499979734, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\__init__.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 72.67880439758301, - "stage1_fallback_search_ms": 166.71442985534668, - "stage2_expand_ms": 5561.89489364624, - "stage3_cluster_ms": 0.0007152557373046875, - "stage4_rerank_ms": 2517.7178382873535 - }, - "stage_counts": { - "stage1_candidates": 10, - "stage1_fallback_used": 1, - "stage2_expanded": 10, - "stage2_unique_paths": 10, - "stage2_duplicate_paths": 0, - "stage3_clustered": 10, - "stage3_strategy": "dir_rr", - "stage4_reranked": 10 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "clustering strategy", - "latency_ms": 3192.0045999884605, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.1111111111111111, - "rbo_topk": 0.04670528456571428, - "staged_unique_files_topk": 10, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 3, - "dense_unique_dirs_topk": 4 - }, - { - "query": "error handling", - "staged": { - "strategy": "staged", - "query": "error handling", - "latency_ms": 9032.269400000572, - "num_results": 6, - "topk_paths": [ - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 78.59635353088379, - "stage1_fallback_search_ms": 180.96280097961426, - "stage2_expand_ms": 6175.840377807617, - "stage3_cluster_ms": 0.001430511474609375, - "stage4_rerank_ms": 2503.4260749816895 - }, - "stage_counts": { - "stage1_candidates": 5, - "stage1_fallback_used": 1, - "stage2_expanded": 13, - "stage2_unique_paths": 6, - "stage2_duplicate_paths": 7, - "stage3_clustered": 13, - "stage3_strategy": "dir_rr", - "stage4_reranked": 13 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "error handling", - "latency_ms": 3076.744800001383, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.07142857142857142, - "rbo_topk": 0.045191399425714276, - "staged_unique_files_topk": 5, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 4, 
- "dense_unique_dirs_topk": 4 - }, - { - "query": "how to parse json", - "staged": { - "strategy": "staged", - "query": "how to parse json", - "latency_ms": 9134.205499976873, - "num_results": 7, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 117.79379844665527, - "stage1_fallback_search_ms": 187.53886222839355, - "stage2_expand_ms": 6218.849658966064, - "stage3_cluster_ms": 0.00095367431640625, - "stage4_rerank_ms": 2515.6633853912354 - }, - "stage_counts": { - "stage1_candidates": 4, - "stage1_fallback_used": 1, - "stage2_expanded": 9, - "stage2_unique_paths": 7, - "stage2_duplicate_paths": 2, - "stage3_clustered": 9, - "stage3_strategy": "dir_rr", - "stage4_reranked": 9 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "how to parse json", - "latency_ms": 2965.6026000082493, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.21428571428571427, - "rbo_topk": 0.18590219827714285, - "staged_unique_files_topk": 7, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 5, - "dense_unique_dirs_topk": 4 - } - ] -} \ No newline at end of file diff --git a/codex-lens/benchmarks/results/compare_2026-02-09_keepalive3.json b/codex-lens/benchmarks/results/compare_2026-02-09_keepalive3.json deleted file mode 100644 index 759bc32e..00000000 --- a/codex-lens/benchmarks/results/compare_2026-02-09_keepalive3.json +++ /dev/null @@ -1,171 +0,0 @@ -{ - "summary": { - "timestamp": "2026-02-09 19:16:45", - "source": "src", - "k": 10, - "coarse_k": 100, - "query_count": 3, - "avg_jaccard_topk": 0.07165641376167692, - "avg_rbo_topk": 0.10859973275904759, - "staged": { - "success": 3, - "avg_latency_ms": 7919.317766676347 - }, - "dense_rerank": { - "success": 3, - "avg_latency_ms": 2812.574933330218 - } - }, - "comparisons": [ - { - "query": "class Config", - "staged": { - "strategy": "staged", - "query": "class Config", - "latency_ms": 6351.961700022221, - "num_results": 37, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\litellm_reranker.py" - ], - "stage_stats": null, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "class Config", - "latency_ms": 4424.698300004005, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.05263157894736842, - "rbo_topk": 0.045191399425714276, - "staged_unique_files_topk": 10, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 5, - "dense_unique_dirs_topk": 4 - }, - { - "query": "def search", - "staged": { - "strategy": "staged", - "query": "def search", - "latency_ms": 17239.81479999423, - "num_results": 3, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 18.40996742248535, - "stage2_expand_ms": 16024.681329727173, - "stage3_cluster_ms": 0.00095367431640625, - "stage4_rerank_ms": 1160.1319313049316 - }, - "stage_counts": { - "stage1_candidates": 3, - "stage2_expanded": 4, - "stage2_unique_paths": 3, - "stage2_duplicate_paths": 1, - "stage3_clustered": 4, - "stage3_strategy": "score", - "stage4_reranked": 4 
- } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "def search", - "latency_ms": 2086.8772999942303, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.09090909090909091, - "rbo_topk": 0.23541639942571424, - "staged_unique_files_topk": 2, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 2, - "dense_unique_dirs_topk": 4 - }, - { - "query": "LspBridge", - "staged": { - "strategy": "staged", - "query": "LspBridge", - "latency_ms": 166.1768000125885, - "num_results": 5, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py" - ], - "stage_stats": null, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "LspBridge", - "latency_ms": 1926.1491999924183, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.07142857142857142, - "rbo_topk": 0.045191399425714276, - "staged_unique_files_topk": 5, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 2, - "dense_unique_dirs_topk": 4 - } - ] -} \ No newline at end of file diff --git a/codex-lens/benchmarks/results/compare_2026-02-09_keepalive3b.json b/codex-lens/benchmarks/results/compare_2026-02-09_keepalive3b.json deleted file mode 100644 index dfb1d3e1..00000000 --- a/codex-lens/benchmarks/results/compare_2026-02-09_keepalive3b.json +++ /dev/null @@ -1,171 +0,0 @@ -{ - "summary": { - "timestamp": "2026-02-09 19:19:13", - "source": "src", - "k": 10, - "coarse_k": 100, - "query_count": 3, - "avg_jaccard_topk": 0.07165641376167692, - "avg_rbo_topk": 0.10859973275904759, - "staged": { - "success": 3, - "avg_latency_ms": 8272.264699995518 - }, - "dense_rerank": { - "success": 3, - "avg_latency_ms": 2753.5123999913535 - } - }, - "comparisons": [ - { - "query": "class Config", - "staged": { - "strategy": "staged", - "query": "class Config", - "latency_ms": 6453.665100008249, - "num_results": 37, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\litellm_reranker.py" - ], - "stage_stats": null, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "class Config", - "latency_ms": 4530.146999955177, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.05263157894736842, - "rbo_topk": 0.045191399425714276, - "staged_unique_files_topk": 10, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 5, - "dense_unique_dirs_topk": 4 - }, - { - "query": "def search", - "staged": { - "strategy": "staged", - "query": "def search", - "latency_ms": 18202.905599981546, - "num_results": 3, - "topk_paths": [ - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 15.580177307128906, - "stage2_expand_ms": 16622.225522994995, - "stage3_cluster_ms": 0.00095367431640625, - "stage4_rerank_ms": 1516.9692039489746 - }, - "stage_counts": { - "stage1_candidates": 3, - "stage2_expanded": 4, - "stage2_unique_paths": 3, - "stage2_duplicate_paths": 1, - "stage3_clustered": 4, - "stage3_strategy": "score", - "stage4_reranked": 4 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "def search", - "latency_ms": 1746.9925000071526, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.09090909090909091, - "rbo_topk": 0.23541639942571424, - "staged_unique_files_topk": 2, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 2, - "dense_unique_dirs_topk": 4 - }, - { - "query": "LspBridge", - "staged": { - "strategy": "staged", - "query": "LspBridge", - "latency_ms": 160.2233999967575, - "num_results": 5, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py" - ], - "stage_stats": null, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "LspBridge", - "latency_ms": 1983.3977000117302, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.07142857142857142, - "rbo_topk": 0.045191399425714276, - "staged_unique_files_topk": 5, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 2, - "dense_unique_dirs_topk": 4 - } - ] -} \ No newline at end of file diff --git a/codex-lens/benchmarks/results/compare_2026-02-09_run2.json b/codex-lens/benchmarks/results/compare_2026-02-09_run2.json deleted file mode 100644 index 7dc36661..00000000 --- a/codex-lens/benchmarks/results/compare_2026-02-09_run2.json +++ /dev/null @@ -1,453 +0,0 @@ -{ - "summary": { - "timestamp": "2026-02-09 11:26:54", - "source": "src", - "k": 10, - "coarse_k": 100, - "query_count": 7, - "avg_jaccard_topk": 0.39589733329229126, - "avg_rbo_topk": 0.23139636799510202, 
- "staged": { - "success": 7, - "avg_latency_ms": 32194.107242865222 - }, - "dense_rerank": { - "success": 7, - "avg_latency_ms": 2643.366857132741 - } - }, - "comparisons": [ - { - "query": "class Config", - "staged": { - "strategy": "staged", - "query": "class Config", - "latency_ms": 43041.41250002384, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\file_watcher.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 9864.638805389404, - "stage2_expand_ms": 13012.29190826416, - "stage3_cluster_ms": 13297.565460205078, - "stage4_rerank_ms": 6821.892261505127 - }, - "stage_counts": { - "stage1_candidates": 100, - "stage2_expanded": 149, - "stage3_clustered": 20, - "stage4_reranked": 20 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "class Config", - "latency_ms": 3209.129799991846, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.1111111111111111, - "rbo_topk": 0.05429729885142857, - "staged_unique_files_topk": 10, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 8, - "dense_unique_dirs_topk": 4 - }, - { - "query": "def search", - "staged": { - "strategy": "staged", - "query": "def search", - "latency_ms": 37827.209600031376, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 531.8794250488281, - "stage2_expand_ms": 27009.481191635132, - "stage3_cluster_ms": 7948.509931564331, - "stage4_rerank_ms": 2268.9380645751953 - }, - "stage_counts": { - "stage1_candidates": 100, - "stage2_expanded": 101, - "stage3_clustered": 20, - "stage4_reranked": 20 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "def search", - "latency_ms": 2540.472400009632, - "num_results": 10, - "topk_paths": [ - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.26666666666666666, - "rbo_topk": 0.2983708721671428, - "staged_unique_files_topk": 9, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 4, - "dense_unique_dirs_topk": 4 - }, - { - "query": "LspBridge", - "staged": { - "strategy": "staged", - "query": "LspBridge", - "latency_ms": 24744.686599999666, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\merkle_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 517.8542137145996, - "stage2_expand_ms": 
12839.622735977173, - "stage3_cluster_ms": 9154.959678649902, - "stage4_rerank_ms": 2160.0701808929443 - }, - "stage_counts": { - "stage1_candidates": 100, - "stage2_expanded": 100, - "stage3_clustered": 20, - "stage4_reranked": 20 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "LspBridge", - "latency_ms": 2482.5908999741077, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.5384615384615384, - "rbo_topk": 0.36639083062285716, - "staged_unique_files_topk": 10, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 4, - "dense_unique_dirs_topk": 4 - }, - { - "query": "graph expansion", - "staged": { - "strategy": "staged", - "query": "graph expansion", - "latency_ms": 25239.59050002694, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 631.9081783294678, - "stage2_expand_ms": 12570.756196975708, - "stage3_cluster_ms": 9557.724952697754, - "stage4_rerank_ms": 2409.7683429718018 - }, - "stage_counts": { - "stage1_candidates": 100, - "stage2_expanded": 100, - "stage3_clustered": 20, - "stage4_reranked": 20 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "graph expansion", - "latency_ms": 2574.1938000023365, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.42857142857142855, - "rbo_topk": 0.13728894791142857, - "staged_unique_files_topk": 10, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 4, - "dense_unique_dirs_topk": 4 - }, - { - "query": "clustering strategy", - "staged": { - "strategy": "staged", - "query": "clustering 
strategy", - "latency_ms": 28572.93939998746, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 659.6193313598633, - "stage2_expand_ms": 14207.426309585571, - "stage3_cluster_ms": 11513.370037078857, - "stage4_rerank_ms": 2117.546319961548 - }, - "stage_counts": { - "stage1_candidates": 100, - "stage2_expanded": 100, - "stage3_clustered": 20, - "stage4_reranked": 20 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "clustering strategy", - "latency_ms": 2536.551799982786, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.17647058823529413, - "rbo_topk": 0.07116480920571429, - "staged_unique_files_topk": 10, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 4, - "dense_unique_dirs_topk": 4 - }, - { - "query": "error handling", - "staged": { - "strategy": "staged", - "query": "error handling", - "latency_ms": 23812.726000010967, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 475.42428970336914, - "stage2_expand_ms": 12454.935789108276, - "stage3_cluster_ms": 8576.019525527954, - "stage4_rerank_ms": 2265.360116958618 - }, - "stage_counts": { - "stage1_candidates": 100, - "stage2_expanded": 100, - "stage3_clustered": 20, - "stage4_reranked": 20 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "error handling", - "latency_ms": 2648.7773999869823, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.6666666666666666, - "rbo_topk": 0.21230026104857144, - "staged_unique_files_topk": 10, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 4, - "dense_unique_dirs_topk": 4 - }, - { - "query": "how to parse json", - "staged": { - "strategy": "staged", - "query": "how to parse json", - "latency_ms": 42120.1860999763, - "num_results": 9, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 570.8920955657959, - "stage2_expand_ms": 30054.06880378723, - "stage3_cluster_ms": 9285.51697731018, - "stage4_rerank_ms": 2142.771005630493 - }, - "stage_counts": { - "stage1_candidates": 100, - "stage2_expanded": 100, - "stage3_clustered": 20, - "stage4_reranked": 20 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": 
"dense_rerank", - "query": "how to parse json", - "latency_ms": 2511.8518999814987, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.5833333333333334, - "rbo_topk": 0.4799615561585714, - "staged_unique_files_topk": 9, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 4, - "dense_unique_dirs_topk": 4 - } - ] -} \ No newline at end of file diff --git a/codex-lens/benchmarks/results/compare_2026-02-09_score_fast3.json b/codex-lens/benchmarks/results/compare_2026-02-09_score_fast3.json deleted file mode 100644 index 7e0f1132..00000000 --- a/codex-lens/benchmarks/results/compare_2026-02-09_score_fast3.json +++ /dev/null @@ -1,208 +0,0 @@ -{ - "summary": { - "timestamp": "2026-02-09 17:27:26", - "source": "src", - "k": 10, - "coarse_k": 100, - "query_count": 3, - "avg_jaccard_topk": 0.5809523809523809, - "avg_rbo_topk": 0.31359567182809517, - "staged": { - "success": 3, - "avg_latency_ms": 22826.711433331173 - }, - "dense_rerank": { - "success": 3, - "avg_latency_ms": 2239.804533312718 - } - }, - "comparisons": [ - { - "query": "class Config", - "staged": { - "strategy": "staged", - "query": "class Config", - "latency_ms": 26690.878500014544, - "num_results": 6, - "topk_paths": [ - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 8534.121036529541, - "stage2_expand_ms": 13298.827648162842, - "stage3_cluster_ms": 0.026226043701171875, - "stage4_rerank_ms": 4805.774688720703 - }, - "stage_counts": { - "stage1_candidates": 100, - "stage2_expanded": 149, - "stage2_unique_paths": 43, - "stage2_duplicate_paths": 106, - "stage3_clustered": 20, - "stage3_strategy": "score", - "stage4_reranked": 20 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "class Config", - "latency_ms": 2416.653799980879, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.14285714285714285, - "rbo_topk": 0.25764429885142853, - "staged_unique_files_topk": 6, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 3, - "dense_unique_dirs_topk": 
4 - }, - { - "query": "def search", - "staged": { - "strategy": "staged", - "query": "def search", - "latency_ms": 26188.838399976492, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 525.7587432861328, - "stage2_expand_ms": 23659.400939941406, - "stage3_cluster_ms": 0.021696090698242188, - "stage4_rerank_ms": 1928.950309753418 - }, - "stage_counts": { - "stage1_candidates": 100, - "stage2_expanded": 101, - "stage2_unique_paths": 23, - "stage2_duplicate_paths": 78, - "stage3_clustered": 20, - "stage3_strategy": "score", - "stage4_reranked": 20 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "def search", - "latency_ms": 1953.0992999970913, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.9, - "rbo_topk": 0.39374892065285705, - "staged_unique_files_topk": 9, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 4, - "dense_unique_dirs_topk": 4 - }, - { - "query": "LspBridge", - "staged": { - "strategy": "staged", - "query": "LspBridge", - "latency_ms": 15600.41740000248, - "num_results": 7, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 475.54636001586914, - "stage2_expand_ms": 13318.811893463135, - "stage3_cluster_ms": 0.03218650817871094, - "stage4_rerank_ms": 1755.7547092437744 - }, - "stage_counts": { - "stage1_candidates": 100, - "stage2_expanded": 100, - "stage2_unique_paths": 21, - "stage2_duplicate_paths": 79, - "stage3_clustered": 20, - "stage3_strategy": "score", - "stage4_reranked": 20 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "LspBridge", - "latency_ms": 2349.660499960184, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.7, - "rbo_topk": 0.28939379598, - "staged_unique_files_topk": 7, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 3, - "dense_unique_dirs_topk": 4 - } - ] -} \ No newline at end of file diff --git a/codex-lens/benchmarks/results/compare_2026-02-09_score_fast4.json b/codex-lens/benchmarks/results/compare_2026-02-09_score_fast4.json deleted file mode 100644 index ef073667..00000000 --- a/codex-lens/benchmarks/results/compare_2026-02-09_score_fast4.json +++ /dev/null @@ -1,356 +0,0 @@ -{ - "summary": { - "timestamp": "2026-02-09 20:36:02", - "source": "src", - "k": 10, - "coarse_k": 100, - "query_count": 7, - "avg_jaccard_topk": 0.12095811211246858, - "avg_rbo_topk": 0.09594444061244897, - "staged": { - "success": 7, - "avg_latency_ms": 2436.7641000066483 - }, - "dense_rerank": { - "success": 7, - "avg_latency_ms": 2593.7630428629263 - } - }, - "comparisons": [ - { - "query": "class Config", - "staged": { - "strategy": "staged", - "query": "class Config", - "latency_ms": 285.091000020504, - "num_results": 37, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\litellm_reranker.py" - ], - "stage_stats": null, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "class Config", - "latency_ms": 2412.1290000081062, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.05263157894736842, - "rbo_topk": 0.045191399425714276, - "staged_unique_files_topk": 10, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 5, - "dense_unique_dirs_topk": 4 - }, - { - "query": "def search", - "staged": { - "strategy": "staged", - "query": "def search", - "latency_ms": 15029.73520001769, - "num_results": 3, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 101.95636749267578, - "stage2_expand_ms": 12690.008640289307, - 
"stage3_cluster_ms": 0.001430511474609375, - "stage4_rerank_ms": 2155.757427215576 - }, - "stage_counts": { - "stage1_candidates": 3, - "stage2_expanded": 4, - "stage2_unique_paths": 3, - "stage2_duplicate_paths": 1, - "stage3_clustered": 4, - "stage3_strategy": "score", - "stage4_reranked": 4 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "def search", - "latency_ms": 2424.7003000080585, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.09090909090909091, - "rbo_topk": 0.23541639942571424, - "staged_unique_files_topk": 2, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 2, - "dense_unique_dirs_topk": 4 - }, - { - "query": "LspBridge", - "staged": { - "strategy": "staged", - "query": "LspBridge", - "latency_ms": 324.4240999817848, - "num_results": 5, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py" - ], - 
"stage_stats": null, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "LspBridge", - "latency_ms": 2497.174100011587, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.07142857142857142, - "rbo_topk": 0.045191399425714276, - "staged_unique_files_topk": 5, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 2, - "dense_unique_dirs_topk": 4 - }, - { - "query": "graph expansion", - "staged": { - "strategy": "staged", - "query": "graph expansion", - "latency_ms": 359.32159999012947, - "num_results": 11, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\hybrid_search\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py" - ], - "stage_stats": null, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "graph expansion", - "latency_ms": 2553.8585999906063, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.17647058823529413, - "rbo_topk": 0.06801300374142856, - "staged_unique_files_topk": 10, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 7, - "dense_unique_dirs_topk": 4 - }, - { - "query": "clustering strategy", - "staged": { - "strategy": "staged", - "query": "clustering strategy", - "latency_ms": 286.38240000605583, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py" - ], - "stage_stats": null, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "clustering strategy", - "latency_ms": 2570.379099994898, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.1111111111111111, - "rbo_topk": 0.04670528456571428, - "staged_unique_files_topk": 10, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 3, - "dense_unique_dirs_topk": 4 - }, - { - "query": "error handling", - "staged": { - "strategy": "staged", - "query": "error handling", - "latency_ms": 412.58780002593994, - "num_results": 5, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py" - ], - "stage_stats": null, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "error handling", - "latency_ms": 2894.3279000222683, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.07142857142857142, - "rbo_topk": 0.045191399425714276, - "staged_unique_files_topk": 5, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 4, - "dense_unique_dirs_topk": 4 - }, - { - "query": "how to parse json", - "staged": { - "strategy": "staged", - "query": "how to parse json", - "latency_ms": 359.8066000044346, - "num_results": 4, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py" - ], - "stage_stats": null, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "how to parse json", - "latency_ms": 2803.772300004959, - "num_results": 10, - "topk_paths": [ - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.2727272727272727, - "rbo_topk": 0.18590219827714285, - "staged_unique_files_topk": 4, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 3, - "dense_unique_dirs_topk": 4 - } - ] -} \ No newline at end of file diff --git a/codex-lens/benchmarks/results/compare_2026-02-09_score_fast5.json b/codex-lens/benchmarks/results/compare_2026-02-09_score_fast5.json deleted file mode 100644 index dbea8924..00000000 --- a/codex-lens/benchmarks/results/compare_2026-02-09_score_fast5.json +++ /dev/null @@ -1,462 +0,0 @@ -{ - "summary": { - "timestamp": "2026-02-09 20:45:10", - "source": "src", - "k": 10, - "coarse_k": 100, - "query_count": 7, - "avg_jaccard_topk": 0.1283498247783962, - "avg_rbo_topk": 0.09664773770897958, - "staged": { - "success": 7, - "avg_latency_ms": 16394.152085712976 - }, - "dense_rerank": { - "success": 7, - "avg_latency_ms": 2839.464457145759 - } - }, - "comparisons": [ - { - "query": "class Config", - "staged": { - "strategy": "staged", - "query": "class Config", - "latency_ms": 6233.342700004578, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 125.80323219299316, - "stage1_fallback_search_ms": 277.1914005279541, - "stage2_expand_ms": 3032.3121547698975, - "stage3_cluster_ms": 0.02765655517578125, - "stage4_rerank_ms": 2699.3532180786133 - }, - "stage_counts": { - "stage1_candidates": 37, - "stage1_fallback_used": 1, - "stage2_expanded": 86, - "stage2_unique_paths": 53, - "stage2_duplicate_paths": 33, - "stage3_clustered": 20, - "stage3_strategy": "score", - "stage4_reranked": 20 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "class Config", - "latency_ms": 3036.3474999964237, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.125, - "rbo_topk": 
0.06741929885142856, - "staged_unique_files_topk": 8, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 5, - "dense_unique_dirs_topk": 4 - }, - { - "query": "def search", - "staged": { - "strategy": "staged", - "query": "def search", - "latency_ms": 12703.503900021315, - "num_results": 3, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 83.4202766418457, - "stage2_expand_ms": 9856.60433769226, - "stage3_cluster_ms": 0.0011920928955078125, - "stage4_rerank_ms": 2664.630174636841 - }, - "stage_counts": { - "stage1_candidates": 3, - "stage2_expanded": 4, - "stage2_unique_paths": 3, - "stage2_duplicate_paths": 1, - "stage3_clustered": 4, - "stage3_strategy": "score", - "stage4_reranked": 4 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "def search", - "latency_ms": 2888.501700013876, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.09090909090909091, - "rbo_topk": 0.23541639942571424, - "staged_unique_files_topk": 2, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 2, 
- "dense_unique_dirs_topk": 4 - }, - { - "query": "LspBridge", - "staged": { - "strategy": "staged", - "query": "LspBridge", - "latency_ms": 33684.76710000634, - "num_results": 5, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 78.8118839263916, - "stage1_fallback_search_ms": 174.6652126312256, - "stage2_expand_ms": 31018.909692764282, - "stage3_cluster_ms": 0.0016689300537109375, - "stage4_rerank_ms": 2316.9021606445312 - }, - "stage_counts": { - "stage1_candidates": 5, - "stage1_fallback_used": 1, - "stage2_expanded": 5, - "stage2_unique_paths": 5, - "stage2_duplicate_paths": 0, - "stage3_clustered": 5, - "stage3_strategy": "score", - "stage4_reranked": 5 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "LspBridge", - "latency_ms": 2824.729699999094, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - 
"stage_stats": null, - "error": null - }, - "jaccard_topk": 0.07142857142857142, - "rbo_topk": 0.045191399425714276, - "staged_unique_files_topk": 5, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 2, - "dense_unique_dirs_topk": 4 - }, - { - "query": "graph expansion", - "staged": { - "strategy": "staged", - "query": "graph expansion", - "latency_ms": 16910.090099990368, - "num_results": 8, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 99.6243953704834, - "stage1_fallback_search_ms": 207.89742469787598, - "stage2_expand_ms": 13929.257154464722, - "stage3_cluster_ms": 0.016927719116210938, - "stage4_rerank_ms": 2586.843729019165 - }, - "stage_counts": { - "stage1_candidates": 11, - "stage1_fallback_used": 1, - "stage2_expanded": 29, - "stage2_unique_paths": 14, - "stage2_duplicate_paths": 15, - "stage3_clustered": 20, - "stage3_strategy": "score", - "stage4_reranked": 20 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "graph expansion", - "latency_ms": 2765.958099991083, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.21428571428571427, - "rbo_topk": 0.06893318399142857, - "staged_unique_files_topk": 7, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 6, - "dense_unique_dirs_topk": 4 - }, - { - "query": "clustering strategy", - "staged": { - "strategy": "staged", - "query": "clustering strategy", - "latency_ms": 8380.20839998126, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\__init__.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 95.42632102966309, - "stage1_fallback_search_ms": 187.4692440032959, - "stage2_expand_ms": 5561.658143997192, - "stage3_cluster_ms": 0.0007152557373046875, - "stage4_rerank_ms": 2441.287040710449 - }, - "stage_counts": { - "stage1_candidates": 10, - 
"stage1_fallback_used": 1, - "stage2_expanded": 10, - "stage2_unique_paths": 10, - "stage2_duplicate_paths": 0, - "stage3_clustered": 10, - "stage3_strategy": "score", - "stage4_reranked": 10 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "clustering strategy", - "latency_ms": 2788.0665000081062, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.1111111111111111, - "rbo_topk": 0.04670528456571428, - "staged_unique_files_topk": 10, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 3, - "dense_unique_dirs_topk": 4 - }, - { - "query": "error handling", - "staged": { - "strategy": "staged", - "query": "error handling", - "latency_ms": 19897.71709999442, - "num_results": 6, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 114.1653060913086, - 
"stage1_fallback_search_ms": 235.73827743530273, - "stage2_expand_ms": 16702.077865600586, - "stage3_cluster_ms": 0.00095367431640625, - "stage4_rerank_ms": 2757.4093341827393 - }, - "stage_counts": { - "stage1_candidates": 5, - "stage1_fallback_used": 1, - "stage2_expanded": 13, - "stage2_unique_paths": 6, - "stage2_duplicate_paths": 7, - "stage3_clustered": 13, - "stage3_strategy": "score", - "stage4_reranked": 13 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "error handling", - "latency_ms": 2874.178600013256, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.07142857142857142, - "rbo_topk": 0.045191399425714276, - "staged_unique_files_topk": 5, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 4, - "dense_unique_dirs_topk": 4 - }, - { - "query": "how to parse json", - "staged": { - "strategy": "staged", - "query": "how to parse json", - "latency_ms": 16949.43529999256, - "num_results": 7, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 104.50935363769531, - "stage1_fallback_search_ms": 190.6723976135254, - "stage2_expand_ms": 14165.841102600098, - "stage3_cluster_ms": 0.0011920928955078125, - "stage4_rerank_ms": 2399.226188659668 - }, - "stage_counts": { - "stage1_candidates": 4, - "stage1_fallback_used": 1, - "stage2_expanded": 11, - "stage2_unique_paths": 7, - "stage2_duplicate_paths": 4, - "stage3_clustered": 11, - "stage3_strategy": "score", - "stage4_reranked": 11 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "how to parse json", - "latency_ms": 2698.469099998474, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.21428571428571427, - "rbo_topk": 0.16767719827714284, - "staged_unique_files_topk": 7, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 5, - "dense_unique_dirs_topk": 4 - } - ] -} \ No newline at end of 
file diff --git a/codex-lens/benchmarks/results/compare_2026-02-09_score_fast6.json b/codex-lens/benchmarks/results/compare_2026-02-09_score_fast6.json deleted file mode 100644 index d76156dc..00000000 --- a/codex-lens/benchmarks/results/compare_2026-02-09_score_fast6.json +++ /dev/null @@ -1,465 +0,0 @@ -{ - "summary": { - "timestamp": "2026-02-09 20:53:01", - "source": "src", - "k": 10, - "coarse_k": 100, - "query_count": 7, - "avg_jaccard_topk": 0.12384302205730777, - "avg_rbo_topk": 0.09816673566816325, - "staged": { - "success": 7, - "avg_latency_ms": 8696.564499999795 - }, - "dense_rerank": { - "success": 7, - "avg_latency_ms": 2936.2583857136115 - } - }, - "comparisons": [ - { - "query": "class Config", - "staged": { - "strategy": "staged", - "query": "class Config", - "latency_ms": 6108.304299980402, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 90.47985076904297, - "stage1_fallback_search_ms": 224.38788414001465, - "stage2_expand_ms": 3031.7258834838867, - "stage3_cluster_ms": 0.02956390380859375, - "stage4_rerank_ms": 2655.31849861145 - }, - "stage_counts": { - "stage1_candidates": 37, - "stage1_fallback_used": 1, - "stage2_expanded": 86, - "stage2_unique_paths": 53, - "stage2_duplicate_paths": 33, - "stage3_clustered": 20, - "stage3_strategy": "score", - "stage4_reranked": 20 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - 
"query": "class Config", - "latency_ms": 2873.6466999948025, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.125, - "rbo_topk": 0.06741929885142856, - "staged_unique_files_topk": 8, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 5, - "dense_unique_dirs_topk": 4 - }, - { - "query": "def search", - "staged": { - "strategy": "staged", - "query": "def search", - "latency_ms": 9321.754200011492, - "num_results": 3, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 140.43283462524414, - "stage2_expand_ms": 6410.467863082886, - "stage3_cluster_ms": 0.00095367431640625, - "stage4_rerank_ms": 2675.7972240448 - }, - "stage_counts": { - "stage1_candidates": 3, - "stage2_expanded": 4, - "stage2_unique_paths": 3, - "stage2_duplicate_paths": 1, - "stage3_clustered": 4, - "stage3_strategy": "score", - "stage4_reranked": 4 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "def search", - "latency_ms": 3104.7773999869823, - "num_results": 10, - "topk_paths": [ - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.09090909090909091, - "rbo_topk": 0.23541639942571424, - "staged_unique_files_topk": 2, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 2, - "dense_unique_dirs_topk": 4 - }, - { - "query": "LspBridge", - "staged": { - "strategy": "staged", - "query": "LspBridge", - "latency_ms": 9527.073799997568, - "num_results": 6, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\keepalive_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 98.59919548034668, - "stage1_fallback_search_ms": 172.26457595825195, - "stage2_expand_ms": 6125.282049179077, - "stage3_cluster_ms": 0.017404556274414062, - "stage4_rerank_ms": 3023.9248275756836 - }, - "stage_counts": { - "stage1_candidates": 5, - "stage1_fallback_used": 1, - "stage2_expanded": 31, - "stage2_unique_paths": 11, - 
"stage2_duplicate_paths": 20, - "stage3_clustered": 20, - "stage3_strategy": "score", - "stage4_reranked": 20 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "LspBridge", - "latency_ms": 2901.0302999913692, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.06666666666666667, - "rbo_topk": 0.045191399425714276, - "staged_unique_files_topk": 6, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 2, - "dense_unique_dirs_topk": 4 - }, - { - "query": "graph expansion", - "staged": { - "strategy": "staged", - "query": "graph expansion", - "latency_ms": 9120.886200010777, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 91.48454666137695, - "stage1_fallback_search_ms": 172.12390899658203, - "stage2_expand_ms": 6166.24903678894, - "stage3_cluster_ms": 0.00095367431640625, - "stage4_rerank_ms": 2601.947546005249 - }, - "stage_counts": { - "stage1_candidates": 11, - "stage1_fallback_used": 1, - "stage2_expanded": 16, - "stage2_unique_paths": 13, - "stage2_duplicate_paths": 3, - "stage3_clustered": 16, - "stage3_strategy": "score", - "stage4_reranked": 16 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "graph expansion", - "latency_ms": 2847.6964999735355, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.1875, - "rbo_topk": 0.06134116970571428, - "staged_unique_files_topk": 9, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 7, - "dense_unique_dirs_topk": 4 - }, - { - "query": "clustering strategy", - "staged": { - "strategy": "staged", - "query": "clustering strategy", - "latency_ms": 8424.535699993372, - "num_results": 10, - "topk_paths": [ - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\__init__.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 92.8945541381836, - "stage1_fallback_search_ms": 192.06547737121582, - "stage2_expand_ms": 5568.126440048218, - "stage3_cluster_ms": 0.0011920928955078125, - "stage4_rerank_ms": 2480.673313140869 - }, - "stage_counts": { - "stage1_candidates": 10, - "stage1_fallback_used": 1, - "stage2_expanded": 10, - "stage2_unique_paths": 10, - "stage2_duplicate_paths": 0, - "stage3_clustered": 10, - "stage3_strategy": "score", - "stage4_reranked": 10 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "clustering strategy", - "latency_ms": 2974.9999000132084, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.1111111111111111, - "rbo_topk": 0.04670528456571428, - "staged_unique_files_topk": 10, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 3, - "dense_unique_dirs_topk": 4 - }, - { - "query": "error handling", - "staged": { - "strategy": "staged", - "query": "error handling", - "latency_ms": 9253.624700009823, - "num_results": 6, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 102.18691825866699, - "stage1_fallback_search_ms": 176.97691917419434, - "stage2_expand_ms": 6113.626480102539, - "stage3_cluster_ms": 0.00095367431640625, - "stage4_rerank_ms": 2774.4452953338623 - }, - "stage_counts": { - "stage1_candidates": 5, - "stage1_fallback_used": 1, - "stage2_expanded": 13, - "stage2_unique_paths": 6, - "stage2_duplicate_paths": 7, - "stage3_clustered": 13, - "stage3_strategy": "score", - "stage4_reranked": 13 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "error handling", - "latency_ms": 2860.619900047779, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.07142857142857142, - "rbo_topk": 0.045191399425714276, - "staged_unique_files_topk": 5, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 4, - "dense_unique_dirs_topk": 4 - }, - { - "query": "how to parse json", - "staged": { - "strategy": "staged", - "query": "how to parse json", - "latency_ms": 9119.772599995136, - "num_results": 7, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 90.18850326538086, - "stage1_fallback_search_ms": 157.95397758483887, - "stage2_expand_ms": 6293.469429016113, - "stage3_cluster_ms": 0.0011920928955078125, - "stage4_rerank_ms": 2486.8383407592773 - }, - "stage_counts": { - "stage1_candidates": 4, - "stage1_fallback_used": 1, - "stage2_expanded": 9, - "stage2_unique_paths": 7, - "stage2_duplicate_paths": 2, - "stage3_clustered": 9, - "stage3_strategy": "score", - "stage4_reranked": 9 - } - }, - "error": null - }, 
- "dense_rerank": { - "strategy": "dense_rerank", - "query": "how to parse json", - "latency_ms": 2991.0379999876022, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.21428571428571427, - "rbo_topk": 0.18590219827714285, - "staged_unique_files_topk": 7, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 5, - "dense_unique_dirs_topk": 4 - } - ] -} \ No newline at end of file diff --git a/codex-lens/benchmarks/results/compare_2026-02-09_score_fast7.json b/codex-lens/benchmarks/results/compare_2026-02-09_score_fast7.json deleted file mode 100644 index e8cb30da..00000000 --- a/codex-lens/benchmarks/results/compare_2026-02-09_score_fast7.json +++ /dev/null @@ -1,465 +0,0 @@ -{ - "summary": { - "timestamp": "2026-02-10 12:23:36", - "source": "src", - "k": 10, - "coarse_k": 100, - "query_count": 7, - "avg_jaccard_topk": 0.12384302205730777, - "avg_rbo_topk": 0.09816673566816325, - "staged": { - "success": 7, - "avg_latency_ms": 3996.4113285754406 - }, - "dense_rerank": { - "success": 7, - "avg_latency_ms": 2780.485200004918 - } - }, - "comparisons": [ - { - "query": "class Config", - "staged": { - "strategy": "staged", - "query": "class Config", - "latency_ms": 2365.3048999905586, - 
"num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 25.228023529052734, - "stage1_fallback_search_ms": 206.0999870300293, - "stage2_expand_ms": 16.644954681396484, - "stage3_cluster_ms": 0.025987625122070312, - "stage4_rerank_ms": 2064.2504692077637 - }, - "stage_counts": { - "stage1_candidates": 37, - "stage1_fallback_used": 1, - "stage2_expanded": 86, - "stage2_unique_paths": 53, - "stage2_duplicate_paths": 33, - "stage3_clustered": 20, - "stage3_strategy": "score", - "stage4_reranked": 20 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "class Config", - "latency_ms": 2610.047899991274, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.125, - "rbo_topk": 0.06741929885142856, - "staged_unique_files_topk": 8, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 5, - "dense_unique_dirs_topk": 4 - }, - { - "query": "def search", - "staged": { - "strategy": "staged", - "query": "def search", - "latency_ms": 3723.305599987507, - "num_results": 3, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 31.742334365844727, - "stage2_expand_ms": 2125.1025199890137, - "stage3_cluster_ms": 0.0007152557373046875, - "stage4_rerank_ms": 1511.4071369171143 - }, - "stage_counts": { - "stage1_candidates": 3, - "stage2_expanded": 4, - "stage2_unique_paths": 3, - "stage2_duplicate_paths": 1, - "stage3_clustered": 4, - "stage3_strategy": "score", - "stage4_reranked": 4 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "def search", - "latency_ms": 2072.4792000055313, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - 
"jaccard_topk": 0.09090909090909091, - "rbo_topk": 0.23541639942571424, - "staged_unique_files_topk": 2, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 2, - "dense_unique_dirs_topk": 4 - }, - { - "query": "LspBridge", - "staged": { - "strategy": "staged", - "query": "LspBridge", - "latency_ms": 5251.151299983263, - "num_results": 6, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\keepalive_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 32.721757888793945, - "stage1_fallback_search_ms": 195.51420211791992, - "stage2_expand_ms": 2060.0733757019043, - "stage3_cluster_ms": 0.0095367431640625, - "stage4_rerank_ms": 2900.8395671844482 - }, - "stage_counts": { - "stage1_candidates": 5, - "stage1_fallback_used": 1, - "stage2_expanded": 31, - "stage2_unique_paths": 11, - "stage2_duplicate_paths": 20, - "stage3_clustered": 20, - "stage3_strategy": "score", - "stage4_reranked": 20 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "LspBridge", - "latency_ms": 1972.8982000350952, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.06666666666666667, - "rbo_topk": 0.045191399425714276, - "staged_unique_files_topk": 6, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 2, - "dense_unique_dirs_topk": 4 - }, - { - "query": "graph expansion", - "staged": { - "strategy": "staged", - "query": "graph expansion", - "latency_ms": 4101.171400010586, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 29.141902923583984, - "stage1_fallback_search_ms": 234.2982292175293, - "stage2_expand_ms": 2082.4878215789795, - "stage3_cluster_ms": 0.0011920928955078125, - "stage4_rerank_ms": 1698.7183094024658 - }, - "stage_counts": { - "stage1_candidates": 11, - "stage1_fallback_used": 1, - "stage2_expanded": 16, - "stage2_unique_paths": 13, - "stage2_duplicate_paths": 3, - "stage3_clustered": 16, - "stage3_strategy": "score", - "stage4_reranked": 16 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - 
"query": "graph expansion", - "latency_ms": 2331.9747000038624, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.1875, - "rbo_topk": 0.06134116970571428, - "staged_unique_files_topk": 9, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 7, - "dense_unique_dirs_topk": 4 - }, - { - "query": "clustering strategy", - "staged": { - "strategy": "staged", - "query": "clustering strategy", - "latency_ms": 4032.0041000247, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\__init__.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 42.098283767700195, - "stage1_fallback_search_ms": 209.6574306488037, - "stage2_expand_ms": 2053.9097785949707, - "stage3_cluster_ms": 0.00095367431640625, - "stage4_rerank_ms": 1665.3883457183838 - }, - "stage_counts": { - "stage1_candidates": 10, - "stage1_fallback_used": 1, - "stage2_expanded": 10, - "stage2_unique_paths": 10, - "stage2_duplicate_paths": 0, - "stage3_clustered": 10, - "stage3_strategy": "score", - "stage4_reranked": 10 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "clustering strategy", - "latency_ms": 2026.5661999881268, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.1111111111111111, - "rbo_topk": 0.04670528456571428, - "staged_unique_files_topk": 10, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 3, - "dense_unique_dirs_topk": 4 - }, - { - "query": "error handling", - "staged": { - "strategy": "staged", - "query": "error handling", - "latency_ms": 4237.893900036812, - "num_results": 6, - "topk_paths": [ - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 64.01538848876953, - "stage1_fallback_search_ms": 225.14033317565918, - "stage2_expand_ms": 2116.3012981414795, - "stage3_cluster_ms": 0.0007152557373046875, - "stage4_rerank_ms": 1776.0803699493408 - }, - "stage_counts": { - "stage1_candidates": 5, - "stage1_fallback_used": 1, - "stage2_expanded": 13, - "stage2_unique_paths": 6, - "stage2_duplicate_paths": 7, - "stage3_clustered": 13, - "stage3_strategy": "score", - "stage4_reranked": 13 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "error handling", - "latency_ms": 2125.935900002718, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.07142857142857142, - "rbo_topk": 0.045191399425714276, - "staged_unique_files_topk": 5, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 
4, - "dense_unique_dirs_topk": 4 - }, - { - "query": "how to parse json", - "staged": { - "strategy": "staged", - "query": "how to parse json", - "latency_ms": 4264.048099994659, - "num_results": 7, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 31.972646713256836, - "stage1_fallback_search_ms": 235.47840118408203, - "stage2_expand_ms": 2161.5889072418213, - "stage3_cluster_ms": 0.0007152557373046875, - "stage4_rerank_ms": 1768.0847644805908 - }, - "stage_counts": { - "stage1_candidates": 4, - "stage1_fallback_used": 1, - "stage2_expanded": 9, - "stage2_unique_paths": 7, - "stage2_duplicate_paths": 2, - "stage3_clustered": 9, - "stage3_strategy": "score", - "stage4_reranked": 9 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "how to parse json", - "latency_ms": 6323.49430000782, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.21428571428571427, - "rbo_topk": 0.18590219827714285, - "staged_unique_files_topk": 7, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 5, - "dense_unique_dirs_topk": 4 - } - ] -} \ No newline at end of file diff --git a/codex-lens/benchmarks/results/compare_2026-02-10_dir_rr_fast7.json b/codex-lens/benchmarks/results/compare_2026-02-10_dir_rr_fast7.json deleted file mode 100644 index 5a176d82..00000000 --- a/codex-lens/benchmarks/results/compare_2026-02-10_dir_rr_fast7.json +++ /dev/null @@ -1,467 +0,0 @@ -{ - "summary": { - "timestamp": "2026-02-10 12:46:47", - "source": "src", - "k": 10, - "coarse_k": 100, - "query_count": 7, - "avg_jaccard_topk": 0.11350467619264612, - "avg_rbo_topk": 0.09062624799510204, - "staged": { - "success": 7, - "avg_latency_ms": 5670.9065000244545 - }, - "dense_rerank": { - "success": 7, - "avg_latency_ms": 3047.475757143327 - } - }, - "comparisons": [ - { - "query": "class Config", - "staged": { - "strategy": "staged", - "query": "class Config", - "latency_ms": 2971.5892000496387, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\file_watcher.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 108.11758041381836, - "stage1_fallback_search_ms": 230.96132278442383, - "stage2_expand_ms": 18.60976219177246, - "stage3_cluster_ms": 1.100301742553711, - "stage4_rerank_ms": 2528.761625289917 - }, - "stage_counts": { - "stage1_candidates": 37, - "stage1_fallback_used": 1, - "stage2_expanded": 86, - "stage2_unique_paths": 53, - "stage2_duplicate_paths": 33, - "stage3_clustered": 20, - "stage3_strategy": "dir_rr", - "stage4_reranked": 20 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "class Config", - "latency_ms": 2937.113800019026, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.05263157894736842, - "rbo_topk": 0.014635885139999999, - "staged_unique_files_topk": 10, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 8, - "dense_unique_dirs_topk": 4 - }, - { - "query": "def search", - "staged": { - "strategy": "staged", - "query": "def search", - "latency_ms": 10065.153400033712, - "num_results": 3, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 127.17461585998535, - "stage2_expand_ms": 7361.833810806274, - "stage3_cluster_ms": 0.001430511474609375, - "stage4_rerank_ms": 2472.7542400360107 - }, - "stage_counts": { - "stage1_candidates": 3, - "stage2_expanded": 4, - "stage2_unique_paths": 3, - "stage2_duplicate_paths": 1, - "stage3_clustered": 4, - "stage3_strategy": "dir_rr", - "stage4_reranked": 4 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "def search", - "latency_ms": 3059.5018000006676, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.09090909090909091, - "rbo_topk": 0.23541639942571424, - "staged_unique_files_topk": 2, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 2, - "dense_unique_dirs_topk": 4 - }, - { - "query": "LspBridge", - "staged": { - "strategy": "staged", - "query": "LspBridge", - "latency_ms": 5557.314100056887, - "num_results": 7, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\keepalive_bridge.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 133.9263916015625, - "stage1_fallback_search_ms": 242.1243190765381, - "stage2_expand_ms": 2106.602430343628, - "stage3_cluster_ms": 0.47016143798828125, - "stage4_rerank_ms": 2967.3829078674316 - }, - "stage_counts": { - "stage1_candidates": 5, - "stage1_fallback_used": 1, - "stage2_expanded": 31, - "stage2_unique_paths": 11, - "stage2_duplicate_paths": 20, - "stage3_clustered": 20, - "stage3_strategy": "dir_rr", - "stage4_reranked": 20 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "LspBridge", - "latency_ms": 3157.7918999791145, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.06666666666666667, - "rbo_topk": 0.045191399425714276, - "staged_unique_files_topk": 6, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 2, - "dense_unique_dirs_topk": 4 - }, - { - "query": "graph expansion", - 
"staged": { - "strategy": "staged", - "query": "graph expansion", - "latency_ms": 5458.670999974012, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 113.62957954406738, - "stage1_fallback_search_ms": 204.56886291503906, - "stage2_expand_ms": 2166.4509773254395, - "stage3_cluster_ms": 0.0011920928955078125, - "stage4_rerank_ms": 2872.969627380371 - }, - "stage_counts": { - "stage1_candidates": 11, - "stage1_fallback_used": 1, - "stage2_expanded": 16, - "stage2_unique_paths": 13, - "stage2_duplicate_paths": 3, - "stage3_clustered": 16, - "stage3_strategy": "dir_rr", - "stage4_reranked": 16 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "graph expansion", - "latency_ms": 2896.5341999828815, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.1875, - "rbo_topk": 0.06134116970571428, - "staged_unique_files_topk": 9, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 7, - "dense_unique_dirs_topk": 4 - }, - { - "query": "clustering strategy", - "staged": { - "strategy": "staged", - "query": "clustering strategy", - "latency_ms": 5028.861099988222, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\__init__.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 111.71293258666992, - "stage1_fallback_search_ms": 192.02208518981934, - "stage2_expand_ms": 2054.065465927124, - "stage3_cluster_ms": 0.0007152557373046875, - "stage4_rerank_ms": 2579.0507793426514 - }, - "stage_counts": { - "stage1_candidates": 10, - "stage1_fallback_used": 1, - "stage2_expanded": 10, - "stage2_unique_paths": 10, - "stage2_duplicate_paths": 0, - "stage3_clustered": 10, - "stage3_strategy": 
"dir_rr", - "stage4_reranked": 10 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "clustering strategy", - "latency_ms": 3627.1755999922752, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.1111111111111111, - "rbo_topk": 0.04670528456571428, - "staged_unique_files_topk": 10, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 3, - "dense_unique_dirs_topk": 4 - }, - { - "query": "error handling", - "staged": { - "strategy": "staged", - "query": "error handling", - "latency_ms": 5114.356300055981, - "num_results": 6, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 135.76626777648926, - "stage1_fallback_search_ms": 211.12942695617676, - "stage2_expand_ms": 2151.059150695801, - "stage3_cluster_ms": 0.00095367431640625, - "stage4_rerank_ms": 
2519.892692565918 - }, - "stage_counts": { - "stage1_candidates": 5, - "stage1_fallback_used": 1, - "stage2_expanded": 13, - "stage2_unique_paths": 6, - "stage2_duplicate_paths": 7, - "stage3_clustered": 13, - "stage3_strategy": "dir_rr", - "stage4_reranked": 13 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "error handling", - "latency_ms": 2853.594000041485, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.07142857142857142, - "rbo_topk": 0.045191399425714276, - "staged_unique_files_topk": 5, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 4, - "dense_unique_dirs_topk": 4 - }, - { - "query": "how to parse json", - "staged": { - "strategy": "staged", - "query": "how to parse json", - "latency_ms": 5500.400400012732, - "num_results": 7, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 96.66872024536133, - "stage1_fallback_search_ms": 176.37205123901367, - "stage2_expand_ms": 2137.751340866089, - "stage3_cluster_ms": 0.00095367431640625, - "stage4_rerank_ms": 2991.840124130249 - }, - "stage_counts": { - "stage1_candidates": 4, - "stage1_fallback_used": 1, - "stage2_expanded": 9, - "stage2_unique_paths": 7, - "stage2_duplicate_paths": 2, - "stage3_clustered": 9, - "stage3_strategy": "dir_rr", - "stage4_reranked": 9 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "how to parse json", - "latency_ms": 2800.6189999878407, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.21428571428571427, - "rbo_topk": 0.18590219827714285, - "staged_unique_files_topk": 7, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 5, - "dense_unique_dirs_topk": 4 - } - ] -} \ No newline at end of file diff --git a/codex-lens/benchmarks/results/compare_2026-02-10_path_fast7.json 
b/codex-lens/benchmarks/results/compare_2026-02-10_path_fast7.json deleted file mode 100644 index 2038c691..00000000 --- a/codex-lens/benchmarks/results/compare_2026-02-10_path_fast7.json +++ /dev/null @@ -1,465 +0,0 @@ -{ - "summary": { - "timestamp": "2026-02-10 12:52:44", - "source": "src", - "k": 10, - "coarse_k": 100, - "query_count": 7, - "avg_jaccard_topk": 0.13455730777159347, - "avg_rbo_topk": 0.10274807844326529, - "staged": { - "success": 7, - "avg_latency_ms": 4445.262371412346 - }, - "dense_rerank": { - "success": 7, - "avg_latency_ms": 3327.1750857276575 - } - }, - "comparisons": [ - { - "query": "class Config", - "staged": { - "strategy": "staged", - "query": "class Config", - "latency_ms": 2719.7998999655247, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 33.12373161315918, - "stage1_fallback_search_ms": 230.31878471374512, - "stage2_expand_ms": 22.444486618041992, - "stage3_cluster_ms": 0.06079673767089844, - "stage4_rerank_ms": 2338.5443687438965 - }, - "stage_counts": { - "stage1_candidates": 37, - "stage1_fallback_used": 1, - "stage2_expanded": 86, - "stage2_unique_paths": 53, - "stage2_duplicate_paths": 33, - "stage3_clustered": 20, - "stage3_strategy": "path", - "stage4_reranked": 20 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "class Config", - "latency_ms": 2334.8668000102043, - 
"num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.2, - "rbo_topk": 0.09948869827714285, - "staged_unique_files_topk": 8, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 4, - "dense_unique_dirs_topk": 4 - }, - { - "query": "def search", - "staged": { - "strategy": "staged", - "query": "def search", - "latency_ms": 4470.056899994612, - "num_results": 3, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 28.5646915435791, - "stage2_expand_ms": 2216.57133102417, - "stage3_cluster_ms": 0.001430511474609375, - "stage4_rerank_ms": 2131.246566772461 - }, - "stage_counts": { - "stage1_candidates": 3, - "stage2_expanded": 4, - "stage2_unique_paths": 3, - "stage2_duplicate_paths": 1, - "stage3_clustered": 4, - "stage3_strategy": "path", - "stage4_reranked": 4 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "def search", - "latency_ms": 2447.341199964285, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.09090909090909091, - "rbo_topk": 0.23541639942571424, - "staged_unique_files_topk": 2, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 2, - "dense_unique_dirs_topk": 4 - }, - { - "query": "LspBridge", - "staged": { - "strategy": "staged", - "query": "LspBridge", - "latency_ms": 6126.65680000186, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\keepalive_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 25.135278701782227, - "stage1_fallback_search_ms": 171.53453826904297, - "stage2_expand_ms": 2094.9013233184814, - "stage3_cluster_ms": 0.024318695068359375, - "stage4_rerank_ms": 3743.204355239868 - }, - "stage_counts": { - "stage1_candidates": 5, - "stage1_fallback_used": 1, - "stage2_expanded": 31, - "stage2_unique_paths": 11, - "stage2_duplicate_paths": 20, - "stage3_clustered": 11, - "stage3_strategy": "path", 
- "stage4_reranked": 11 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "LspBridge", - "latency_ms": 9015.508300036192, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.06666666666666667, - "rbo_topk": 0.045191399425714276, - "staged_unique_files_topk": 6, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 2, - "dense_unique_dirs_topk": 4 - }, - { - "query": "graph expansion", - "staged": { - "strategy": "staged", - "query": "graph expansion", - "latency_ms": 4319.597599953413, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 18.799781799316406, - "stage1_fallback_search_ms": 167.36602783203125, - "stage2_expand_ms": 2101.4957427978516, - "stage3_cluster_ms": 0.00095367431640625, - "stage4_rerank_ms": 1976.8805503845215 - }, - "stage_counts": { - "stage1_candidates": 11, - "stage1_fallback_used": 1, - "stage2_expanded": 16, - "stage2_unique_paths": 13, - "stage2_duplicate_paths": 3, - "stage3_clustered": 16, - "stage3_strategy": "path", - "stage4_reranked": 16 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "graph expansion", - "latency_ms": 2356.994699984789, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.1875, - "rbo_topk": 0.06134116970571428, - "staged_unique_files_topk": 9, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 7, - "dense_unique_dirs_topk": 4 - }, - { - "query": "clustering strategy", - "staged": { - "strategy": "staged", - "query": "clustering strategy", - "latency_ms": 4574.691199988127, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\__init__.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 45.72629928588867, - "stage1_fallback_search_ms": 233.0036163330078, - "stage2_expand_ms": 2068.8536167144775, - "stage3_cluster_ms": 0.0007152557373046875, - "stage4_rerank_ms": 2152.9064178466797 - }, - "stage_counts": { - "stage1_candidates": 10, - "stage1_fallback_used": 1, - "stage2_expanded": 10, - "stage2_unique_paths": 10, - "stage2_duplicate_paths": 0, - "stage3_clustered": 10, - "stage3_strategy": "path", - "stage4_reranked": 10 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "clustering strategy", - "latency_ms": 2311.4787000119686, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.1111111111111111, - "rbo_topk": 0.04670528456571428, - "staged_unique_files_topk": 10, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 3, - "dense_unique_dirs_topk": 4 - }, - { - "query": "error handling", - "staged": { - "strategy": "staged", - "query": "error handling", - "latency_ms": 4616.5374999940395, - "num_results": 6, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 38.83004188537598, - "stage1_fallback_search_ms": 263.0441188812256, - "stage2_expand_ms": 2070.7976818084717, - "stage3_cluster_ms": 0.00095367431640625, - "stage4_rerank_ms": 2133.629083633423 - }, - "stage_counts": { - "stage1_candidates": 5, - "stage1_fallback_used": 1, - "stage2_expanded": 13, - "stage2_unique_paths": 6, - "stage2_duplicate_paths": 7, - "stage3_clustered": 13, - "stage3_strategy": "path", - "stage4_reranked": 13 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "error handling", - "latency_ms": 2337.4413000643253, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.07142857142857142, - "rbo_topk": 0.045191399425714276, - "staged_unique_files_topk": 5, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 4, - "dense_unique_dirs_topk": 4 - }, - { - "query": "how to parse json", - "staged": { - "strategy": "staged", - "query": "how to parse json", - "latency_ms": 4289.496699988842, - "num_results": 7, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 34.40546989440918, - "stage1_fallback_search_ms": 231.8587303161621, - "stage2_expand_ms": 2068.8445568084717, - "stage3_cluster_ms": 0.0007152557373046875, - "stage4_rerank_ms": 1850.6083488464355 - }, - "stage_counts": { - "stage1_candidates": 4, - "stage1_fallback_used": 1, - "stage2_expanded": 9, - "stage2_unique_paths": 7, - "stage2_duplicate_paths": 2, - "stage3_clustered": 9, - "stage3_strategy": "path", - "stage4_reranked": 9 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "how 
to parse json", - "latency_ms": 2486.594600021839, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.21428571428571427, - "rbo_topk": 0.18590219827714285, - "staged_unique_files_topk": 7, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 5, - "dense_unique_dirs_topk": 4 - } - ] -} \ No newline at end of file diff --git a/codex-lens/benchmarks/results/compare_2026-02-10_score_fast7.json b/codex-lens/benchmarks/results/compare_2026-02-10_score_fast7.json deleted file mode 100644 index 34cb9d64..00000000 --- a/codex-lens/benchmarks/results/compare_2026-02-10_score_fast7.json +++ /dev/null @@ -1,465 +0,0 @@ -{ - "summary": { - "timestamp": "2026-02-10 12:44:24", - "source": "src", - "k": 10, - "coarse_k": 100, - "query_count": 7, - "avg_jaccard_topk": 0.12384302205730777, - "avg_rbo_topk": 0.09816673566816325, - "staged": { - "success": 7, - "avg_latency_ms": 4603.035771421024 - }, - "dense_rerank": { - "success": 7, - "avg_latency_ms": 2776.139728575945 - } - }, - "comparisons": [ - { - "query": "class Config", - "staged": { - "strategy": "staged", - "query": "class Config", - "latency_ms": 3544.4309000074863, - "num_results": 10, - "topk_paths": [ - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 34.082651138305664, - "stage1_fallback_search_ms": 217.52095222473145, - "stage2_expand_ms": 18.847942352294922, - "stage3_cluster_ms": 0.031948089599609375, - "stage4_rerank_ms": 3176.4564514160156 - }, - "stage_counts": { - "stage1_candidates": 37, - "stage1_fallback_used": 1, - "stage2_expanded": 86, - "stage2_unique_paths": 53, - "stage2_duplicate_paths": 33, - "stage3_clustered": 20, - "stage3_strategy": "score", - "stage4_reranked": 20 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "class Config", - "latency_ms": 3075.5329999923706, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py" - ], - "stage_stats": 
null, - "error": null - }, - "jaccard_topk": 0.125, - "rbo_topk": 0.06741929885142856, - "staged_unique_files_topk": 8, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 5, - "dense_unique_dirs_topk": 4 - }, - { - "query": "def search", - "staged": { - "strategy": "staged", - "query": "def search", - "latency_ms": 4371.493600010872, - "num_results": 3, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 29.517173767089844, - "stage2_expand_ms": 2236.224412918091, - "stage3_cluster_ms": 0.0007152557373046875, - "stage4_rerank_ms": 1998.866319656372 - }, - "stage_counts": { - "stage1_candidates": 3, - "stage2_expanded": 4, - "stage2_unique_paths": 3, - "stage2_duplicate_paths": 1, - "stage3_clustered": 4, - "stage3_strategy": "score", - "stage4_reranked": 4 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "def search", - "latency_ms": 2334.758200019598, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.09090909090909091, - "rbo_topk": 0.23541639942571424, - "staged_unique_files_topk": 
2, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 2, - "dense_unique_dirs_topk": 4 - }, - { - "query": "LspBridge", - "staged": { - "strategy": "staged", - "query": "LspBridge", - "latency_ms": 4143.470999985933, - "num_results": 6, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\keepalive_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 20.66636085510254, - "stage1_fallback_search_ms": 150.6054401397705, - "stage2_expand_ms": 2064.2361640930176, - "stage3_cluster_ms": 0.012159347534179688, - "stage4_rerank_ms": 1838.1483554840088 - }, - "stage_counts": { - "stage1_candidates": 5, - "stage1_fallback_used": 1, - "stage2_expanded": 31, - "stage2_unique_paths": 11, - "stage2_duplicate_paths": 20, - "stage3_clustered": 20, - "stage3_strategy": "score", - "stage4_reranked": 20 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "LspBridge", - "latency_ms": 2207.86700001359, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.06666666666666667, - "rbo_topk": 0.045191399425714276, - "staged_unique_files_topk": 6, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 2, - "dense_unique_dirs_topk": 4 - }, - { - "query": "graph expansion", - "staged": { - "strategy": "staged", - "query": "graph expansion", - "latency_ms": 4234.638899981976, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 21.48127555847168, - "stage1_fallback_search_ms": 153.59735488891602, - "stage2_expand_ms": 2092.521905899048, - "stage3_cluster_ms": 0.00095367431640625, - "stage4_rerank_ms": 1876.7595291137695 - }, - "stage_counts": { - "stage1_candidates": 11, - "stage1_fallback_used": 1, - "stage2_expanded": 16, - "stage2_unique_paths": 13, - "stage2_duplicate_paths": 3, - "stage3_clustered": 16, - "stage3_strategy": "score", - "stage4_reranked": 16 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "graph expansion", - "latency_ms": 2646.9266000390053, - "num_results": 10, - "topk_paths": [ - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.1875, - "rbo_topk": 0.06134116970571428, - "staged_unique_files_topk": 9, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 7, - "dense_unique_dirs_topk": 4 - }, - { - "query": "clustering strategy", - "staged": { - "strategy": "staged", - "query": "clustering strategy", - "latency_ms": 4778.165899991989, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\__init__.py" - ], - "stage_stats": { - 
"stage_times": { - "stage1_binary_ms": 18.590688705444336, - "stage1_fallback_search_ms": 195.90282440185547, - "stage2_expand_ms": 2053.685426712036, - "stage3_cluster_ms": 0.00095367431640625, - "stage4_rerank_ms": 2431.095838546753 - }, - "stage_counts": { - "stage1_candidates": 10, - "stage1_fallback_used": 1, - "stage2_expanded": 10, - "stage2_unique_paths": 10, - "stage2_duplicate_paths": 0, - "stage3_clustered": 10, - "stage3_strategy": "score", - "stage4_reranked": 10 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "clustering strategy", - "latency_ms": 2887.1304000020027, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.1111111111111111, - "rbo_topk": 0.04670528456571428, - "staged_unique_files_topk": 10, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 3, - "dense_unique_dirs_topk": 4 - }, - { - "query": "error handling", - "staged": { - "strategy": "staged", - "query": "error handling", - "latency_ms": 5823.889799982309, - "num_results": 6, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 109.02619361877441, - "stage1_fallback_search_ms": 196.54059410095215, - "stage2_expand_ms": 2088.4640216827393, - "stage3_cluster_ms": 0.00095367431640625, - "stage4_rerank_ms": 3328.0465602874756 - }, - "stage_counts": { - "stage1_candidates": 5, - "stage1_fallback_used": 1, - "stage2_expanded": 13, - "stage2_unique_paths": 6, - "stage2_duplicate_paths": 7, - "stage3_clustered": 13, - "stage3_strategy": "score", - "stage4_reranked": 13 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "error handling", - "latency_ms": 3351.872999995947, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.07142857142857142, - "rbo_topk": 0.045191399425714276, - "staged_unique_files_topk": 5, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 4, - "dense_unique_dirs_topk": 4 - }, - { - "query": "how to parse json", - "staged": { - "strategy": "staged", - "query": "how to parse json", - 
"latency_ms": 5325.160299986601, - "num_results": 7, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 216.71128273010254, - "stage1_fallback_search_ms": 295.27878761291504, - "stage2_expand_ms": 2091.4883613586426, - "stage3_cluster_ms": 0.001430511474609375, - "stage4_rerank_ms": 2606.9161891937256 - }, - "stage_counts": { - "stage1_candidates": 4, - "stage1_fallback_used": 1, - "stage2_expanded": 9, - "stage2_unique_paths": 7, - "stage2_duplicate_paths": 2, - "stage3_clustered": 9, - "stage3_strategy": "score", - "stage4_reranked": 9 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "how to parse json", - "latency_ms": 2928.889899969101, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.21428571428571427, - "rbo_topk": 0.18590219827714285, - "staged_unique_files_topk": 7, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 5, - "dense_unique_dirs_topk": 4 - } - ] -} \ No newline at end of file diff --git a/codex-lens/benchmarks/results/compare_2026-02-11_dir_rr_fast7.json b/codex-lens/benchmarks/results/compare_2026-02-11_dir_rr_fast7.json deleted file mode 100644 index 61b1475f..00000000 --- a/codex-lens/benchmarks/results/compare_2026-02-11_dir_rr_fast7.json +++ /dev/null @@ -1,467 +0,0 @@ -{ - "summary": { - "timestamp": "2026-02-11 15:16:08", - "source": "codex-lens\\src", - "k": 10, - "coarse_k": 100, - "query_count": 7, - "avg_jaccard_topk": 0.11350467619264612, - "avg_rbo_topk": 0.09062624799510204, - "staged": { - "success": 7, - "avg_latency_ms": 4507.475014303412 - }, - "dense_rerank": { - "success": 7, - "avg_latency_ms": 2537.8563000304357 - } - }, - "comparisons": [ - { - "query": "class Config", - "staged": { - "strategy": "staged", - "query": "class Config", - "latency_ms": 2474.800100028515, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\file_watcher.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 
91.76826477050781, - "stage1_fallback_search_ms": 162.45269775390625, - "stage2_expand_ms": 14.957904815673828, - "stage3_cluster_ms": 0.8461475372314453, - "stage4_rerank_ms": 2129.7342777252197 - }, - "stage_counts": { - "stage1_candidates": 37, - "stage1_fallback_used": 1, - "stage2_expanded": 86, - "stage2_unique_paths": 53, - "stage2_duplicate_paths": 33, - "stage3_clustered": 20, - "stage3_strategy": "dir_rr", - "stage4_reranked": 20 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "class Config", - "latency_ms": 2425.3046000003815, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.05263157894736842, - "rbo_topk": 0.014635885139999999, - "staged_unique_files_topk": 10, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 8, - "dense_unique_dirs_topk": 4 - }, - { - "query": "def search", - "staged": { - "strategy": "staged", - "query": "def search", - "latency_ms": 5389.070900022984, - "num_results": 3, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 63.6446475982666, 
- "stage2_expand_ms": 3202.108144760132, - "stage3_cluster_ms": 0.0007152557373046875, - "stage4_rerank_ms": 2011.8708610534668 - }, - "stage_counts": { - "stage1_candidates": 3, - "stage2_expanded": 4, - "stage2_unique_paths": 3, - "stage2_duplicate_paths": 1, - "stage3_clustered": 4, - "stage3_strategy": "dir_rr", - "stage4_reranked": 4 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "def search", - "latency_ms": 2465.9148000478745, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.09090909090909091, - "rbo_topk": 0.23541639942571424, - "staged_unique_files_topk": 2, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 2, - "dense_unique_dirs_topk": 4 - }, - { - "query": "LspBridge", - "staged": { - "strategy": "staged", - "query": "LspBridge", - "latency_ms": 4989.407700002193, - "num_results": 7, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\keepalive_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 88.54341506958008, - "stage1_fallback_search_ms": 125.9164810180664, - "stage2_expand_ms": 2063.6398792266846, - "stage3_cluster_ms": 0.3476142883300781, - "stage4_rerank_ms": 2633.7506771087646 - }, - "stage_counts": { - "stage1_candidates": 5, - "stage1_fallback_used": 1, - "stage2_expanded": 31, - "stage2_unique_paths": 11, - "stage2_duplicate_paths": 20, - "stage3_clustered": 20, - "stage3_strategy": "dir_rr", - "stage4_reranked": 20 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "LspBridge", - "latency_ms": 2424.8579000234604, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.06666666666666667, - "rbo_topk": 0.045191399425714276, - "staged_unique_files_topk": 6, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 2, - "dense_unique_dirs_topk": 4 - }, - { - "query": "graph expansion", - "staged": { - "strategy": "staged", - "query": "graph expansion", - "latency_ms": 4771.1614000201225, - "num_results": 10, - "topk_paths": [ - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 61.426401138305664, - "stage1_fallback_search_ms": 152.01711654663086, - "stage2_expand_ms": 2078.4833431243896, - "stage3_cluster_ms": 0.0007152557373046875, - "stage4_rerank_ms": 2376.2998580932617 - }, - "stage_counts": { - "stage1_candidates": 11, - "stage1_fallback_used": 1, - "stage2_expanded": 16, - "stage2_unique_paths": 13, - "stage2_duplicate_paths": 3, - "stage3_clustered": 16, - "stage3_strategy": "dir_rr", - "stage4_reranked": 16 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "graph expansion", - "latency_ms": 2418.981700003147, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.1875, - "rbo_topk": 0.06134116970571428, - "staged_unique_files_topk": 9, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 7, - "dense_unique_dirs_topk": 4 - }, - { - "query": "clustering strategy", - "staged": { - "strategy": "staged", - "query": "clustering strategy", - "latency_ms": 4559.269900023937, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\__init__.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 60.93573570251465, - "stage1_fallback_search_ms": 141.4163112640381, - "stage2_expand_ms": 2032.2721004486084, - "stage3_cluster_ms": 0.0007152557373046875, - "stage4_rerank_ms": 2217.2317504882812 - }, - "stage_counts": { - "stage1_candidates": 10, - "stage1_fallback_used": 1, - "stage2_expanded": 10, - "stage2_unique_paths": 10, - "stage2_duplicate_paths": 0, - "stage3_clustered": 10, - "stage3_strategy": "dir_rr", - "stage4_reranked": 10 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "clustering 
strategy", - "latency_ms": 2443.3700000047684, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.1111111111111111, - "rbo_topk": 0.04670528456571428, - "staged_unique_files_topk": 10, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 3, - "dense_unique_dirs_topk": 4 - }, - { - "query": "error handling", - "staged": { - "strategy": "staged", - "query": "error handling", - "latency_ms": 4757.269500017166, - "num_results": 6, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 89.56503868103027, - "stage1_fallback_search_ms": 143.58854293823242, - "stage2_expand_ms": 2119.623899459839, - "stage3_cluster_ms": 0.0007152557373046875, - "stage4_rerank_ms": 2303.9650917053223 - }, - "stage_counts": { - "stage1_candidates": 5, - "stage1_fallback_used": 1, - "stage2_expanded": 13, - 
"stage2_unique_paths": 6, - "stage2_duplicate_paths": 7, - "stage3_clustered": 13, - "stage3_strategy": "dir_rr", - "stage4_reranked": 13 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "error handling", - "latency_ms": 2431.0521000623703, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.07142857142857142, - "rbo_topk": 0.045191399425714276, - "staged_unique_files_topk": 5, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 4, - "dense_unique_dirs_topk": 4 - }, - { - "query": "how to parse json", - "staged": { - "strategy": "staged", - "query": "how to parse json", - "latency_ms": 4611.3456000089645, - "num_results": 7, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 74.86128807067871, - "stage1_fallback_search_ms": 137.465238571167, - "stage2_expand_ms": 2086.426019668579, - "stage3_cluster_ms": 0.00095367431640625, - "stage4_rerank_ms": 2218.2157039642334 - }, - "stage_counts": { - "stage1_candidates": 4, - "stage1_fallback_used": 1, - "stage2_expanded": 9, - "stage2_unique_paths": 7, - "stage2_duplicate_paths": 2, - "stage3_clustered": 9, - "stage3_strategy": "dir_rr", - "stage4_reranked": 9 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "how to parse json", - "latency_ms": 3155.5130000710487, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.21428571428571427, - "rbo_topk": 0.18590219827714285, - "staged_unique_files_topk": 7, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 5, - "dense_unique_dirs_topk": 4 - } - ] -} \ No newline at end of file diff --git a/codex-lens/benchmarks/results/compare_2026-02-11_path_fast7.json b/codex-lens/benchmarks/results/compare_2026-02-11_path_fast7.json deleted file mode 100644 index e9a2d65e..00000000 --- 
a/codex-lens/benchmarks/results/compare_2026-02-11_path_fast7.json +++ /dev/null @@ -1,465 +0,0 @@ -{ - "summary": { - "timestamp": "2026-02-11 15:12:41", - "source": "codex-lens\\src", - "k": 10, - "coarse_k": 100, - "query_count": 7, - "avg_jaccard_topk": 0.13455730777159347, - "avg_rbo_topk": 0.10274807844326529, - "staged": { - "success": 7, - "avg_latency_ms": 4532.43382857527 - }, - "dense_rerank": { - "success": 7, - "avg_latency_ms": 2712.3431142909185 - } - }, - "comparisons": [ - { - "query": "class Config", - "staged": { - "strategy": "staged", - "query": "class Config", - "latency_ms": 2704.6869000196457, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 56.32758140563965, - "stage1_fallback_search_ms": 156.8472385406494, - "stage2_expand_ms": 15.436887741088867, - "stage3_cluster_ms": 0.04291534423828125, - "stage4_rerank_ms": 2388.756513595581 - }, - "stage_counts": { - "stage1_candidates": 37, - "stage1_fallback_used": 1, - "stage2_expanded": 86, - "stage2_unique_paths": 53, - "stage2_duplicate_paths": 33, - "stage3_clustered": 20, - "stage3_strategy": "path", - "stage4_reranked": 20 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "class Config", - "latency_ms": 3257.856599986553, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.2, - "rbo_topk": 0.09948869827714285, - "staged_unique_files_topk": 8, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 4, - "dense_unique_dirs_topk": 4 - }, - { - "query": "def search", - "staged": { - "strategy": "staged", - "query": "def search", - "latency_ms": 4347.2081000208855, - "num_results": 3, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 65.37723541259766, - "stage2_expand_ms": 2145.587682723999, - "stage3_cluster_ms": 0.00095367431640625, - "stage4_rerank_ms": 2052.9236793518066 - }, - "stage_counts": { - "stage1_candidates": 3, - "stage2_expanded": 4, - "stage2_unique_paths": 3, - "stage2_duplicate_paths": 1, - "stage3_clustered": 4, - "stage3_strategy": "path", - "stage4_reranked": 4 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "def search", - "latency_ms": 2642.404200077057, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.09090909090909091, - "rbo_topk": 0.23541639942571424, - "staged_unique_files_topk": 2, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 2, - "dense_unique_dirs_topk": 4 - }, - { - "query": "LspBridge", - "staged": { - "strategy": "staged", - "query": "LspBridge", - "latency_ms": 4627.254400074482, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\keepalive_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 96.67634963989258, - "stage1_fallback_search_ms": 162.25123405456543, - "stage2_expand_ms": 2071.5224742889404, - "stage3_cluster_ms": 0.018835067749023438, - "stage4_rerank_ms": 2211.8191719055176 - }, - "stage_counts": { - "stage1_candidates": 5, - "stage1_fallback_used": 1, - "stage2_expanded": 31, - "stage2_unique_paths": 11, - "stage2_duplicate_paths": 20, - "stage3_clustered": 11, - "stage3_strategy": "path", - "stage4_reranked": 11 - } - }, - "error": null - }, - "dense_rerank": { 
- "strategy": "dense_rerank", - "query": "LspBridge", - "latency_ms": 2479.5284999608994, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.06666666666666667, - "rbo_topk": 0.045191399425714276, - "staged_unique_files_topk": 6, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 2, - "dense_unique_dirs_topk": 4 - }, - { - "query": "graph expansion", - "staged": { - "strategy": "staged", - "query": "graph expansion", - "latency_ms": 4663.639899969101, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py" - ], - "stage_stats": 
{ - "stage_times": { - "stage1_binary_ms": 82.36384391784668, - "stage1_fallback_search_ms": 158.2353115081787, - "stage2_expand_ms": 2087.8846645355225, - "stage3_cluster_ms": 0.00095367431640625, - "stage4_rerank_ms": 2249.4378089904785 - }, - "stage_counts": { - "stage1_candidates": 11, - "stage1_fallback_used": 1, - "stage2_expanded": 16, - "stage2_unique_paths": 13, - "stage2_duplicate_paths": 3, - "stage3_clustered": 16, - "stage3_strategy": "path", - "stage4_reranked": 16 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "graph expansion", - "latency_ms": 2455.024599969387, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.1875, - "rbo_topk": 0.06134116970571428, - "staged_unique_files_topk": 9, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 7, - "dense_unique_dirs_topk": 4 - }, - { - "query": "clustering strategy", - "staged": { - "strategy": "staged", - "query": "clustering strategy", - "latency_ms": 6402.90189999342, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\__init__.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 44.295310974121094, - "stage1_fallback_search_ms": 127.30145454406738, - "stage2_expand_ms": 2030.930995941162, - "stage3_cluster_ms": 0.00095367431640625, - "stage4_rerank_ms": 4132.822036743164 - }, - "stage_counts": { - "stage1_candidates": 10, - "stage1_fallback_used": 1, - "stage2_expanded": 10, - "stage2_unique_paths": 10, - "stage2_duplicate_paths": 0, - "stage3_clustered": 10, - "stage3_strategy": "path", - "stage4_reranked": 10 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "clustering strategy", - "latency_ms": 3286.4142000079155, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.1111111111111111, - "rbo_topk": 0.04670528456571428, - "staged_unique_files_topk": 10, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 3, - "dense_unique_dirs_topk": 4 - }, - { - "query": "error handling", - "staged": { - "strategy": "staged", - "query": "error handling", - "latency_ms": 4532.2757999897, - "num_results": 6, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 85.02960205078125, - "stage1_fallback_search_ms": 146.46339416503906, - "stage2_expand_ms": 2071.5532302856445, - "stage3_cluster_ms": 0.00095367431640625, - "stage4_rerank_ms": 2140.7644748687744 - }, - "stage_counts": { - "stage1_candidates": 5, - "stage1_fallback_used": 1, - "stage2_expanded": 13, - "stage2_unique_paths": 6, - "stage2_duplicate_paths": 7, - "stage3_clustered": 13, - "stage3_strategy": "path", - "stage4_reranked": 13 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "error handling", - "latency_ms": 2349.7827999591827, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.07142857142857142, - "rbo_topk": 0.045191399425714276, - "staged_unique_files_topk": 5, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 4, - "dense_unique_dirs_topk": 4 - }, - { - "query": "how to parse json", - "staged": { - "strategy": "staged", - "query": "how to parse json", - "latency_ms": 4449.06979995966, - "num_results": 7, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 67.15631484985352, - "stage1_fallback_search_ms": 148.30541610717773, - "stage2_expand_ms": 2069.3678855895996, - "stage3_cluster_ms": 0.0007152557373046875, - "stage4_rerank_ms": 2097.882032394409 - }, - "stage_counts": { - "stage1_candidates": 4, - "stage1_fallback_used": 1, - "stage2_expanded": 9, - "stage2_unique_paths": 7, - "stage2_duplicate_paths": 2, - "stage3_clustered": 9, - "stage3_strategy": "path", - "stage4_reranked": 9 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "how to parse json", - "latency_ms": 2515.3909000754356, - "num_results": 10, - 
"topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.21428571428571427, - "rbo_topk": 0.18590219827714285, - "staged_unique_files_topk": 7, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 5, - "dense_unique_dirs_topk": 4 - } - ] -} \ No newline at end of file diff --git a/codex-lens/benchmarks/results/compare_2026-02-11_score_fast7.json b/codex-lens/benchmarks/results/compare_2026-02-11_score_fast7.json deleted file mode 100644 index 1ff3e084..00000000 --- a/codex-lens/benchmarks/results/compare_2026-02-11_score_fast7.json +++ /dev/null @@ -1,465 +0,0 @@ -{ - "summary": { - "timestamp": "2026-02-11 15:14:25", - "source": "codex-lens\\src", - "k": 10, - "coarse_k": 100, - "query_count": 7, - "avg_jaccard_topk": 0.12384302205730777, - "avg_rbo_topk": 0.09816673566816325, - "staged": { - "success": 7, - "avg_latency_ms": 4538.7477714674815 - }, - "dense_rerank": { - "success": 7, - "avg_latency_ms": 2568.1517999768257 - } - }, - "comparisons": [ - { - "query": "class Config", - "staged": { - "strategy": "staged", - "query": "class Config", - "latency_ms": 2546.395000040531, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 70.5413818359375, - "stage1_fallback_search_ms": 165.39907455444336, - "stage2_expand_ms": 15.58542251586914, - "stage3_cluster_ms": 0.020265579223632812, - "stage4_rerank_ms": 2209.89727973938 - }, - "stage_counts": { - "stage1_candidates": 37, - "stage1_fallback_used": 1, - "stage2_expanded": 86, - "stage2_unique_paths": 53, - "stage2_duplicate_paths": 33, - "stage3_clustered": 20, - "stage3_strategy": "score", - "stage4_reranked": 20 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "class Config", - "latency_ms": 2610.328099966049, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.125, - "rbo_topk": 
0.06741929885142856, - "staged_unique_files_topk": 8, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 5, - "dense_unique_dirs_topk": 4 - }, - { - "query": "def search", - "staged": { - "strategy": "staged", - "query": "def search", - "latency_ms": 4569.872200012207, - "num_results": 3, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 96.31776809692383, - "stage2_expand_ms": 2299.86310005188, - "stage3_cluster_ms": 0.0007152557373046875, - "stage4_rerank_ms": 2094.2182540893555 - }, - "stage_counts": { - "stage1_candidates": 3, - "stage2_expanded": 4, - "stage2_unique_paths": 3, - "stage2_duplicate_paths": 1, - "stage3_clustered": 4, - "stage3_strategy": "score", - "stage4_reranked": 4 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "def search", - "latency_ms": 2509.9732999801636, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.09090909090909091, - "rbo_topk": 0.23541639942571424, - "staged_unique_files_topk": 2, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 
2, - "dense_unique_dirs_topk": 4 - }, - { - "query": "LspBridge", - "staged": { - "strategy": "staged", - "query": "LspBridge", - "latency_ms": 5064.990800082684, - "num_results": 6, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\keepalive_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 86.1806869506836, - "stage1_fallback_search_ms": 150.21824836730957, - "stage2_expand_ms": 2080.6803703308105, - "stage3_cluster_ms": 0.011682510375976562, - "stage4_rerank_ms": 2663.7954711914062 - }, - "stage_counts": { - "stage1_candidates": 5, - "stage1_fallback_used": 1, - "stage2_expanded": 31, - "stage2_unique_paths": 11, - "stage2_duplicate_paths": 20, - "stage3_clustered": 20, - "stage3_strategy": "score", - "stage4_reranked": 20 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "LspBridge", - "latency_ms": 2778.6906000375748, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.06666666666666667, - "rbo_topk": 0.045191399425714276, - "staged_unique_files_topk": 6, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 2, - "dense_unique_dirs_topk": 4 - }, - { - "query": "graph expansion", - "staged": { - "strategy": "staged", - "query": "graph expansion", - "latency_ms": 4816.586899995804, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 79.48184013366699, - "stage1_fallback_search_ms": 158.03027153015137, - "stage2_expand_ms": 2087.271213531494, - "stage3_cluster_ms": 0.0007152557373046875, - "stage4_rerank_ms": 2410.567283630371 - }, - "stage_counts": { - "stage1_candidates": 11, - "stage1_fallback_used": 1, - "stage2_expanded": 16, - "stage2_unique_paths": 13, - "stage2_duplicate_paths": 3, - "stage3_clustered": 16, - "stage3_strategy": "score", - "stage4_reranked": 16 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "graph expansion", - "latency_ms": 2692.1504999399185, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.1875, - "rbo_topk": 0.06134116970571428, - "staged_unique_files_topk": 9, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 7, - "dense_unique_dirs_topk": 4 - }, - { - "query": "clustering strategy", - "staged": { - "strategy": "staged", - "query": "clustering strategy", - "latency_ms": 4494.9805000424385, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\__init__.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 40.569305419921875, - 
"stage1_fallback_search_ms": 141.06035232543945, - "stage2_expand_ms": 2043.9364910125732, - "stage3_cluster_ms": 0.00095367431640625, - "stage4_rerank_ms": 2198.4200477600098 - }, - "stage_counts": { - "stage1_candidates": 10, - "stage1_fallback_used": 1, - "stage2_expanded": 10, - "stage2_unique_paths": 10, - "stage2_duplicate_paths": 0, - "stage3_clustered": 10, - "stage3_strategy": "score", - "stage4_reranked": 10 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "clustering strategy", - "latency_ms": 2474.2726999521255, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.1111111111111111, - "rbo_topk": 0.04670528456571428, - "staged_unique_files_topk": 10, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 3, - "dense_unique_dirs_topk": 4 - }, - { - "query": "error handling", - "staged": { - "strategy": "staged", - "query": "error handling", - "latency_ms": 5652.523400068283, - "num_results": 6, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 87.34393119812012, - "stage1_fallback_search_ms": 149.7325897216797, - "stage2_expand_ms": 2072.728157043457, - "stage3_cluster_ms": 0.00095367431640625, - "stage4_rerank_ms": 3190.687894821167 - }, - "stage_counts": { - "stage1_candidates": 5, - "stage1_fallback_used": 1, - "stage2_expanded": 13, - "stage2_unique_paths": 6, - "stage2_duplicate_paths": 7, - "stage3_clustered": 13, - "stage3_strategy": "score", - "stage4_reranked": 13 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "error handling", - "latency_ms": 2481.709800004959, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.07142857142857142, - "rbo_topk": 0.045191399425714276, - "staged_unique_files_topk": 5, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 4, - "dense_unique_dirs_topk": 4 - }, - { - "query": "how to parse json", - "staged": { - "strategy": "staged", - "query": "how to parse json", - "latency_ms": 4625.885600030422, - "num_results": 7, - "topk_paths": [ - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 92.83590316772461, - "stage1_fallback_search_ms": 147.12858200073242, - "stage2_expand_ms": 2061.2568855285645, - "stage3_cluster_ms": 0.0011920928955078125, - "stage4_rerank_ms": 2246.800184249878 - }, - "stage_counts": { - "stage1_candidates": 4, - "stage1_fallback_used": 1, - "stage2_expanded": 9, - "stage2_unique_paths": 7, - "stage2_duplicate_paths": 2, - "stage3_clustered": 9, - "stage3_strategy": "score", - "stage4_reranked": 9 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "how to parse json", - "latency_ms": 2429.9375999569893, - "num_results": 10, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py" - ], - "stage_stats": null, - "error": null - }, - 
"jaccard_topk": 0.21428571428571427, - "rbo_topk": 0.18590219827714285, - "staged_unique_files_topk": 7, - "dense_unique_files_topk": 10, - "staged_unique_dirs_topk": 5, - "dense_unique_dirs_topk": 4 - } - ] -} \ No newline at end of file diff --git a/codex-lens/benchmarks/results/method_contribution_analysis.json b/codex-lens/benchmarks/results/method_contribution_analysis.json deleted file mode 100644 index f192b4fa..00000000 --- a/codex-lens/benchmarks/results/method_contribution_analysis.json +++ /dev/null @@ -1,406 +0,0 @@ -{ - "storage_analysis": { - "tables": { - "code_relationships": { - "row_count": 0, - "columns": [ - "id", - "source_symbol_id", - "target_qualified_name", - "relationship_type", - "source_line", - "target_file" - ] - }, - "embeddings_config": { - "row_count": 1, - "columns": [ - "id", - "model_profile", - "model_name", - "embedding_dim", - "backend", - "created_at", - "updated_at" - ] - }, - "file_keywords": { - "row_count": 0, - "columns": [ - "file_id", - "keyword_id" - ] - }, - "files": { - "row_count": 0, - "columns": [ - "id", - "name", - "full_path", - "language", - "content", - "mtime", - "line_count" - ] - }, - "files_fts_exact": { - "row_count": 0, - "columns": [ - "name", - "full_path", - "content" - ] - }, - "files_fts_exact_config": { - "row_count": 1, - "columns": [ - "k", - "v" - ] - }, - "files_fts_exact_data": { - "row_count": 2, - "columns": [ - "id", - "block" - ] - }, - "files_fts_exact_docsize": { - "row_count": 0, - "columns": [ - "id", - "sz" - ] - }, - "files_fts_exact_idx": { - "row_count": 0, - "columns": [ - "segid", - "term", - "pgno" - ] - }, - "files_fts_fuzzy": { - "row_count": 0, - "columns": [ - "name", - "full_path", - "content" - ] - }, - "files_fts_fuzzy_config": { - "row_count": 1, - "columns": [ - "k", - "v" - ] - }, - "files_fts_fuzzy_data": { - "row_count": 2, - "columns": [ - "id", - "block" - ] - }, - "files_fts_fuzzy_docsize": { - "row_count": 0, - "columns": [ - "id", - "sz" - ] - }, - 
"files_fts_fuzzy_idx": { - "row_count": 0, - "columns": [ - "segid", - "term", - "pgno" - ] - }, - "graph_neighbors": { - "row_count": 0, - "columns": [ - "source_symbol_id", - "neighbor_symbol_id", - "relationship_depth" - ] - }, - "keywords": { - "row_count": 0, - "columns": [ - "id", - "keyword" - ] - }, - "merkle_hashes": { - "row_count": 0, - "columns": [ - "file_id", - "sha256", - "updated_at" - ] - }, - "merkle_state": { - "row_count": 1, - "columns": [ - "id", - "root_hash", - "updated_at" - ] - }, - "semantic_chunks": { - "row_count": 0, - "columns": [ - "id", - "file_path", - "content", - "embedding", - "metadata", - "created_at", - "embedding_binary", - "embedding_dense" - ] - }, - "semantic_metadata": { - "row_count": 0, - "columns": [ - "id", - "file_id", - "summary", - "purpose", - "llm_tool", - "generated_at" - ] - }, - "sqlite_sequence": { - "row_count": 0, - "columns": [ - "name", - "seq" - ] - }, - "subdirs": { - "row_count": 2, - "columns": [ - "id", - "name", - "index_path", - "files_count", - "last_updated" - ] - }, - "symbols": { - "row_count": 0, - "columns": [ - "id", - "file_id", - "name", - "kind", - "start_line", - "end_line" - ] - } - }, - "conflicts": [], - "recommendations": [ - "Found 10 FTS tables: ['files_fts_exact', 'files_fts_exact_config', 'files_fts_exact_data', 'files_fts_exact_docsize', 'files_fts_exact_idx', 'files_fts_fuzzy', 'files_fts_fuzzy_config', 'files_fts_fuzzy_data', 'files_fts_fuzzy_docsize', 'files_fts_fuzzy_idx']. Dual FTS (exact + fuzzy) is properly configured." 
- ] - }, - "contribution_analysis": { - "per_query": [ - { - "query": "binary quantization", - "methods": { - "fts_exact": { - "error": "'obj' object has no attribute 'symbol_boost_factor'", - "count": 0 - }, - "fts_fuzzy": { - "error": "'obj' object has no attribute 'symbol_boost_factor'", - "count": 0 - }, - "vector": { - "error": "'obj' object has no attribute 'symbol_boost_factor'", - "count": 0 - }, - "splade": { - "error": "'obj' object has no attribute 'symbol_boost_factor'", - "count": 0 - } - }, - "fusion_analysis": {}, - "overlaps": {} - }, - { - "query": "hamming distance search", - "methods": { - "fts_exact": { - "error": "'obj' object has no attribute 'symbol_boost_factor'", - "count": 0 - }, - "fts_fuzzy": { - "error": "'obj' object has no attribute 'symbol_boost_factor'", - "count": 0 - }, - "vector": { - "error": "'obj' object has no attribute 'symbol_boost_factor'", - "count": 0 - }, - "splade": { - "error": "'obj' object has no attribute 'symbol_boost_factor'", - "count": 0 - } - }, - "fusion_analysis": {}, - "overlaps": {} - }, - { - "query": "embeddings generation", - "methods": { - "fts_exact": { - "error": "'obj' object has no attribute 'symbol_boost_factor'", - "count": 0 - }, - "fts_fuzzy": { - "error": "'obj' object has no attribute 'symbol_boost_factor'", - "count": 0 - }, - "vector": { - "error": "'obj' object has no attribute 'symbol_boost_factor'", - "count": 0 - }, - "splade": { - "error": "'obj' object has no attribute 'symbol_boost_factor'", - "count": 0 - } - }, - "fusion_analysis": {}, - "overlaps": {} - }, - { - "query": "reranking algorithm", - "methods": { - "fts_exact": { - "error": "'obj' object has no attribute 'symbol_boost_factor'", - "count": 0 - }, - "fts_fuzzy": { - "error": "'obj' object has no attribute 'symbol_boost_factor'", - "count": 0 - }, - "vector": { - "error": "'obj' object has no attribute 'symbol_boost_factor'", - "count": 0 - }, - "splade": { - "error": "'obj' object has no attribute 'symbol_boost_factor'", 
- "count": 0 - } - }, - "fusion_analysis": {}, - "overlaps": {} - }, - { - "query": "database connection handling", - "methods": { - "fts_exact": { - "error": "'obj' object has no attribute 'symbol_boost_factor'", - "count": 0 - }, - "fts_fuzzy": { - "error": "'obj' object has no attribute 'symbol_boost_factor'", - "count": 0 - }, - "vector": { - "error": "'obj' object has no attribute 'symbol_boost_factor'", - "count": 0 - }, - "splade": { - "error": "'obj' object has no attribute 'symbol_boost_factor'", - "count": 0 - } - }, - "fusion_analysis": {}, - "overlaps": {} - } - ], - "summary": { - "fts_exact": { - "avg_count": 0.0, - "avg_latency_ms": 0 - }, - "fts_fuzzy": { - "avg_count": 0.0, - "avg_latency_ms": 0 - }, - "vector": { - "avg_count": 0.0, - "avg_latency_ms": 0 - }, - "splade": { - "avg_count": 0.0, - "avg_latency_ms": 0 - } - } - }, - "fusion_experiment": { - "per_query": [ - { - "query": "binary quantization", - "strategies": { - "standard_hybrid": { - "error": "'obj' object has no attribute 'symbol_boost_factor'" - }, - "fts_rerank_fusion": { - "error": "'obj' object has no attribute 'symbol_boost_factor'" - } - } - }, - { - "query": "hamming distance search", - "strategies": { - "standard_hybrid": { - "error": "'obj' object has no attribute 'symbol_boost_factor'" - }, - "fts_rerank_fusion": { - "error": "'obj' object has no attribute 'symbol_boost_factor'" - } - } - }, - { - "query": "embeddings generation", - "strategies": { - "standard_hybrid": { - "error": "'obj' object has no attribute 'symbol_boost_factor'" - }, - "fts_rerank_fusion": { - "error": "'obj' object has no attribute 'symbol_boost_factor'" - } - } - }, - { - "query": "reranking algorithm", - "strategies": { - "standard_hybrid": { - "error": "'obj' object has no attribute 'symbol_boost_factor'" - }, - "fts_rerank_fusion": { - "error": "'obj' object has no attribute 'symbol_boost_factor'" - } - } - }, - { - "query": "database connection handling", - "strategies": { - "standard_hybrid": 
{ - "error": "'obj' object has no attribute 'symbol_boost_factor'" - }, - "fts_rerank_fusion": { - "error": "'obj' object has no attribute 'symbol_boost_factor'" - } - } - } - ], - "summary": {} - } -} \ No newline at end of file diff --git a/codex-lens/benchmarks/results/tmp_compare1.json b/codex-lens/benchmarks/results/tmp_compare1.json deleted file mode 100644 index d8b14058..00000000 --- a/codex-lens/benchmarks/results/tmp_compare1.json +++ /dev/null @@ -1,73 +0,0 @@ -{ - "summary": { - "timestamp": "2026-02-08 23:48:26", - "source": "src", - "k": 5, - "coarse_k": 50, - "query_count": 1, - "avg_jaccard_topk": 0.0, - "avg_rbo_topk": 0.0, - "staged": { - "success": 1, - "avg_latency_ms": 30093.97499999404 - }, - "dense_rerank": { - "success": 1, - "avg_latency_ms": 331.4424999952316 - } - }, - "comparisons": [ - { - "query": "class Config", - "staged": { - "strategy": "staged", - "query": "class Config", - "latency_ms": 30093.97499999404, - "num_results": 5, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\__init__.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 6421.706914901733, - "stage2_expand_ms": 17591.988563537598, - "stage3_cluster_ms": 3700.4549503326416, - "stage4_rerank_ms": 2340.064525604248 - }, - "stage_counts": { - "stage1_candidates": 50, - "stage2_expanded": 99, - "stage3_clustered": 10, - "stage4_reranked": 10 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "class Config", - "latency_ms": 331.4424999952316, - "num_results": 5, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\splade_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.0, - "rbo_topk": 0.0, - "staged_unique_files_topk": 5, - "dense_unique_files_topk": 5, - "staged_unique_dirs_topk": 4, - "dense_unique_dirs_topk": 1 - } - ] -} \ No newline at end of file diff --git a/codex-lens/benchmarks/results/tmp_compare3_ok_cpu.json b/codex-lens/benchmarks/results/tmp_compare3_ok_cpu.json deleted file mode 100644 index 550e0b19..00000000 --- a/codex-lens/benchmarks/results/tmp_compare3_ok_cpu.json +++ /dev/null @@ -1,177 +0,0 @@ -{ - "summary": { - "timestamp": "2026-02-08 23:58:56", - "source": "src", - "k": 5, - "coarse_k": 50, - "query_count": 3, - "avg_jaccard_topk": 0.11574074074074074, - "avg_rbo_topk": 0.14601366666666662, - "staged": { - "success": 3, - "avg_latency_ms": 27868.044033328693 - }, - "dense_rerank": { - "success": 3, - "avg_latency_ms": 1339.25289999942 - } - }, - "comparisons": [ - { - "query": "class Config", - "staged": { - "strategy": "staged", - "query": "class Config", - "latency_ms": 33643.06179998815, - "num_results": 5, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\incremental_indexer.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 6201.4524936676025, - "stage2_expand_ms": 17306.61702156067, - "stage3_cluster_ms": 6829.557418823242, - "stage4_rerank_ms": 3267.071485519409 - }, - "stage_counts": { - "stage1_candidates": 50, 
- "stage2_expanded": 99, - "stage3_clustered": 10, - "stage4_reranked": 10 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "class Config", - "latency_ms": 1520.9955999851227, - "num_results": 5, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\splade_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.1111111111111111, - "rbo_topk": 0.031347, - "staged_unique_files_topk": 5, - "dense_unique_files_topk": 5, - "staged_unique_dirs_topk": 5, - "dense_unique_dirs_topk": 1 - }, - { - "query": "def search", - "staged": { - "strategy": "staged", - "query": "def search", - "latency_ms": 26400.58900000155, - "num_results": 5, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 404.60920333862305, - "stage2_expand_ms": 20036.258697509766, - "stage3_cluster_ms": 4919.439315795898, - "stage4_rerank_ms": 1001.8632411956787 - }, - "stage_counts": { - "stage1_candidates": 50, - "stage2_expanded": 51, - "stage3_clustered": 10, - "stage4_reranked": 10 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "def search", - "latency_ms": 1264.3862999975681, - "num_results": 5, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.125, - "rbo_topk": 0.20334699999999994, - "staged_unique_files_topk": 4, - "dense_unique_files_topk": 5, - "staged_unique_dirs_topk": 3, - "dense_unique_dirs_topk": 2 - }, - { - "query": "LspBridge", - "staged": { - "strategy": "staged", - "query": "LspBridge", - "latency_ms": 23560.481299996376, - "num_results": 5, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 385.28990745544434, - "stage2_expand_ms": 17787.648677825928, - "stage3_cluster_ms": 4374.642372131348, - "stage4_rerank_ms": 974.8115539550781 - }, - "stage_counts": { - "stage1_candidates": 50, - "stage2_expanded": 50, - "stage3_clustered": 10, - "stage4_reranked": 10 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "LspBridge", - "latency_ms": 1232.3768000155687, - "num_results": 5, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py" - ], - "stage_stats": null, - "error": null - }, - 
"jaccard_topk": 0.1111111111111111, - "rbo_topk": 0.20334699999999994, - "staged_unique_files_topk": 5, - "dense_unique_files_topk": 5, - "staged_unique_dirs_topk": 4, - "dense_unique_dirs_topk": 1 - } - ] -} \ No newline at end of file diff --git a/codex-lens/benchmarks/results/tmp_compare3_ok_cpu_dedup.json b/codex-lens/benchmarks/results/tmp_compare3_ok_cpu_dedup.json deleted file mode 100644 index 26c837ec..00000000 --- a/codex-lens/benchmarks/results/tmp_compare3_ok_cpu_dedup.json +++ /dev/null @@ -1,176 +0,0 @@ -{ - "summary": { - "timestamp": "2026-02-09 00:08:47", - "source": "src", - "k": 5, - "coarse_k": 50, - "query_count": 3, - "avg_jaccard_topk": 0.11574074074074074, - "avg_rbo_topk": 0.14601366666666662, - "staged": { - "success": 3, - "avg_latency_ms": 31720.555866663653 - }, - "dense_rerank": { - "success": 3, - "avg_latency_ms": 1401.2113333245118 - } - }, - "comparisons": [ - { - "query": "class Config", - "staged": { - "strategy": "staged", - "query": "class Config", - "latency_ms": 40162.88519999385, - "num_results": 5, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\incremental_indexer.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 6091.366767883301, - "stage2_expand_ms": 17540.942907333374, - "stage3_cluster_ms": 13169.558048248291, - "stage4_rerank_ms": 3317.5392150878906 - }, - "stage_counts": { - "stage1_candidates": 50, - "stage2_expanded": 99, - "stage3_clustered": 10, - "stage4_reranked": 10 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "class Config", - "latency_ms": 1571.1398999989033, - "num_results": 5, - "topk_paths": [ - 
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\splade_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.1111111111111111, - "rbo_topk": 0.031347, - "staged_unique_files_topk": 5, - "dense_unique_files_topk": 5, - "staged_unique_dirs_topk": 5, - "dense_unique_dirs_topk": 1 - }, - { - "query": "def search", - "staged": { - "strategy": "staged", - "query": "def search", - "latency_ms": 31623.380899995565, - "num_results": 5, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 400.84290504455566, - "stage2_expand_ms": 20529.58631515503, - "stage3_cluster_ms": 9625.348806381226, - "stage4_rerank_ms": 1027.686357498169 - }, - "stage_counts": { - "stage1_candidates": 50, - "stage2_expanded": 51, - "stage3_clustered": 10, - "stage4_reranked": 10 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "def search", - "latency_ms": 1376.3304999768734, - "num_results": 5, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py" - ], - "stage_stats": null, - "error": null - }, - 
"jaccard_topk": 0.125, - "rbo_topk": 0.20334699999999994, - "staged_unique_files_topk": 4, - "dense_unique_files_topk": 5, - "staged_unique_dirs_topk": 3, - "dense_unique_dirs_topk": 2 - }, - { - "query": "LspBridge", - "staged": { - "strategy": "staged", - "query": "LspBridge", - "latency_ms": 23375.40150000155, - "num_results": 5, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py" - ], - "stage_stats": { - "stage_times": { - "stage1_binary_ms": 392.41671562194824, - "stage2_expand_ms": 17760.897397994995, - "stage3_cluster_ms": 4194.235563278198, - "stage4_rerank_ms": 990.307092666626 - }, - "stage_counts": { - "stage1_candidates": 50, - "stage2_expanded": 50, - "stage3_clustered": 10, - "stage4_reranked": 10 - } - }, - "error": null - }, - "dense_rerank": { - "strategy": "dense_rerank", - "query": "LspBridge", - "latency_ms": 1256.1635999977589, - "num_results": 5, - "topk_paths": [ - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py", - "d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py" - ], - "stage_stats": null, - "error": null - }, - "jaccard_topk": 0.1111111111111111, - "rbo_topk": 0.20334699999999994, - "staged_unique_files_topk": 5, - "dense_unique_files_topk": 5, - "staged_unique_dirs_topk": 4, - "dense_unique_dirs_topk": 1 - } - ] -} \ No newline at end of file diff --git a/codex-lens/coir_benchmark_full.py b/codex-lens/coir_benchmark_full.py deleted file mode 100644 index 
193b6043..00000000 --- a/codex-lens/coir_benchmark_full.py +++ /dev/null @@ -1,465 +0,0 @@ -""" -CoIR Benchmark Evaluation Report Generator - -Compares SPLADE with mainstream code retrieval models on CoIR benchmark tasks. -Generates comprehensive performance analysis report. -""" -import sys -import time -import json -from pathlib import Path -from datetime import datetime -from typing import Dict, List, Tuple -import numpy as np - -sys.path.insert(0, 'src') - -# ============================================================================= -# REFERENCE: Published CoIR Benchmark Scores (NDCG@10) -# Source: CoIR Paper (ACL 2025) - https://arxiv.org/abs/2407.02883 -# ============================================================================= - -COIR_REFERENCE_SCORES = { - # Model: {dataset: NDCG@10 score} - "Voyage-Code-002": { - "APPS": 26.52, "CosQA": 29.79, "Text2SQL": 69.26, "CodeSearchNet": 81.79, - "CCR": 73.45, "Contest-DL": 72.77, "StackOverflow": 27.28, - "FB-ST": 87.68, "FB-MT": 65.35, "Average": 56.26 - }, - "E5-Mistral-7B": { - "APPS": 21.33, "CosQA": 31.27, "Text2SQL": 65.98, "CodeSearchNet": 54.25, - "CCR": 65.27, "Contest-DL": 82.55, "StackOverflow": 33.24, - "FB-ST": 91.54, "FB-MT": 72.71, "Average": 55.18 - }, - "E5-Base": { - "APPS": 11.52, "CosQA": 32.59, "Text2SQL": 52.31, "CodeSearchNet": 67.99, - "CCR": 56.87, "Contest-DL": 62.50, "StackOverflow": 21.87, - "FB-ST": 86.86, "FB-MT": 74.52, "Average": 50.90 - }, - "OpenAI-Ada-002": { - "APPS": 8.70, "CosQA": 28.88, "Text2SQL": 58.32, "CodeSearchNet": 74.21, - "CCR": 69.13, "Contest-DL": 53.34, "StackOverflow": 26.04, - "FB-ST": 72.40, "FB-MT": 47.12, "Average": 45.59 - }, - "BGE-Base": { - "APPS": 4.05, "CosQA": 32.76, "Text2SQL": 45.59, "CodeSearchNet": 69.60, - "CCR": 45.56, "Contest-DL": 38.50, "StackOverflow": 21.71, - "FB-ST": 73.55, "FB-MT": 64.99, "Average": 42.77 - }, - "BGE-M3": { - "APPS": 7.37, "CosQA": 22.73, "Text2SQL": 48.76, "CodeSearchNet": 43.23, - "CCR": 47.55, "Contest-DL": 
47.86, "StackOverflow": 31.16, - "FB-ST": 61.04, "FB-MT": 49.94, "Average": 39.31 - }, - "UniXcoder": { - "APPS": 1.36, "CosQA": 25.14, "Text2SQL": 50.45, "CodeSearchNet": 60.20, - "CCR": 58.36, "Contest-DL": 41.82, "StackOverflow": 31.03, - "FB-ST": 44.67, "FB-MT": 36.02, "Average": 37.33 - }, - "GTE-Base": { - "APPS": 3.24, "CosQA": 30.24, "Text2SQL": 46.19, "CodeSearchNet": 43.35, - "CCR": 35.50, "Contest-DL": 33.81, "StackOverflow": 28.80, - "FB-ST": 62.71, "FB-MT": 55.19, "Average": 36.75 - }, - "Contriever": { - "APPS": 5.14, "CosQA": 14.21, "Text2SQL": 45.46, "CodeSearchNet": 34.72, - "CCR": 35.74, "Contest-DL": 44.16, "StackOverflow": 24.21, - "FB-ST": 66.05, "FB-MT": 55.11, "Average": 36.40 - }, -} - -# Recent models (2025) -RECENT_MODELS = { - "Voyage-Code-3": {"Average": 62.5, "note": "13.8% better than OpenAI-v3-large"}, - "SFR-Embedding-Code-7B": {"Average": 67.4, "note": "#1 on CoIR (Feb 2025)"}, - "Jina-Code-v2": {"CosQA": 41.0, "note": "Strong on CosQA"}, - "CodeSage-Large": {"Average": 53.5, "note": "Specialized code model"}, -} - - -# ============================================================================= -# TEST DATA: Synthetic CoIR-like datasets for local evaluation -# ============================================================================= - -def create_test_datasets(): - """Create synthetic test datasets mimicking CoIR task types.""" - - # Text-to-Code (like CosQA, CodeSearchNet) - text_to_code = { - "name": "Text-to-Code", - "description": "Natural language queries to code snippets", - "corpus": [ - {"id": "c1", "text": "def authenticate_user(username: str, password: str) -> bool:\n user = db.get_user(username)\n if user and verify_hash(password, user.password_hash):\n return True\n return False"}, - {"id": "c2", "text": "async function fetchUserData(userId) {\n const response = await fetch(`/api/users/${userId}`);\n if (!response.ok) throw new Error('User not found');\n return response.json();\n}"}, - {"id": "c3", "text": "def 
calculate_statistics(data: List[float]) -> Dict[str, float]:\n return {\n 'mean': np.mean(data),\n 'std': np.std(data),\n 'median': np.median(data)\n }"}, - {"id": "c4", "text": "SELECT u.id, u.name, u.email, COUNT(o.id) as order_count\nFROM users u LEFT JOIN orders o ON u.id = o.user_id\nWHERE u.status = 'active'\nGROUP BY u.id, u.name, u.email"}, - {"id": "c5", "text": "def merge_sort(arr: List[int]) -> List[int]:\n if len(arr) <= 1:\n return arr\n mid = len(arr) // 2\n left = merge_sort(arr[:mid])\n right = merge_sort(arr[mid:])\n return merge(left, right)"}, - {"id": "c6", "text": "app.post('/api/auth/login', async (req, res) => {\n const { email, password } = req.body;\n const user = await User.findByEmail(email);\n if (!user || !await bcrypt.compare(password, user.password)) {\n return res.status(401).json({ error: 'Invalid credentials' });\n }\n const token = jwt.sign({ userId: user.id }, process.env.JWT_SECRET);\n res.json({ token });\n});"}, - {"id": "c7", "text": "CREATE TABLE products (\n id SERIAL PRIMARY KEY,\n name VARCHAR(255) NOT NULL,\n price DECIMAL(10, 2) NOT NULL,\n category_id INTEGER REFERENCES categories(id),\n created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP\n);"}, - {"id": "c8", "text": "def read_json_file(filepath: str) -> Dict:\n with open(filepath, 'r', encoding='utf-8') as f:\n return json.load(f)"}, - {"id": "c9", "text": "class UserRepository:\n def __init__(self, session):\n self.session = session\n \n def find_by_email(self, email: str) -> Optional[User]:\n return self.session.query(User).filter(User.email == email).first()"}, - {"id": "c10", "text": "try:\n result = await process_data(input_data)\nexcept ValidationError as e:\n logger.error(f'Validation failed: {e}')\n raise HTTPException(status_code=400, detail=str(e))\nexcept DatabaseError as e:\n logger.critical(f'Database error: {e}')\n raise HTTPException(status_code=500, detail='Internal server error')"}, - ], - "queries": [ - {"id": "q1", "text": "function to verify user 
password and authenticate", "relevant": ["c1", "c6"]}, - {"id": "q2", "text": "async http request to fetch user data", "relevant": ["c2"]}, - {"id": "q3", "text": "calculate mean median standard deviation statistics", "relevant": ["c3"]}, - {"id": "q4", "text": "SQL query join users and orders count", "relevant": ["c4", "c7"]}, - {"id": "q5", "text": "recursive sorting algorithm implementation", "relevant": ["c5"]}, - {"id": "q6", "text": "REST API login endpoint with JWT token", "relevant": ["c6", "c1"]}, - {"id": "q7", "text": "create database table with foreign key", "relevant": ["c7"]}, - {"id": "q8", "text": "read and parse JSON file python", "relevant": ["c8"]}, - {"id": "q9", "text": "repository pattern find user by email", "relevant": ["c9", "c1"]}, - {"id": "q10", "text": "exception handling with logging", "relevant": ["c10"]}, - ] - } - - # Code-to-Code (like CCR) - code_to_code = { - "name": "Code-to-Code", - "description": "Find similar code implementations", - "corpus": [ - {"id": "c1", "text": "def add(a, b): return a + b"}, - {"id": "c2", "text": "function sum(x, y) { return x + y; }"}, - {"id": "c3", "text": "func add(a int, b int) int { return a + b }"}, - {"id": "c4", "text": "def subtract(a, b): return a - b"}, - {"id": "c5", "text": "def multiply(a, b): return a * b"}, - {"id": "c6", "text": "const add = (a, b) => a + b;"}, - {"id": "c7", "text": "fn add(a: i32, b: i32) -> i32 { a + b }"}, - {"id": "c8", "text": "public int add(int a, int b) { return a + b; }"}, - ], - "queries": [ - {"id": "q1", "text": "def add(a, b): return a + b", "relevant": ["c1", "c2", "c3", "c6", "c7", "c8"]}, - {"id": "q2", "text": "def subtract(x, y): return x - y", "relevant": ["c4"]}, - {"id": "q3", "text": "def mult(x, y): return x * y", "relevant": ["c5"]}, - ] - } - - # Text2SQL - text2sql = { - "name": "Text2SQL", - "description": "Natural language to SQL queries", - "corpus": [ - {"id": "c1", "text": "SELECT * FROM users WHERE active = 1"}, - {"id": "c2", 
"text": "SELECT COUNT(*) FROM orders WHERE status = 'pending'"}, - {"id": "c3", "text": "SELECT u.name, SUM(o.total) FROM users u JOIN orders o ON u.id = o.user_id GROUP BY u.name"}, - {"id": "c4", "text": "UPDATE products SET price = price * 1.1 WHERE category = 'electronics'"}, - {"id": "c5", "text": "DELETE FROM sessions WHERE expires_at < NOW()"}, - {"id": "c6", "text": "INSERT INTO users (name, email) VALUES ('John', 'john@example.com')"}, - ], - "queries": [ - {"id": "q1", "text": "get all active users", "relevant": ["c1"]}, - {"id": "q2", "text": "count pending orders", "relevant": ["c2"]}, - {"id": "q3", "text": "total order amount by user", "relevant": ["c3"]}, - {"id": "q4", "text": "increase electronics prices by 10%", "relevant": ["c4"]}, - {"id": "q5", "text": "remove expired sessions", "relevant": ["c5"]}, - {"id": "q6", "text": "add new user", "relevant": ["c6"]}, - ] - } - - return [text_to_code, code_to_code, text2sql] - - -# ============================================================================= -# EVALUATION FUNCTIONS -# ============================================================================= - -def ndcg_at_k(ranked_list: List[str], relevant: List[str], k: int = 10) -> float: - """Calculate NDCG@k.""" - dcg = 0.0 - for i, doc_id in enumerate(ranked_list[:k]): - if doc_id in relevant: - dcg += 1.0 / np.log2(i + 2) - - # Ideal DCG - ideal_k = min(len(relevant), k) - idcg = sum(1.0 / np.log2(i + 2) for i in range(ideal_k)) - - return dcg / idcg if idcg > 0 else 0.0 - - -def precision_at_k(ranked_list: List[str], relevant: List[str], k: int = 10) -> float: - """Calculate Precision@k.""" - retrieved = set(ranked_list[:k]) - relevant_set = set(relevant) - return len(retrieved & relevant_set) / k - - -def recall_at_k(ranked_list: List[str], relevant: List[str], k: int = 10) -> float: - """Calculate Recall@k.""" - retrieved = set(ranked_list[:k]) - relevant_set = set(relevant) - return len(retrieved & relevant_set) / len(relevant_set) if 
relevant_set else 0.0 - - -def mrr(ranked_list: List[str], relevant: List[str]) -> float: - """Calculate Mean Reciprocal Rank.""" - for i, doc_id in enumerate(ranked_list): - if doc_id in relevant: - return 1.0 / (i + 1) - return 0.0 - - -def evaluate_model(model_name: str, encode_fn, datasets: List[Dict]) -> Dict: - """Evaluate a model on all datasets.""" - results = {} - - for dataset in datasets: - corpus = dataset["corpus"] - queries = dataset["queries"] - - corpus_ids = [doc["id"] for doc in corpus] - corpus_texts = [doc["text"] for doc in corpus] - corpus_embs = encode_fn(corpus_texts) - - metrics = {"ndcg@10": [], "precision@10": [], "recall@10": [], "mrr": []} - - for query in queries: - query_emb = encode_fn([query["text"]])[0] - - # Compute similarity scores - if hasattr(corpus_embs, 'shape') and len(corpus_embs.shape) == 2: - # Dense vectors - cosine similarity - q_norm = query_emb / (np.linalg.norm(query_emb) + 1e-8) - c_norm = corpus_embs / (np.linalg.norm(corpus_embs, axis=1, keepdims=True) + 1e-8) - scores = np.dot(c_norm, q_norm) - else: - # Sparse - dot product - scores = np.array([np.dot(c, query_emb) for c in corpus_embs]) - - ranked_indices = np.argsort(scores)[::-1] - ranked_ids = [corpus_ids[i] for i in ranked_indices] - relevant = query["relevant"] - - metrics["ndcg@10"].append(ndcg_at_k(ranked_ids, relevant, 10)) - metrics["precision@10"].append(precision_at_k(ranked_ids, relevant, 10)) - metrics["recall@10"].append(recall_at_k(ranked_ids, relevant, 10)) - metrics["mrr"].append(mrr(ranked_ids, relevant)) - - results[dataset["name"]] = {k: np.mean(v) * 100 for k, v in metrics.items()} - - # Calculate average - all_ndcg = [results[d["name"]]["ndcg@10"] for d in datasets] - results["Average"] = { - "ndcg@10": np.mean(all_ndcg), - "note": "Average across all datasets" - } - - return results - - -# ============================================================================= -# MODEL IMPLEMENTATIONS -# 
============================================================================= - -def get_splade_encoder(): - """Get SPLADE encoding function.""" - from codexlens.semantic.splade_encoder import get_splade_encoder as _get_splade - encoder = _get_splade() - - def encode(texts): - sparse_vecs = encoder.encode_batch(texts) if len(texts) > 1 else [encoder.encode_text(texts[0])] - # Convert to dense for comparison - vocab_size = encoder.vocab_size - dense = np.zeros((len(sparse_vecs), vocab_size), dtype=np.float32) - for i, sv in enumerate(sparse_vecs): - for tid, w in sv.items(): - dense[i, tid] = w - return dense - - return encode - - -def get_dense_encoder(model_name: str = "all-MiniLM-L6-v2"): - """Get dense embedding encoding function.""" - from sentence_transformers import SentenceTransformer - model = SentenceTransformer(model_name) - - def encode(texts): - return model.encode(texts, show_progress_bar=False) - - return encode - - -# ============================================================================= -# REPORT GENERATION -# ============================================================================= - -def generate_report(local_results: Dict, output_path: str = None): - """Generate comprehensive benchmark report.""" - - report = [] - report.append("=" * 80) - report.append("CODE RETRIEVAL BENCHMARK REPORT") - report.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") - report.append("=" * 80) - - # Section 1: Reference Benchmark Scores - report.append("\n## 1. 
CoIR Benchmark Reference Scores (Published)") - report.append("\nSource: CoIR Paper (ACL 2025) - https://arxiv.org/abs/2407.02883") - report.append("\n### NDCG@10 Scores by Model and Dataset\n") - - # Header - datasets = ["APPS", "CosQA", "Text2SQL", "CodeSearchNet", "CCR", "Contest-DL", "StackOverflow", "FB-ST", "FB-MT", "Average"] - header = "| Model | " + " | ".join(datasets) + " |" - separator = "|" + "|".join(["---"] * (len(datasets) + 1)) + "|" - report.append(header) - report.append(separator) - - # Data rows - for model, scores in COIR_REFERENCE_SCORES.items(): - row = f"| {model} | " + " | ".join([f"{scores.get(d, '-'):.2f}" if isinstance(scores.get(d), (int, float)) else str(scores.get(d, '-')) for d in datasets]) + " |" - report.append(row) - - # Section 2: Recent Models - report.append("\n### Recent Top Performers (2025)\n") - report.append("| Model | Average NDCG@10 | Notes |") - report.append("|-------|-----------------|-------|") - for model, info in RECENT_MODELS.items(): - avg = info.get("Average", "-") - note = info.get("note", "") - report.append(f"| {model} | {avg} | {note} |") - - # Section 3: Local Evaluation Results - report.append("\n## 2. 
Local Evaluation Results\n") - report.append("Evaluated on synthetic CoIR-like datasets\n") - - for model_name, results in local_results.items(): - report.append(f"\n### {model_name}\n") - report.append("| Dataset | NDCG@10 | Precision@10 | Recall@10 | MRR |") - report.append("|---------|---------|--------------|-----------|-----|") - for dataset_name, metrics in results.items(): - if dataset_name == "Average": - continue - ndcg = metrics.get("ndcg@10", 0) - prec = metrics.get("precision@10", 0) - rec = metrics.get("recall@10", 0) - m = metrics.get("mrr", 0) - report.append(f"| {dataset_name} | {ndcg:.2f} | {prec:.2f} | {rec:.2f} | {m:.2f} |") - - if "Average" in results: - avg = results["Average"]["ndcg@10"] - report.append(f"| **Average** | **{avg:.2f}** | - | - | - |") - - # Section 4: Comparison Analysis - report.append("\n## 3. Comparison Analysis\n") - - if "SPLADE" in local_results and "Dense (MiniLM)" in local_results: - splade_avg = local_results["SPLADE"]["Average"]["ndcg@10"] - dense_avg = local_results["Dense (MiniLM)"]["Average"]["ndcg@10"] - - report.append("### SPLADE vs Dense Embedding\n") - report.append(f"- SPLADE Average NDCG@10: {splade_avg:.2f}") - report.append(f"- Dense (MiniLM) Average NDCG@10: {dense_avg:.2f}") - - if splade_avg > dense_avg: - diff = ((splade_avg - dense_avg) / dense_avg) * 100 - report.append(f"- SPLADE outperforms by {diff:.1f}%") - else: - diff = ((dense_avg - splade_avg) / splade_avg) * 100 - report.append(f"- Dense outperforms by {diff:.1f}%") - - # Section 5: Key Insights - report.append("\n## 4. Key Insights\n") - report.append(""" -1. **Voyage-Code-002** achieved highest mean score (56.26) on original CoIR benchmark -2. **SFR-Embedding-Code-7B** (Salesforce) reached #1 in Feb 2025 with 67.4 average -3. **SPLADE** provides good balance of: - - Interpretability (visible token activations) - - Query expansion (learned synonyms) - - Efficient sparse retrieval - -4. 
**Task-specific performance varies significantly**: - - E5-Mistral excels at Contest-DL (82.55) but median on APPS - - Voyage-Code-002 excels at CodeSearchNet (81.79) - - No single model dominates all tasks - -5. **Hybrid approaches recommended**: - - Combine sparse (SPLADE/BM25) with dense for best results - - Use RRF (Reciprocal Rank Fusion) for score combination -""") - - # Section 6: Recommendations - report.append("\n## 5. Recommendations for Codex-lens\n") - report.append(""" -| Use Case | Recommended Approach | -|----------|---------------------| -| General code search | SPLADE + Dense hybrid | -| Exact keyword match | FTS (BM25) | -| Semantic understanding | Dense embedding | -| Interpretable results | SPLADE only | -| Maximum accuracy | SFR-Embedding-Code + SPLADE fusion | -""") - - report_text = "\n".join(report) - - if output_path: - with open(output_path, 'w', encoding='utf-8') as f: - f.write(report_text) - print(f"Report saved to: {output_path}") - - return report_text - - -# ============================================================================= -# MAIN -# ============================================================================= - -def main(): - print("=" * 80) - print("CODE RETRIEVAL BENCHMARK EVALUATION") - print("=" * 80) - - # Create test datasets - print("\nCreating test datasets...") - datasets = create_test_datasets() - print(f" Created {len(datasets)} datasets") - - local_results = {} - - # Evaluate SPLADE - print("\nEvaluating SPLADE...") - try: - from codexlens.semantic.splade_encoder import check_splade_available - ok, err = check_splade_available() - if ok: - start = time.perf_counter() - splade_encode = get_splade_encoder() - splade_results = evaluate_model("SPLADE", splade_encode, datasets) - elapsed = time.perf_counter() - start - local_results["SPLADE"] = splade_results - print(f" SPLADE evaluated in {elapsed:.2f}s") - print(f" Average NDCG@10: {splade_results['Average']['ndcg@10']:.2f}") - else: - print(f" SPLADE not 
available: {err}") - except Exception as e: - print(f" SPLADE evaluation failed: {e}") - - # Evaluate Dense (MiniLM) - print("\nEvaluating Dense (all-MiniLM-L6-v2)...") - try: - start = time.perf_counter() - dense_encode = get_dense_encoder("all-MiniLM-L6-v2") - dense_results = evaluate_model("Dense (MiniLM)", dense_encode, datasets) - elapsed = time.perf_counter() - start - local_results["Dense (MiniLM)"] = dense_results - print(f" Dense evaluated in {elapsed:.2f}s") - print(f" Average NDCG@10: {dense_results['Average']['ndcg@10']:.2f}") - except Exception as e: - print(f" Dense evaluation failed: {e}") - - # Generate report - print("\nGenerating report...") - report = generate_report(local_results, "benchmark_report.md") - - print("\n" + "=" * 80) - print("BENCHMARK COMPLETE") - print("=" * 80) - print("\nReport preview:\n") - print(report[:3000] + "\n...[truncated]...") - - return local_results - - -if __name__ == "__main__": - main() diff --git a/codex-lens/debug_semantic_search.py b/codex-lens/debug_semantic_search.py deleted file mode 100644 index 57febe31..00000000 --- a/codex-lens/debug_semantic_search.py +++ /dev/null @@ -1,318 +0,0 @@ -#!/usr/bin/env python -"""Debug script to trace semantic search (dense_rerank) flow step by step.""" - -import json -import logging -import sqlite3 -import sys -from pathlib import Path -from typing import Any, Dict, List, Tuple - -# Add src to path -sys.path.insert(0, str(Path(__file__).parent / "src")) - -# Configure detailed logging -logging.basicConfig( - level=logging.DEBUG, - format="%(asctime)s | %(levelname)-5s | %(name)s | %(message)s", - datefmt="%H:%M:%S", -) - -# Enable debug for specific modules -for name in ["codexlens.search", "codexlens.semantic", "codexlens.indexing"]: - logging.getLogger(name).setLevel(logging.DEBUG) - -logger = logging.getLogger("debug_semantic") - - -def load_config() -> Dict[str, Any]: - """Load config from codexlens settings.""" - config_path = Path.home() / ".codexlens" / 
"config.json" - if config_path.exists(): - with open(config_path) as f: - return json.load(f) - return {} - - -def inspect_hnsw_index(index_root: Path) -> Dict[str, Any]: - """Inspect centralized HNSW index metadata.""" - hnsw_path = index_root / "_vectors.hnsw" - meta_path = index_root / "_vectors_meta.db" - - result = { - "hnsw_exists": hnsw_path.exists(), - "meta_exists": meta_path.exists(), - "hnsw_size_mb": round(hnsw_path.stat().st_size / (1024*1024), 2) if hnsw_path.exists() else 0, - } - - if meta_path.exists(): - conn = sqlite3.connect(str(meta_path)) - cursor = conn.execute("SELECT COUNT(*) FROM chunk_metadata") - result["total_chunks"] = cursor.fetchone()[0] - - # Sample file paths - cursor = conn.execute(""" - SELECT DISTINCT file_path FROM chunk_metadata - ORDER BY file_path LIMIT 20 - """) - result["sample_files"] = [row[0] for row in cursor.fetchall()] - - # Check if tests vs src - cursor = conn.execute(""" - SELECT - CASE - WHEN file_path LIKE '%tests%' OR file_path LIKE '%test_%' THEN 'test' - ELSE 'src' - END as category, - COUNT(*) as count - FROM chunk_metadata - GROUP BY category - """) - result["category_distribution"] = {row[0]: row[1] for row in cursor.fetchall()} - - conn.close() - - return result - - -def run_dense_search(query: str, index_root: Path, top_k: int = 50) -> List[Tuple[int, float, str]]: - """Execute dense vector search and return candidates with details.""" - from codexlens.semantic.ann_index import ANNIndex - from codexlens.semantic.factory import get_embedder - from codexlens.semantic.vector_store import VectorStore - - logger.info("=" * 60) - logger.info("STAGE 1: Dense Embedding Generation") - logger.info("=" * 60) - - # Read model config from index - index_db = index_root / "_index.db" - embedding_model = "qwen3-embedding-sf" - embedding_backend = "litellm" - - if index_db.exists(): - try: - with VectorStore(index_db) as vs: - model_config = vs.get_model_config() - if model_config: - embedding_backend = 
model_config.get("backend", embedding_backend) - embedding_model = model_config.get("model_name", embedding_model) - logger.info(f"Model config from index: {embedding_backend}/{embedding_model}") - except Exception as e: - logger.warning(f"Failed to read model config: {e}") - - # Generate query embedding - embedder = get_embedder(backend=embedding_backend, model=embedding_model) - query_embedding = embedder.embed_to_numpy([query])[0] - logger.info(f"Query: {query!r}") - logger.info(f"Query embedding dim: {query_embedding.shape[0]}") - logger.info(f"Query embedding norm: {(query_embedding**2).sum()**0.5:.4f}") - - # Load HNSW index - logger.info("=" * 60) - logger.info("STAGE 2: HNSW Vector Search (Coarse)") - logger.info("=" * 60) - - ann_index = ANNIndex.create_central( - index_root=index_root, - dim=query_embedding.shape[0], - ) - if not ann_index.load(): - logger.error("Failed to load HNSW index") - return [] - - logger.info(f"HNSW index count: {ann_index.count()}") - - # Execute search - ids, distances = ann_index.search(query_embedding, top_k=top_k) - logger.info(f"Found {len(ids)} candidates") - - # Get chunk details - candidates = [] - meta_path = index_root / "_vectors_meta.db" - if meta_path.exists(): - conn = sqlite3.connect(str(meta_path)) - conn.row_factory = sqlite3.Row - - for chunk_id, distance in zip(ids, distances): - cursor = conn.execute(""" - SELECT file_path, content, start_line, end_line - FROM chunk_metadata WHERE chunk_id = ? 
- """, (int(chunk_id),)) - row = cursor.fetchone() - if row: - candidates.append(( - int(chunk_id), - float(distance), - row["file_path"], - row["content"][:200] if row["content"] else "", - row["start_line"], - row["end_line"], - )) - conn.close() - - # Print top candidates - logger.info("\nTop 20 Dense Search Candidates:") - logger.info("-" * 80) - for i, (cid, dist, path, content, start, end) in enumerate(candidates[:20]): - score = max(0, 1 - dist) - is_test = "tests/" in path or "test_" in Path(path).name - marker = "[TEST]" if is_test else "[SRC]" - logger.info(f"{i+1:2d}. {marker} dist={dist:.4f} score={score:.4f}") - logger.info(f" {path}:{start}-{end}") - logger.info(f" {content[:100]}...") - logger.info("") - - return candidates - - -def run_reranking(query: str, candidates: List[Tuple], top_k: int = 10) -> List[Tuple[str, float, float]]: - """Execute cross-encoder reranking on candidates.""" - from codexlens.semantic.reranker import get_reranker, check_reranker_available - - logger.info("=" * 60) - logger.info("STAGE 3: Cross-Encoder Reranking") - logger.info("=" * 60) - - # Check reranker availability - config = load_config() - backend = config.get("reranker_backend", "api") - model = config.get("reranker_model", "Qwen/Qwen3-Reranker-8B") - - logger.info(f"Reranker backend: {backend}") - logger.info(f"Reranker model: {model}") - - ok, err = check_reranker_available(backend) - if not ok: - logger.error(f"Reranker not available: {err}") - return [] - - reranker = get_reranker(backend=backend, model_name=model) - - # Prepare pairs for reranking - pairs = [] - for cid, dist, path, content, start, end in candidates[:50]: # Top 50 for reranking - doc_text = content if content else path - pairs.append((query, doc_text)) - - logger.info(f"Reranking {len(pairs)} candidates...") - - # Execute reranking - scores = reranker.score_pairs(pairs, batch_size=32) - - # Combine scores - results = [] - for i, (cid, dist, path, content, start, end) in 
enumerate(candidates[:len(scores)]): - dense_score = max(0, 1 - dist) - rerank_score = scores[i] - combined = 0.5 * dense_score + 0.5 * rerank_score - is_test = "tests/" in path or "test_" in Path(path).name - results.append((path, dense_score, rerank_score, combined, is_test, content[:100])) - - # Sort by combined score - results.sort(key=lambda x: x[3], reverse=True) - - logger.info("\nTop 20 Reranked Results:") - logger.info("-" * 100) - logger.info(f"{'Rank':>4} {'Type':^6} {'Dense':^8} {'Rerank':^8} {'Combined':^8} Path") - logger.info("-" * 100) - for i, (path, dense, rerank, combined, is_test, content) in enumerate(results[:20]): - marker = "TEST" if is_test else "SRC" - logger.info(f"{i+1:4d} [{marker:^4}] {dense:8.4f} {rerank:8.4f} {combined:8.4f} {path}") - - return results[:top_k] - - -def analyze_problem(candidates: List[Tuple], results: List[Tuple]): - """Analyze why tests might rank higher than src files.""" - logger.info("=" * 60) - logger.info("ANALYSIS: Why Tests Rank Higher?") - logger.info("=" * 60) - - # Count test vs src in dense candidates - test_in_dense = sum(1 for c in candidates[:50] if "tests/" in c[2] or "test_" in Path(c[2]).name) - src_in_dense = 50 - test_in_dense - - logger.info(f"\nDense Search (top 50):") - logger.info(f" - Test files: {test_in_dense} ({test_in_dense*2}%)") - logger.info(f" - Src files: {src_in_dense} ({src_in_dense*2}%)") - - # Average scores by category - test_dense_scores = [max(0, 1-c[1]) for c in candidates[:50] if "tests/" in c[2] or "test_" in Path(c[2]).name] - src_dense_scores = [max(0, 1-c[1]) for c in candidates[:50] if not ("tests/" in c[2] or "test_" in Path(c[2]).name)] - - if test_dense_scores: - logger.info(f"\nDense Score Averages:") - logger.info(f" - Test files: {sum(test_dense_scores)/len(test_dense_scores):.4f}") - if src_dense_scores: - logger.info(f" - Src files: {sum(src_dense_scores)/len(src_dense_scores):.4f}") - - # Check rerank score distribution - test_results = [r for r in results if 
r[4]] - src_results = [r for r in results if not r[4]] - - if test_results and src_results: - logger.info(f"\nRerank Score Averages:") - logger.info(f" - Test files: {sum(r[2] for r in test_results)/len(test_results):.4f}") - logger.info(f" - Src files: {sum(r[2] for r in src_results)/len(src_results):.4f}") - - logger.info("\n" + "=" * 60) - logger.info("HYPOTHESIS:") - logger.info("=" * 60) - - if test_in_dense > src_in_dense: - logger.info("→ Problem is at DENSE SEARCH stage") - logger.info(" Test files have embeddings closer to query") - logger.info(" Possible causes:") - logger.info(" 1. Test files mention implementation concepts in comments/docstrings") - logger.info(" 2. Embedding model doesn't distinguish between tests and implementation") - logger.info(" 3. Test file chunks are more frequent in the index") - else: - logger.info("→ Problem may be at RERANKING stage") - logger.info(" Reranker gives higher scores to test content") - - -def main(): - query = "文件索引和嵌入向量生成的实现逻辑" - index_root = Path(r"C:\Users\dyw\.codexlens\indexes\D\Claude_dms3") - - logger.info("=" * 60) - logger.info("DEBUG: Semantic Search Analysis") - logger.info("=" * 60) - logger.info(f"Query: {query}") - logger.info(f"Index root: {index_root}") - logger.info("") - - # Step 1: Inspect index - logger.info("STEP 0: Index Inspection") - logger.info("-" * 60) - index_info = inspect_hnsw_index(index_root) - for k, v in index_info.items(): - if k == "sample_files": - logger.info(f" {k}:") - for f in v[:10]: - logger.info(f" - {f}") - elif k == "category_distribution": - logger.info(f" {k}:") - for cat, count in v.items(): - logger.info(f" - {cat}: {count}") - else: - logger.info(f" {k}: {v}") - logger.info("") - - # Step 2: Dense search - candidates = run_dense_search(query, index_root, top_k=100) - - if not candidates: - logger.error("No candidates from dense search") - return - - # Step 3: Reranking - results = run_reranking(query, candidates, top_k=20) - - # Step 4: Analyze - 
analyze_problem(candidates, results) - - -if __name__ == "__main__": - main() diff --git a/codex-lens/debug_semantic_v2.py b/codex-lens/debug_semantic_v2.py deleted file mode 100644 index 3c335272..00000000 --- a/codex-lens/debug_semantic_v2.py +++ /dev/null @@ -1,276 +0,0 @@ -#!/usr/bin/env python -"""Debug script v2: Trace the full semantic search flow with detailed logging.""" - -import json -import logging -import sqlite3 -import sys -from collections import defaultdict -from pathlib import Path -from typing import Any, Dict, List, Tuple - -# Add src to path -sys.path.insert(0, str(Path(__file__).parent / "src")) - -# Configure logging -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s | %(levelname)-5s | %(message)s", - datefmt="%H:%M:%S", -) -logger = logging.getLogger("debug") - - -def count_chunks_by_category(index_root: Path) -> Dict[str, int]: - """Count chunks by category (src vs test) across all indexes.""" - counts = defaultdict(int) - - for db_path in index_root.rglob("_index.db"): - try: - conn = sqlite3.connect(str(db_path)) - cursor = conn.execute(""" - SELECT file_path FROM semantic_chunks - """) - for row in cursor: - path = row[0] - if "tests" in path or "test_" in Path(path).name: - counts["test"] += 1 - else: - counts["src"] += 1 - conn.close() - except: - pass - - return dict(counts) - - -def run_dense_search_with_trace(query: str, source_path: Path) -> List[Dict]: - """Run dense search with detailed tracing.""" - from codexlens.config import Config - from codexlens.search.chain_search import ChainSearchEngine, SearchOptions - from codexlens.storage.registry import Registry - from codexlens.storage.path_mapper import PathMapper - - # Load config - config = Config.load() - registry = Registry(config.data_dir) - mapper = PathMapper(config.data_dir) - - # Create search engine with verbose logging - engine = ChainSearchEngine(registry, mapper, config=config) - engine.logger.setLevel(logging.DEBUG) - - # Set up handler to capture 
all log output - handler = logging.StreamHandler() - handler.setLevel(logging.DEBUG) - engine.logger.addHandler(handler) - - # Execute cascade search with dense_rerank strategy - options = SearchOptions(depth=-1) # Search all subdirectories - - logger.info("=" * 70) - logger.info("Executing dense_rerank cascade search...") - logger.info(f"Query: {query}") - logger.info(f"Source: {source_path}") - logger.info("=" * 70) - - result = engine.cascade_search( - query=query, - source_path=source_path, - k=20, - coarse_k=100, - options=options, - strategy="dense_rerank" - ) - - # Analyze results - logger.info("\n" + "=" * 70) - logger.info("SEARCH RESULTS ANALYSIS") - logger.info("=" * 70) - - test_count = 0 - src_count = 0 - results_detail = [] - - for i, r in enumerate(result.results): - is_test = "tests" in r.path or "test_" in Path(r.path).name - if is_test: - test_count += 1 - category = "TEST" - else: - src_count += 1 - category = "SRC" - - # Get metadata scores if available - pre_ce_score = r.metadata.get("pre_cross_encoder_score", r.score) - ce_score = r.metadata.get("cross_encoder_score", 0) - ce_prob = r.metadata.get("cross_encoder_prob", 0) - - results_detail.append({ - "rank": i + 1, - "category": category, - "path": r.path, - "score": r.score, - "pre_ce_score": pre_ce_score, - "ce_score": ce_score, - "ce_prob": ce_prob, - "excerpt": r.excerpt[:100] if r.excerpt else "", - }) - - logger.info(f"{i+1:2d}. 
[{category:4s}] score={r.score:.4f} pre_ce={pre_ce_score:.4f} ce={ce_score:.4f}") - logger.info(f" {r.path}") - if r.excerpt: - logger.info(f" {r.excerpt[:80]}...") - logger.info("") - - logger.info(f"\nSummary: {src_count} SRC files, {test_count} TEST files in top {len(result.results)}") - logger.info(f"Search time: {result.stats.time_ms:.2f}ms") - - return results_detail - - -def compare_coarse_candidates(): - """Compare coarse candidates before and after reranking.""" - from codexlens.config import Config - from codexlens.semantic.factory import get_embedder - from codexlens.semantic.ann_index import ANNIndex - - query = "文件索引和嵌入向量生成的实现逻辑" - config = Config.load() - - # Generate query embedding - embedder = get_embedder(backend="litellm", model="qwen3-embedding-sf") - query_embedding = embedder.embed_to_numpy([query])[0] - - logger.info("=" * 70) - logger.info("COARSE CANDIDATE ANALYSIS (per directory)") - logger.info("=" * 70) - - # Scan all HNSW indexes - index_root = Path(r"C:\Users\dyw\.codexlens\indexes\D\Claude_dms3\codex-lens") - - all_candidates = [] - - for hnsw_path in index_root.rglob("_index_vectors.hnsw"): - db_path = hnsw_path.parent / "_index.db" - if not db_path.exists(): - continue - - try: - ann_index = ANNIndex(db_path, dim=query_embedding.shape[0]) - if not ann_index.load() or ann_index.count() == 0: - continue - - ids, distances = ann_index.search(query_embedding, top_k=10) - - # Get file paths from chunks - conn = sqlite3.connect(str(db_path)) - conn.row_factory = sqlite3.Row - - dir_name = hnsw_path.parent.relative_to(index_root) - - for chunk_id, dist in zip(ids, distances): - cursor = conn.execute(""" - SELECT file_path, content FROM semantic_chunks WHERE id = ? 
- """, (int(chunk_id),)) - row = cursor.fetchone() - if row: - is_test = "tests" in row["file_path"] or "test_" in Path(row["file_path"]).name - all_candidates.append({ - "dir": str(dir_name), - "chunk_id": int(chunk_id), - "distance": float(dist), - "score": max(0, 1 - float(dist)), - "is_test": is_test, - "file_path": row["file_path"], - "content_preview": row["content"][:100] if row["content"] else "" - }) - conn.close() - - except Exception as e: - logger.warning(f"Error processing {hnsw_path}: {e}") - - # Sort by distance (closest first) - all_candidates.sort(key=lambda x: x["distance"]) - - logger.info(f"\nTotal coarse candidates across all directories: {len(all_candidates)}") - - # Analyze distribution - test_candidates = [c for c in all_candidates if c["is_test"]] - src_candidates = [c for c in all_candidates if not c["is_test"]] - - logger.info(f"Test files: {len(test_candidates)}") - logger.info(f"Src files: {len(src_candidates)}") - - if test_candidates: - avg_test_dist = sum(c["distance"] for c in test_candidates) / len(test_candidates) - logger.info(f"Avg test distance: {avg_test_dist:.4f}") - if src_candidates: - avg_src_dist = sum(c["distance"] for c in src_candidates) / len(src_candidates) - logger.info(f"Avg src distance: {avg_src_dist:.4f}") - - logger.info("\nTop 30 candidates (combined from all directories):") - logger.info("-" * 90) - for i, c in enumerate(all_candidates[:30]): - cat = "TEST" if c["is_test"] else "SRC" - logger.info(f"{i+1:2d}. 
[{cat:4s}] dist={c['distance']:.4f} score={c['score']:.4f} dir={c['dir']}") - logger.info(f" {Path(c['file_path']).name}") - - return all_candidates - - -def main(): - logger.info("=" * 70) - logger.info("SEMANTIC SEARCH DEBUG SESSION") - logger.info("=" * 70) - - # Step 1: Count chunks distribution - index_root = Path(r"C:\Users\dyw\.codexlens\indexes\D\Claude_dms3\codex-lens") - counts = count_chunks_by_category(index_root) - logger.info(f"\nChunk distribution in index:") - logger.info(f" - Test chunks: {counts.get('test', 0)}") - logger.info(f" - Src chunks: {counts.get('src', 0)}") - - # Step 2: Compare coarse candidates - logger.info("\n") - candidates = compare_coarse_candidates() - - # Step 3: Run full search - logger.info("\n") - query = "文件索引和嵌入向量生成的实现逻辑" - source_path = Path(r"D:\Claude_dms3\codex-lens") - results = run_dense_search_with_trace(query, source_path) - - # Summary - logger.info("\n" + "=" * 70) - logger.info("ROOT CAUSE ANALYSIS") - logger.info("=" * 70) - - test_in_top10 = sum(1 for r in results[:10] if r["category"] == "TEST") - src_in_top10 = 10 - test_in_top10 - - logger.info(f"\nTop 10 results: {src_in_top10} SRC, {test_in_top10} TEST") - - if test_in_top10 > src_in_top10: - logger.info("\nPROBLEM: Test files dominate top results") - logger.info("\nPossible causes:") - logger.info(" 1. Test files mention implementation concepts explicitly") - logger.info(" (e.g., docstrings describe what they test)") - logger.info(" 2. Embedding model treats test descriptions as similar to") - logger.info(" implementation descriptions") - logger.info(" 3. 
Cross-encoder reranker gives higher scores to") - logger.info(" descriptive test content over implementation code") - - # Check if coarse candidates already favor tests - test_in_coarse_top30 = sum(1 for c in candidates[:30] if c["is_test"]) - if test_in_coarse_top30 > 15: - logger.info(f"\n → Dense coarse search already favors tests") - logger.info(f" ({test_in_coarse_top30}/30 test files in coarse top-30)") - logger.info(f" Problem is at EMBEDDING/DENSE SEARCH stage") - else: - logger.info(f"\n → Coarse search is balanced ({test_in_coarse_top30}/30 tests)") - logger.info(f" Problem is at CROSS-ENCODER RERANKING stage") - - -if __name__ == "__main__": - main() diff --git a/codex-lens/dist/codex_lens-0.1.0-py3-none-any.whl b/codex-lens/dist/codex_lens-0.1.0-py3-none-any.whl deleted file mode 100644 index 5b8af4bb..00000000 Binary files a/codex-lens/dist/codex_lens-0.1.0-py3-none-any.whl and /dev/null differ diff --git a/codex-lens/dist/codex_lens-0.1.0.tar.gz b/codex-lens/dist/codex_lens-0.1.0.tar.gz deleted file mode 100644 index 3bfc9729..00000000 Binary files a/codex-lens/dist/codex_lens-0.1.0.tar.gz and /dev/null differ diff --git a/codex-lens/docs/CHAIN_SEARCH_QUICKREF.md b/codex-lens/docs/CHAIN_SEARCH_QUICKREF.md deleted file mode 100644 index dc16e192..00000000 --- a/codex-lens/docs/CHAIN_SEARCH_QUICKREF.md +++ /dev/null @@ -1,171 +0,0 @@ -# Chain Search Quick Reference - -## Import - -```python -from pathlib import Path -from codexlens.search import ( - ChainSearchEngine, - SearchOptions, - quick_search -) -from codexlens.storage.registry import RegistryStore -from codexlens.storage.path_mapper import PathMapper -``` - -## One-Line Search - -```python -results = quick_search("query", Path("/path/to/search"), depth=-1) -``` - -## Full Engine Usage - -### 1. Initialize Engine -```python -registry = RegistryStore() -registry.initialize() -mapper = PathMapper() -engine = ChainSearchEngine(registry, mapper) -``` - -### 2. 
Configure Search -```python -options = SearchOptions( - depth=-1, # -1 = unlimited, 0 = current dir only - max_workers=8, # Parallel threads - limit_per_dir=10, # Max results per directory - total_limit=100, # Total result limit - include_symbols=False, # Include symbol search - files_only=False # Return only paths -) -``` - -### 3. Execute Search -```python -result = engine.search("query", Path("/path"), options) - -# Access results -for r in result.results: - print(f"{r.path}: score={r.score:.2f}") - print(f" {r.excerpt}") - -# Check statistics -print(f"Searched {result.stats.dirs_searched} directories") -print(f"Found {result.stats.files_matched} files") -print(f"Time: {result.stats.time_ms:.2f}ms") -``` - -### 4. Symbol Search -```python -symbols = engine.search_symbols( - "function_name", - Path("/path"), - kind="function" # Optional: 'function', 'class', 'method', etc. -) - -for sym in symbols: - print(f"{sym.name} ({sym.kind}) at lines {sym.range[0]}-{sym.range[1]}") -``` - -### 5. 
Files-Only Mode -```python -paths = engine.search_files_only("query", Path("/path")) -for path in paths: - print(path) -``` - -## SearchOptions Parameters - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `depth` | int | -1 | Search depth (-1 = unlimited) | -| `max_workers` | int | 8 | Parallel worker threads | -| `limit_per_dir` | int | 10 | Max results per directory | -| `total_limit` | int | 100 | Total result limit | -| `include_symbols` | bool | False | Include symbol search | -| `files_only` | bool | False | Return only file paths | - -## SearchResult Fields - -| Field | Type | Description | -|-------|------|-------------| -| `path` | str | File path | -| `score` | float | BM25 relevance score | -| `excerpt` | str | Highlighted text snippet | -| `content` | str | Full matched content (optional) | -| `symbol` | Symbol | Matched symbol (optional) | - -## SearchStats Fields - -| Field | Type | Description | -|-------|------|-------------| -| `dirs_searched` | int | Number of directories searched | -| `files_matched` | int | Number of files with matches | -| `time_ms` | float | Total search time (milliseconds) | -| `errors` | List[str] | Error messages | - -## Common Patterns - -### Search Current Project -```python -result = engine.search("authentication", Path.cwd()) -``` - -### Limit Depth for Speed -```python -options = SearchOptions(depth=2) # Only 2 levels deep -result = engine.search("TODO", Path("/project"), options) -``` - -### Find All Implementations -```python -symbols = engine.search_symbols("__init__", Path("/project"), kind="function") -``` - -### Quick File List -```python -files = engine.search_files_only("config", Path("/project")) -``` - -### Comprehensive Search -```python -options = SearchOptions( - depth=-1, - total_limit=500, - include_symbols=True -) -result = engine.search("api", Path("/project"), options) -print(f"Files: {len(result.results)}") -print(f"Symbols: {len(result.symbols)}") -``` 
- -## Performance Tips - -1. **Use depth limits** for faster searches in large codebases -2. **Use files_only** when you don't need excerpts -3. **Reuse ChainSearchEngine** instance for multiple searches -4. **Adjust max_workers** based on CPU cores -5. **Use limit_per_dir** to reduce memory usage - -## Error Handling - -```python -result = engine.search("query", Path("/path")) - -if result.stats.errors: - print("Errors occurred:") - for error in result.stats.errors: - print(f" - {error}") - -if not result.results: - print("No results found") -else: - print(f"Found {len(result.results)} results") -``` - -## Cleanup - -```python -registry.close() # Close when done -``` diff --git a/codex-lens/docs/CODEXLENS_LSP_API_SPEC.md b/codex-lens/docs/CODEXLENS_LSP_API_SPEC.md deleted file mode 100644 index fc2be840..00000000 --- a/codex-lens/docs/CODEXLENS_LSP_API_SPEC.md +++ /dev/null @@ -1,676 +0,0 @@ -# Codexlens LSP API 规范 - -**版本**: 1.1 -**状态**: ✅ APPROVED (Gemini Review) -**架构**: codexlens 提供 Python API,CCW 实现 MCP 端点 -**分析来源**: Gemini (架构评审) + Codex (实现评审) -**最后更新**: 2025-01-17 - ---- - -## 一、概述 - -### 1.1 背景 - -基于 cclsp MCP 服务器实现的分析,设计 codexlens 的 LSP 搜索方法接口,为 AI 提供代码智能能力。 - -### 1.2 架构决策 - -**MCP 端点由 CCW 实现,codexlens 只提供 Python API** - -``` -┌─────────────────────────────────────────────────────────────┐ -│ Claude Code │ -│ ┌───────────────────────────────────────────────────────┐ │ -│ │ MCP Client │ │ -│ └───────────────────────────────────────────────────────┘ │ -│ │ │ -│ ▼ │ -│ ┌───────────────────────────────────────────────────────┐ │ -│ │ CCW MCP Server │ │ -│ │ ┌─────────────────────────────────────────────────┐ │ │ -│ │ │ MCP Tool Handlers │ │ │ -│ │ │ • codexlens_file_context │ │ │ -│ │ │ • codexlens_find_definition │ │ │ -│ │ │ • codexlens_find_references │ │ │ -│ │ │ • codexlens_semantic_search │ │ │ -│ │ └──────────────────────┬──────────────────────────┘ │ │ -│ └─────────────────────────┼─────────────────────────────┘ │ 
-└────────────────────────────┼────────────────────────────────┘ - │ Python API 调用 - ▼ -┌─────────────────────────────────────────────────────────────┐ -│ codexlens │ -│ ┌───────────────────────────────────────────────────────┐ │ -│ │ Public API Layer │ │ -│ │ codexlens.api.file_context() │ │ -│ │ codexlens.api.find_definition() │ │ -│ │ codexlens.api.find_references() │ │ -│ │ codexlens.api.semantic_search() │ │ -│ └──────────────────────┬────────────────────────────────┘ │ -│ │ │ -│ ┌──────────────────────▼────────────────────────────────┐ │ -│ │ Core Components │ │ -│ │ GlobalSymbolIndex | ChainSearchEngine | HoverProvider │ │ -│ └───────────────────────────────────────────────────────┘ │ -│ │ │ -│ ┌──────────────────────▼────────────────────────────────┐ │ -│ │ SQLite Index Databases │ │ -│ │ global_symbols.db | *.index.db (per-directory) │ │ -│ └───────────────────────────────────────────────────────┘ │ -└─────────────────────────────────────────────────────────────┘ -``` - -### 1.3 职责分离 - -| 组件 | 职责 | -|------|------| -| **codexlens** | Python API、索引查询、搜索算法、结果聚合、降级处理 | -| **CCW** | MCP 协议、参数校验、结果序列化、错误处理、project_root 推断 | - -### 1.4 codexlens vs cclsp 对比 - -| 特性 | cclsp | codexlens | -|------|-------|-----------| -| 数据源 | 实时 LSP 服务器 | 预建 SQLite 索引 | -| 启动时间 | 200-3000ms | <50ms | -| 响应时间 | 50-500ms | <5ms | -| 跨语言 | 每语言需要 LSP 服务器 | 统一 Python/TS/JS/Go 索引 | -| 依赖 | 需要语言服务器 | 无外部依赖 | -| 准确度 | 100% (编译器级) | 95%+ (tree-sitter) | -| 重命名支持 | 是 | 否 (只读索引) | -| 实时诊断 | 是 | 通过 IDE MCP | - -**推荐**: codexlens 用于快速搜索,cclsp 用于精确重构 - ---- - -## 二、cclsp 设计模式 (参考) - -### 2.1 MCP 工具接口设计 - -| 模式 | 说明 | 代码位置 | -|------|------|----------| -| **基于名称** | 接受 `symbol_name` 而非文件坐标 | `index.ts:70` | -| **安全消歧义** | `rename_symbol` → `rename_symbol_strict` 两步 | `index.ts:133, 172` | -| **复杂性抽象** | 隐藏 LSP 协议细节 | `index.ts:211` | -| **优雅失败** | 返回有用的文本响应 | 全局 | - -### 2.2 符号解析算法 - -``` -1. getDocumentSymbols (lsp-client.ts:1406) - └─ 获取文件所有符号 - -2. 
处理两种格式: - ├─ DocumentSymbol[] → 扁平化 - └─ SymbolInformation[] → 二次定位 - -3. 过滤: symbol.name === symbolName && symbol.kind - -4. 回退: 无结果时移除 kind 约束重试 - -5. 聚合: 遍历所有匹配,聚合定义位置 -``` - ---- - -## 三、需求规格 - -### 需求 1: 文件上下文查询 (`file_context`) - -**用途**: 读取代码文件,返回文件中所有方法的调用关系摘要 - -**输出示例**: -```markdown -## src/auth/login.py (3 methods) - -### login_user (line 15-45) -- Calls: validate_password (auth/utils.py:23), create_session (session/manager.py:89) -- Called by: handle_login_request (api/routes.py:156), test_login (tests/test_auth.py:34) - -### validate_token (line 47-62) -- Calls: decode_jwt (auth/jwt.py:12) -- Called by: auth_middleware (middleware/auth.py:28) -``` - -### 需求 2: 通用 LSP 搜索 (cclsp 兼容) - -| 端点 | 用途 | -|------|------| -| `find_definition` | 根据符号名查找定义位置 | -| `find_references` | 查找符号的所有引用 | -| `workspace_symbols` | 工作区符号搜索 | -| `get_hover` | 获取符号悬停信息 | - -### 需求 3: 向量 + LSP 融合搜索 - -**用途**: 结合向量语义搜索和结构化 LSP 搜索 - -**融合策略**: -- **RRF** (首选): 简单、不需要分数归一化、鲁棒 -- **Cascade**: 特定场景,先向量后 LSP -- **Adaptive**: 长期目标,按查询类型自动选择 - ---- - -## 四、API 规范 - -### 4.1 模块结构 - -``` -src/codexlens/ -├─ api/ [新增] 公开 API 层 -│ ├─ __init__.py 导出所有 API -│ ├─ file_context.py 文件上下文 -│ ├─ definition.py 定义查找 -│ ├─ references.py 引用查找 -│ ├─ symbols.py 符号搜索 -│ ├─ hover.py 悬停信息 -│ └─ semantic.py 语义搜索 -│ -├─ storage/ -│ ├─ global_index.py [扩展] get_file_symbols() -│ └─ relationship_query.py [新增] 有向调用查询 -│ -└─ search/ - └─ chain_search.py [修复] schema 兼容 -``` - -### 4.2 `codexlens.api.file_context()` - -```python -from dataclasses import dataclass, field -from typing import List, Optional, Dict, Tuple - -@dataclass -class CallInfo: - """调用关系信息""" - symbol_name: str - file_path: Optional[str] # 目标文件 (可能为 None) - line: int - relationship: str # call | import | inheritance - -@dataclass -class MethodContext: - """方法上下文""" - name: str - kind: str # function | method | class - line_range: Tuple[int, int] - signature: Optional[str] - calls: List[CallInfo] # 出向调用 - callers: List[CallInfo] # 入向调用 - 
-@dataclass -class FileContextResult: - """文件上下文结果""" - file_path: str - language: str - methods: List[MethodContext] - summary: str # 人类可读摘要 - discovery_status: Dict[str, bool] = field(default_factory=lambda: { - "outgoing_resolved": False, - "incoming_resolved": True, - "targets_resolved": False - }) - -def file_context( - project_root: str, - file_path: str, - include_calls: bool = True, - include_callers: bool = True, - max_depth: int = 1, - format: str = "brief" # brief | detailed | tree -) -> FileContextResult: - """ - 获取代码文件的方法调用上下文。 - - Args: - project_root: 项目根目录 (用于定位索引) - file_path: 代码文件路径 - include_calls: 是否包含出向调用 - include_callers: 是否包含入向调用 - max_depth: 调用链深度 (1=直接调用) - ⚠️ V1 限制: 当前版本仅支持 max_depth=1 - 深度调用链分析将在 V2 实现 - format: 输出格式 - - Returns: - FileContextResult - - Raises: - IndexNotFoundError: 项目未索引 - FileNotFoundError: 文件不存在 - - Note: - V1 实现限制: - - max_depth 仅支持 1 (直接调用) - - 出向调用目标文件可能为 None (未解析) - - 深度调用链分析作为 V2 特性规划 - """ -``` - -### 4.3 `codexlens.api.find_definition()` - -```python -@dataclass -class DefinitionResult: - """定义查找结果""" - name: str - kind: str - file_path: str - line: int - end_line: int - signature: Optional[str] - container: Optional[str] # 所属类/模块 - score: float - -def find_definition( - project_root: str, - symbol_name: str, - symbol_kind: Optional[str] = None, - file_context: Optional[str] = None, - limit: int = 10 -) -> List[DefinitionResult]: - """ - 根据符号名称查找定义位置。 - - Fallback 策略: - 1. 精确匹配 + kind 过滤 - 2. 精确匹配 (移除 kind) - 3. 
前缀匹配 - """ -``` - -### 4.4 `codexlens.api.find_references()` - -```python -@dataclass -class ReferenceResult: - """引用结果""" - file_path: str - line: int - column: int - context_line: str - relationship: str # call | import | type_annotation | inheritance - -@dataclass -class GroupedReferences: - """按定义分组的引用""" - definition: DefinitionResult - references: List[ReferenceResult] - -def find_references( - project_root: str, - symbol_name: str, - symbol_kind: Optional[str] = None, - include_definition: bool = True, - group_by_definition: bool = True, - limit: int = 100 -) -> List[GroupedReferences]: - """ - 查找符号的所有引用位置。 - - 多定义时分组返回,解决引用混淆问题。 - """ -``` - -### 4.5 `codexlens.api.workspace_symbols()` - -```python -@dataclass -class SymbolInfo: - """符号信息""" - name: str - kind: str - file_path: str - line: int - container: Optional[str] - score: float - -def workspace_symbols( - project_root: str, - query: str, - kind_filter: Optional[List[str]] = None, - file_pattern: Optional[str] = None, - limit: int = 50 -) -> List[SymbolInfo]: - """在整个工作区搜索符号 (前缀匹配)。""" -``` - -### 4.6 `codexlens.api.get_hover()` - -```python -@dataclass -class HoverInfo: - """悬停信息""" - name: str - kind: str - signature: str - documentation: Optional[str] - file_path: str - line_range: Tuple[int, int] - type_info: Optional[str] - -def get_hover( - project_root: str, - symbol_name: str, - file_path: Optional[str] = None -) -> Optional[HoverInfo]: - """获取符号的详细悬停信息。""" -``` - -### 4.7 `codexlens.api.semantic_search()` - -```python -@dataclass -class SemanticResult: - """语义搜索结果""" - symbol_name: str - kind: str - file_path: str - line: int - vector_score: Optional[float] - structural_score: Optional[float] - fusion_score: float - snippet: str - match_reason: Optional[str] - -def semantic_search( - project_root: str, - query: str, - mode: str = "fusion", # vector | structural | fusion - vector_weight: float = 0.5, - structural_weight: float = 0.3, - keyword_weight: float = 0.2, - fusion_strategy: str = 
"rrf", # rrf | staged | binary | hybrid - kind_filter: Optional[List[str]] = None, - limit: int = 20, - include_match_reason: bool = False -) -> List[SemanticResult]: - """ - 语义搜索 - 结合向量和结构化搜索。 - - Args: - project_root: 项目根目录 - query: 自然语言查询 - mode: 搜索模式 - - vector: 仅向量搜索 - - structural: 仅结构搜索 (符号 + 关系) - - fusion: 融合搜索 (默认) - vector_weight: 向量搜索权重 [0, 1] - structural_weight: 结构搜索权重 [0, 1] - keyword_weight: 关键词搜索权重 [0, 1] - fusion_strategy: 融合策略 (映射到 chain_search.py) - - rrf: Reciprocal Rank Fusion (推荐,默认) - - staged: 分阶段级联 → staged_cascade_search - - binary: 二分重排级联 → binary_rerank_cascade_search - - hybrid: 混合级联 → hybrid_search - kind_filter: 符号类型过滤 - limit: 最大返回数量 - include_match_reason: 是否生成匹配原因 (启发式,非 LLM) - - Returns: - 按 fusion_score 排序的结果列表 - - 降级行为: - - 无向量索引: vector_score=None, 使用 FTS + 结构搜索 - - 无关系数据: structural_score=None, 仅向量搜索 - """ -``` - ---- - -## 五、已知问题与解决方案 - -### 5.1 P0 阻塞项 - -| 问题 | 位置 | 解决方案 | -|------|------|----------| -| **索引 Schema 不匹配** | `chain_search.py:313-324` vs `dir_index.py:304-312` | 兼容 `full_path` 和 `path` | -| **文件符号查询缺失** | `global_index.py:214-260` | 新增 `get_file_symbols()` | -| **出向调用查询缺失** | `dir_index.py:333-342` | 新增 `RelationshipQuery` | -| **关系类型不一致** | `entities.py:74-79` | 规范化 `calls` → `call` | - -### 5.2 设计缺陷 (Gemini 发现) - -| 缺陷 | 影响 | 解决方案 | -|------|------|----------| -| **调用图不完整** | `file_context` 缺少出向调用 | 新增有向调用 API | -| **消歧义未定义** | 多定义时无法区分 | 实现 `rank_by_proximity()` | -| **AI 特性成本过高** | `explanation` 需要 LLM | 设为可选,默认关闭 | -| **融合参数不一致** | 3 分支但只有 2 权重 | 补充 `keyword_weight` | - -### 5.3 消歧义算法 - -**V1 实现** (基于文件路径接近度): - -```python -def rank_by_proximity( - results: List[DefinitionResult], - file_context: str -) -> List[DefinitionResult]: - """按文件接近度排序 (V1: 路径接近度)""" - def proximity_score(result): - # 1. 同目录最高分 - if os.path.dirname(result.file_path) == os.path.dirname(file_context): - return 100 - # 2. 
共同路径前缀长度 - common = os.path.commonpath([result.file_path, file_context]) - return len(common) - - return sorted(results, key=proximity_score, reverse=True) -``` - -**V2 增强计划** (基于 import graph 距离): - -```python -def rank_by_import_distance( - results: List[DefinitionResult], - file_context: str, - import_graph: Dict[str, Set[str]] -) -> List[DefinitionResult]: - """按 import graph 距离排序 (V2)""" - def import_distance(result): - # BFS 计算最短 import 路径 - return bfs_shortest_path( - import_graph, - file_context, - result.file_path - ) - - # 组合: 0.6 * import_distance + 0.4 * path_proximity - return sorted(results, key=lambda r: ( - 0.6 * import_distance(r) + - 0.4 * (100 - proximity_score(r)) - )) -``` - -### 5.4 参考实现: `get_file_symbols()` - -**位置**: `src/codexlens/storage/global_index.py` - -```python -def get_file_symbols(self, file_path: str | Path) -> List[Symbol]: - """ - 获取指定文件中定义的所有符号。 - - Args: - file_path: 文件路径 (相对或绝对) - - Returns: - 按行号排序的符号列表 - """ - file_path_str = str(Path(file_path).resolve()) - with self._lock: - conn = self._get_connection() - rows = conn.execute( - """ - SELECT symbol_name, symbol_kind, file_path, start_line, end_line - FROM global_symbols - WHERE project_id = ? AND file_path = ? 
- ORDER BY start_line - """, - (self.project_id, file_path_str), - ).fetchall() - - return [ - Symbol( - name=row["symbol_name"], - kind=row["symbol_kind"], - range=(row["start_line"], row["end_line"]), - file=row["file_path"], - ) - for row in rows - ] -``` - ---- - -## 六、实现计划 - -### Phase 0: 基础设施 (16h) - -| 任务 | 工时 | 说明 | -|------|------|------| -| 修复 `search_references` schema | 4h | 兼容两种 schema | -| 新增 `GlobalSymbolIndex.get_file_symbols()` | 4h | 文件符号查询 (见 5.4) | -| 新增 `RelationshipQuery` 类 | 6h | 有向调用查询 | -| 关系类型规范化层 | 2h | `calls` → `call` | - -### Phase 1: API 层 (48h) - -| 任务 | 工时 | 复杂度 | -|------|------|--------| -| `find_definition()` | 4h | S | -| `find_references()` | 8h | M | -| `workspace_symbols()` | 4h | S | -| `get_hover()` | 4h | S | -| `file_context()` | 16h | L | -| `semantic_search()` | 12h | M | - -### Phase 2: 测试与文档 (16h) - -| 任务 | 工时 | -|------|------| -| 单元测试 (≥80%) | 8h | -| API 文档 | 4h | -| 示例代码 | 4h | - -### 关键路径 - -``` -Phase 0.1 (schema fix) - ↓ -Phase 0.2 (file symbols) → Phase 1.5 (file_context) - ↓ -Phase 1 (其他 API) - ↓ -Phase 2 (测试) -``` - ---- - -## 七、测试策略 - -### 7.1 单元测试 - -```python -# test_global_index.py -def test_get_file_symbols(): - index = GlobalSymbolIndex(":memory:") - index.update_file_symbols(project_id=1, file_path="test.py", symbols=[...]) - results = index.get_file_symbols("test.py") - assert len(results) == 3 - -# test_relationship_query.py -def test_outgoing_calls(): - store = DirIndexStore(":memory:") - calls = store.get_outgoing_calls("src/auth.py", "login") - assert calls[0].relationship == "call" # 已规范化 -``` - -### 7.2 Schema 兼容性测试 - -```python -def test_search_references_both_schemas(): - """测试两种 schema 的引用搜索""" - # 旧 schema: files(path, ...) - # 新 schema: files(full_path, ...) 
-``` - -### 7.3 降级测试 - -```python -def test_semantic_search_without_vectors(): - results = semantic_search(project_root="/path/to/project", query="auth", mode="fusion") - assert results - assert all(r.vector_score is None for r in results) - assert all(r.fusion_score > 0 for r in results) -``` - ---- - -## 八、使用示例 - -```python -from codexlens.api import ( - file_context, - find_definition, - find_references, - semantic_search -) - -# 1. 获取文件上下文 -result = file_context( - project_root="/path/to/project", - file_path="src/auth/login.py", - format="brief" -) -print(result.summary) - -# 2. 查找定义 -definitions = find_definition( - project_root="/path/to/project", - symbol_name="UserService", - symbol_kind="class" -) - -# 3. 语义搜索 -results = semantic_search( - project_root="/path/to/project", - query="处理用户登录验证的函数", - mode="fusion" -) -``` - ---- - -## 九、CCW 集成 - -| codexlens API | CCW MCP Tool | -|---------------|--------------| -| `file_context()` | `codexlens_file_context` | -| `find_definition()` | `codexlens_find_definition` | -| `find_references()` | `codexlens_find_references` | -| `workspace_symbols()` | `codexlens_workspace_symbol` | -| `get_hover()` | `codexlens_get_hover` | -| `semantic_search()` | `codexlens_semantic_search` | - ---- - -## 十、分析来源 - -| 工具 | Session ID | 贡献 | -|------|------------|------| -| Gemini | `1768618654438-gemini` | 架构评审、设计缺陷、融合策略 | -| Codex | `1768618658183-codex` | 组件复用、复杂度估算、任务分解 | -| Gemini | `1768620615744-gemini` | 最终评审、改进建议、APPROVED | - ---- - -## 十一、版本历史 - -| 版本 | 日期 | 变更 | -|------|------|------| -| 1.0 | 2025-01-17 | 初始版本,合并多文档 | -| 1.1 | 2025-01-17 | 应用 Gemini 评审改进: V1 限制说明、策略映射、消歧义增强、参考实现 | diff --git a/codex-lens/docs/CODEX_LENS_AUTO_HYBRID.md b/codex-lens/docs/CODEX_LENS_AUTO_HYBRID.md deleted file mode 100644 index 5f95e22e..00000000 --- a/codex-lens/docs/CODEX_LENS_AUTO_HYBRID.md +++ /dev/null @@ -1,326 +0,0 @@ -# CodexLens Auto Hybrid Mode - Implementation Summary - -## 概述 - -实现了两个主要功能: -1. **自动向量嵌入生成**:`init` 命令在检测到语义搜索依赖后自动生成向量嵌入 -2. **默认混合搜索模式**:`search` 命令在检测到嵌入存在时自动使用 hybrid 模式 - -## 修改文件 - -### 1. 
codex-lens CLI (`codex-lens/src/codexlens/cli/commands.py`) - -#### 1.1 `init` 命令增强 - -**新增参数**: -- `--no-embeddings`: 跳过自动嵌入生成 -- `--embedding-model`: 指定嵌入模型 (默认: "code") - -**自动嵌入生成逻辑**: -```python -# 在 init 成功后 -if not no_embeddings: - from codexlens.semantic import SEMANTIC_AVAILABLE - if SEMANTIC_AVAILABLE: - # 自动调用 generate_embeddings() - # 使用指定的 embedding_model -``` - -**行为**: -- 检测 `fastembed` 和 `numpy` 是否安装 -- 如果可用,自动生成嵌入(可用 `--no-embeddings` 跳过) -- 默认使用 "code" 模型 (jinaai/jina-embeddings-v2-base-code) -- 在输出中显示嵌入生成进度和统计 - -#### 1.2 `search` 命令增强 - -**模式变更**: -- 默认模式从 `"exact"` 改为 `"auto"` -- 新增 `"auto"` 模式到有效模式列表 - -**自动模式检测逻辑**: -```python -if mode == "auto": - # 检查项目是否有嵌入 - project_record = registry.find_by_source_path(str(search_path)) - if project_record: - embed_status = check_embeddings_status(index_path) - if has_embeddings: - actual_mode = "hybrid" # 使用混合模式 - else: - actual_mode = "exact" # 降级到精确模式 -``` - -**行为**: -- 默认使用 `auto` 模式 -- 自动检测索引是否有嵌入 -- 有嵌入 → 使用 `hybrid` 模式(精确 + 模糊 + 向量融合) -- 无嵌入 → 使用 `exact` 模式(仅全文搜索) -- 用户仍可手动指定模式覆盖自动检测 - -### 2. 
MCP 工具简化 (`ccw/src/tools/codex-lens.ts`) - -#### 2.1 简化 action 枚举 - -**仅暴露核心操作**: -- `init`: 初始化索引(自动生成嵌入) -- `search`: 搜索代码(自动混合模式) -- `search_files`: 搜索文件路径 - -**移除的高级操作**(仍可通过 CLI 使用): -- ~~`symbol`~~: 符号提取 → 使用 `codexlens symbol` -- ~~`status`~~: 状态检查 → 使用 `codexlens status` -- ~~`config_show/set/migrate`~~: 配置管理 → 使用 `codexlens config` -- ~~`clean`~~: 清理索引 → 使用 `codexlens clean` -- ~~`bootstrap/check`~~: 安装管理 → 自动处理 - -**简化的 ParamsSchema**: -```typescript -const ParamsSchema = z.object({ - action: z.enum(['init', 'search', 'search_files']), - path: z.string().optional(), - query: z.string().optional(), - mode: z.enum(['auto', 'text', 'semantic', 'exact', 'fuzzy', 'hybrid', 'vector', 'pure-vector']).default('auto'), - languages: z.array(z.string()).optional(), - limit: z.number().default(20), -}); -``` - -#### 2.2 扩展 mode 枚举并设置默认值 - -**模式支持**: -```typescript -mode: z.enum(['auto', 'text', 'semantic', 'exact', 'fuzzy', 'hybrid', 'vector', 'pure-vector']).default('auto') -``` - -**模式映射**(MCP → CLI): -```typescript -const modeMap: Record<string, string> = { - 'text': 'exact', - 'semantic': 'pure-vector', - 'auto': 'auto', // 默认:自动检测 - 'exact': 'exact', - 'fuzzy': 'fuzzy', - 'hybrid': 'hybrid', - 'vector': 'vector', - 'pure-vector': 'pure-vector', -}; -``` - -#### 2.3 传递 mode 参数到 CLI - -```typescript -const args = ['search', query, '--limit', limit.toString(), '--mode', cliMode, '--json']; -``` - -### 3. 文档更新 (`.claude/rules/context-requirements.md`) - -#### 3.1 更新 init 说明 - -强调自动嵌入生成功能: -```markdown -**NEW**: `init` automatically generates vector embeddings if semantic dependencies are installed (fastembed). 
-- Auto-detects if `numpy` and `fastembed` are available -- Uses "code" model by default (jinaai/jina-embeddings-v2-base-code) -- Skip with `--no-embeddings` flag if needed -``` - -#### 3.2 更新 search 说明 - -强调自动混合模式: -```markdown -**Search Code** (Auto Hybrid Mode - DEFAULT): -# Simple call - auto-detects mode (hybrid if embeddings exist, exact otherwise): -codex_lens(action="search", query="authentication", path=".", limit=20) -``` - -#### 3.3 详细模式说明 - -添加完整的模式列表和默认行为说明: -- `auto`: **DEFAULT** - Uses hybrid if embeddings exist, exact otherwise -- `hybrid`: Exact + Fuzzy + Vector fusion (best results, auto-selected if embeddings exist) -- 其他模式... - -## 使用示例 - -### 场景 1:首次使用(已安装 fastembed) - -```bash -# 初始化索引(自动生成嵌入) -codexlens init . - -# 输出: -# OK Indexed 150 files in 12 directories -# -# Generating embeddings... -# Model: code -# ✓ Generated 1234 embeddings in 45.2s - -# 搜索(自动使用 hybrid 模式) -codexlens search "authentication" -# Mode: hybrid | Searched 12 directories in 15.2ms -``` - -### 场景 2:首次使用(未安装 fastembed) - -```bash -# 初始化索引(跳过嵌入) -codexlens init . - -# 输出: -# OK Indexed 150 files in 12 directories -# (无嵌入生成提示) - -# 搜索(降级到 exact 模式) -codexlens search "authentication" -# Mode: exact | Searched 12 directories in 8.5ms -``` - -### 场景 3:手动控制 - -```bash -# 跳过嵌入生成 -codexlens init . 
--no-embeddings - -# 强制使用特定模式 -codexlens search "auth" --mode exact -codexlens search "how to authenticate" --mode hybrid -``` - -### 场景 4:MCP 工具使用(简化版) - -```python -# 初始化(自动生成嵌入) -codex_lens(action="init", path=".") - -# 搜索(默认 auto 模式:有嵌入用 hybrid,无嵌入用 exact) -codex_lens(action="search", query="authentication") - -# 强制混合模式 -codex_lens(action="search", query="authentication", mode="hybrid") - -# 强制精确模式 -codex_lens(action="search", query="authenticate_user", mode="exact") - -# 仅返回文件路径 -codex_lens(action="search_files", query="payment processing") -``` - -**高级操作使用 CLI**: -```bash -# 检查状态 -codexlens status - -# 提取符号 -codexlens symbol src/auth/login.js - -# 配置管理 -codexlens config show -codexlens config set index_dir /custom/path - -# 清理索引 -codexlens clean . -``` - -## 技术细节 - -### 嵌入检测逻辑 - -1. 查找项目在 registry 中的记录 -2. 获取索引路径 `index_root/_index.db` -3. 调用 `check_embeddings_status()` 检查: - - 是否存在 `chunks` 表 - - `chunks_count > 0` -4. 根据检测结果选择模式 - -### 混合搜索权重 - -默认 RRF 权重: -- Exact FTS: 0.4 -- Fuzzy FTS: 0.3 -- Vector: 0.3 - -可通过 `--weights` 参数自定义: -```bash -codexlens search "query" --mode hybrid --weights 0.5,0.3,0.2 -``` - -### 模型选项 - -| 模型 | 模型名称 | 维度 | 大小 | 推荐场景 | -|------|---------|------|------|---------| -| fast | BAAI/bge-small-en-v1.5 | 384 | ~80MB | 快速原型 | -| code | jinaai/jina-embeddings-v2-base-code | 768 | ~150MB | **推荐** 代码搜索 | -| multilingual | intfloat/multilingual-e5-large | 1024 | ~1GB | 多语言项目 | -| balanced | mixedbread-ai/mxbai-embed-large-v1 | 1024 | ~600MB | 平衡性能 | - -## 兼容性 - -### 向后兼容 - -- 所有现有命令仍然工作 -- 手动指定 `--mode` 会覆盖自动检测 -- 使用 `--no-embeddings` 可恢复旧行为 - -### 依赖要求 - -**核心功能**(无需额外依赖): -- FTS 索引(exact, fuzzy) -- 符号提取 - -**语义搜索功能**(需要安装): -```bash -pip install codexlens[semantic] -# 或 -pip install numpy fastembed -``` - -## 性能影响 - -### 初始化时间 - -- FTS 索引:~2-5 秒(100 文件) -- 嵌入生成:+30-60 秒(首次下载模型) -- 后续嵌入:+10-20 秒 - -### 搜索性能 - -| 模式 | 延迟 | 召回率 | 推荐场景 | -|------|------|--------|---------| -| exact | 5ms | 中 | 精确代码标识符 | -| fuzzy | 7ms | 中 | 容错搜索 | -| 
hybrid | 15ms | **最高** | **通用搜索(推荐)** | -| vector | 12ms | 高 | 语义查询 | -| pure-vector | 10ms | 中 | 自然语言 | - -## 最小化修改原则 - -所有修改都遵循最小化原则: -1. **保持向后兼容**:不破坏现有功能 -2. **默认智能**:自动检测最佳模式 -3. **用户可控**:可通过参数覆盖自动行为 -4. **渐进增强**:未安装 fastembed 时优雅降级 - -## 总结 - -✅ **init 命令自动生成嵌入**(可用 `--no-embeddings` 跳过) -✅ **search 命令默认使用混合模式**(有嵌入时自动启用) -✅ **MCP 工具简化为核心操作**(init, search, search_files) -✅ **所有搜索模式支持**(auto, exact, fuzzy, hybrid, vector, pure-vector) -✅ **文档已更新**反映新的默认行为 -✅ **保持向后兼容性** -✅ **优雅降级**(无 fastembed 时使用 exact 模式) - -### MCP vs CLI 功能对比 - -| 功能 | MCP 工具 | CLI | -|------|---------|-----| -| 初始化索引 | ✅ `codex_lens(action="init")` | ✅ `codexlens init` | -| 搜索代码 | ✅ `codex_lens(action="search")` | ✅ `codexlens search` | -| 搜索文件 | ✅ `codex_lens(action="search_files")` | ✅ `codexlens search --files-only` | -| 检查状态 | ❌ 使用 CLI | ✅ `codexlens status` | -| 提取符号 | ❌ 使用 CLI | ✅ `codexlens symbol` | -| 配置管理 | ❌ 使用 CLI | ✅ `codexlens config` | -| 清理索引 | ❌ 使用 CLI | ✅ `codexlens clean` | - -**设计理念**:MCP 工具专注于高频核心操作(索引、搜索),高级管理操作通过 CLI 执行。 diff --git a/codex-lens/docs/CONFIGURATION.md b/codex-lens/docs/CONFIGURATION.md deleted file mode 100644 index f155c088..00000000 --- a/codex-lens/docs/CONFIGURATION.md +++ /dev/null @@ -1,298 +0,0 @@ -# CodexLens 配置说明 - -## 目录结构 - -``` -~/.codexlens/ # 全局数据目录 -├── .env # 全局 API 配置 (新增) -├── settings.json # 运行时设置 -├── embedding_lock.json # 模型锁定文件 -├── registry.db # 项目注册表 -├── indexes/ # 集中式索引存储 -└── venv/ # Python 虚拟环境 - -project/ -├── .codexlens/ # 工作区本地目录 -│ ├── .env # 工作区 API 配置 (覆盖全局) -│ ├── index.db # 项目索引数据库 -│ ├── cache/ # 缓存目录 -│ └── .gitignore # 排除敏感文件 -└── .env # 项目根目录配置 -``` - -## 配置优先级 - -配置加载顺序 (后者覆盖前者): - -| 优先级 | 位置 | 说明 | -|--------|------|------| -| 1 (最低) | `~/.codexlens/.env` | 全局默认配置 | -| 2 | `project/.env` | 项目根目录配置 | -| 3 | `project/.codexlens/.env` | 工作区本地配置 | -| 4 (最高) | 环境变量 | Shell 环境变量 | - -## 环境变量 - -### Embedding 配置 - -用于 `litellm` 后端的嵌入向量服务: - -```bash -# API 密钥 -EMBEDDING_API_KEY=your-api-key - -# API 基础 URL 
-EMBEDDING_API_BASE=https://api.example.com/v1 - -# 嵌入模型名称 -EMBEDDING_MODEL=text-embedding-3-small -``` - -**支持的提供商示例**: - -| 提供商 | API Base | 模型示例 | -|--------|----------|----------| -| OpenAI | `https://api.openai.com/v1` | `text-embedding-3-small` | -| ModelScope | `https://api-inference.modelscope.cn/v1` | `Qwen/Qwen3-Embedding-8B` | -| Azure | `https://your-resource.openai.azure.com` | `text-embedding-ada-002` | - -### LiteLLM 配置 - -用于 LLM 功能 (重排序、语义分析等): - -```bash -# API 密钥 -LITELLM_API_KEY=your-api-key - -# API 基础 URL -LITELLM_API_BASE=https://api.example.com/v1 - -# 模型名称 -LITELLM_MODEL=gpt-4o-mini -``` - -### Reranker 配置 - -用于搜索结果重排序 (可选): - -```bash -# API 密钥 -RERANKER_API_KEY=your-api-key - -# API 基础 URL -RERANKER_API_BASE=https://api.siliconflow.cn - -# 提供商: siliconflow, cohere, jina -RERANKER_PROVIDER=siliconflow - -# 重排序模型 -RERANKER_MODEL=BAAI/bge-reranker-v2-m3 -``` - -### 通用配置 - -```bash -# 自定义数据目录 (默认: ~/.codexlens) -CODEXLENS_DATA_DIR=~/.codexlens - -# 启用调试模式 -CODEXLENS_DEBUG=false -``` - -## settings.json - -运行时设置保存在 `~/.codexlens/settings.json`: - -```json -{ - "embedding": { - "backend": "litellm", - "model": "Qwen/Qwen3-Embedding-8B", - "use_gpu": false, - "endpoints": [ - { - "model": "Qwen/Qwen3-Embedding-8B", - "api_key": "${EMBEDDING_API_KEY}", - "api_base": "${EMBEDDING_API_BASE}", - "weight": 1.0 - } - ], - "strategy": "latency_aware", - "cooldown": 60.0 - }, - "llm": { - "enabled": true, - "tool": "gemini", - "timeout_ms": 300000, - "batch_size": 5 - }, - "parsing": { - "use_astgrep": false - }, - "indexing": { - "static_graph_enabled": false, - "static_graph_relationship_types": ["imports", "inherits"] - } -} -``` - -### Embedding 设置 - -| 字段 | 类型 | 说明 | -|------|------|------| -| `backend` | string | `fastembed` (本地) 或 `litellm` (API) | -| `model` | string | 模型名称或配置文件 | -| `use_gpu` | bool | GPU 加速 (仅 fastembed) | -| `endpoints` | array | 多端点配置 (仅 litellm) | -| `strategy` | string | 负载均衡策略 | -| `cooldown` | float | 限流冷却时间 (秒) | - 
-**Embedding Backend 对比**: - -| 特性 | fastembed | litellm | -|------|-----------|---------| -| 运行方式 | 本地 ONNX | API 调用 | -| 依赖 | 本地模型文件 | API 密钥 | -| 速度 | 快 (本地) | 取决于网络 | -| 模型选择 | 预定义配置文件 | 任意 API 模型 | -| GPU 支持 | 是 | N/A | - -**负载均衡策略**: - -| 策略 | 说明 | -|------|------| -| `round_robin` | 轮询分配 | -| `latency_aware` | 延迟感知 (推荐) | -| `weighted_random` | 加权随机 | - -### LLM 设置 - -| 字段 | 类型 | 说明 | -|------|------|------| -| `enabled` | bool | 启用 LLM 功能 | -| `tool` | string | LLM 工具 (`gemini`, `codex`) | -| `timeout_ms` | int | 超时时间 (毫秒) | -| `batch_size` | int | 批处理大小 | - -### Parsing 设置 - -| 字段 | 类型 | 说明 | -|------|------|------| -| `use_astgrep` | bool | 优先使用 ast-grep 解析关系(实验性;当前主要用于 Python relationships) | - -### Indexing 设置(静态图) - -| 字段 | 类型 | 说明 | -|------|------|------| -| `static_graph_enabled` | bool | 索引时将 relationships 写入全局 `global_relationships`,用于搜索阶段静态图扩展 | -| `static_graph_relationship_types` | array | 允许持久化的关系类型:`imports` / `inherits` / `calls` | - -**CLI 覆盖(单次运行,不写入 settings.json)**: - -```bash -# 索引时启用静态图 relationships + 使用 ast-grep(如果可用) -codexlens index init --use-astgrep --static-graph --static-graph-types imports,inherits,calls -``` - -**Search staged 静态图扩展(高级)**: - -```bash -codexlens search --cascade-strategy staged --staged-stage2-mode static_global_graph -``` - -## FastEmbed 模型配置文件 - -使用 `fastembed` 后端时的预定义模型: - -| 配置文件 | 模型 | 维度 | 大小 | -|----------|------|------|------| -| `fast` | BAAI/bge-small-en-v1.5 | 384 | 80MB | -| `base` | BAAI/bge-base-en-v1.5 | 768 | 220MB | -| `code` | jinaai/jina-embeddings-v2-base-code | 768 | 150MB | -| `minilm` | sentence-transformers/all-MiniLM-L6-v2 | 384 | 90MB | -| `multilingual` | intfloat/multilingual-e5-large | 1024 | 1000MB | -| `balanced` | mixedbread-ai/mxbai-embed-large-v1 | 1024 | 600MB | - -## 快速开始 - -### 1. 使用全局配置 - -创建 `~/.codexlens/.env`: - -```bash -# 复制示例配置 -cp codex-lens/.env.example ~/.codexlens/.env - -# 编辑配置 -nano ~/.codexlens/.env -``` - -### 2. 
使用本地嵌入 (fastembed) - -```bash -# 初始化索引 (使用 code 配置文件) -codexlens init --backend fastembed --model code - -# 或使用多语言模型 -codexlens init --backend fastembed --model multilingual -``` - -### 3. 使用 API 嵌入 (litellm) - -```bash -# 设置环境变量 -export EMBEDDING_API_KEY=your-key -export EMBEDDING_API_BASE=https://api.example.com/v1 -export EMBEDDING_MODEL=text-embedding-3-small - -# 初始化索引 -codexlens init --backend litellm --model text-embedding-3-small -``` - -### 4. 验证配置 - -```bash -# 检查配置加载 -codexlens config show - -# 测试嵌入 -codexlens test-embedding "Hello World" -``` - -## 故障排除 - -### 配置未加载 - -检查文件权限和路径: - -```bash -ls -la ~/.codexlens/.env -cat ~/.codexlens/.env -``` - -### API 错误 - -1. 验证 API 密钥有效性 -2. 检查 API Base URL 是否正确 -3. 确认模型名称匹配提供商支持的模型 - -### 模型不兼容 - -如果更换嵌入模型,需要重建索引: - -```bash -# 删除旧索引 -rm -rf project/.codexlens/ - -# 重新初始化 -codexlens init --backend litellm --model new-model -``` - -## 相关文件 - -| 文件 | 说明 | -|------|------| -| `src/codexlens/config.py` | 配置类定义 | -| `src/codexlens/env_config.py` | 环境变量加载 | -| `src/codexlens/cli/model_manager.py` | FastEmbed 模型管理 | -| `src/codexlens/semantic/factory.py` | Embedder 工厂 | diff --git a/codex-lens/docs/DESIGN_EVALUATION_REPORT.md b/codex-lens/docs/DESIGN_EVALUATION_REPORT.md deleted file mode 100644 index ae2e1504..00000000 --- a/codex-lens/docs/DESIGN_EVALUATION_REPORT.md +++ /dev/null @@ -1,1010 +0,0 @@ -# 深度技术评估报告:Codex-Lens 改进方案 - -**评估工具**: Gemini 2.5 Pro -**评估日期**: 2025-12-15 -**评估范围**: 多层次分词器、静态分析语义图谱、Docstring与LLM混合策略 - ---- - -## 执行摘要 - -三个方案目标清晰,层层递进,从优化现有功能(混合策略)到改进核心机制(分词器),再到引入全新能力(语义图谱),共同构成了一个宏伟但可行的代码理解增强蓝图。 - -### 核心评分 - -| 方案 | 完善性评分 | 可行性 | ROI | 技术风险 | 建议优先级 | -|------|-----------|--------|-----|----------|-----------| -| Docstring与LLM混合 | 8.0/10 | ⭐⭐⭐⭐⭐ 高 | ⭐⭐⭐⭐⭐ 极高 | ⭐⭐ 低 | **P0 (立即启动)** | -| 多层次分词器 | 8.0/10 | ⭐⭐⭐⭐ 中高 | ⭐⭐⭐⭐ 高 | ⭐⭐⭐ 中 | **P1 (Q2启动)** | -| 静态分析语义图谱 | 6.0/10 | ⭐⭐ 低 | ⭐⭐⭐⭐⭐ 极高* | ⭐⭐⭐⭐⭐ 极高 | **P2 (需原型验证)** | - -*注:图谱的ROI极高,但前提是技术挑战得以克服 - ---- - -## 1. 
Docstring与LLM混合策略评估 - -### 1.1 完善性评分 - -| 维度 | 评分 | 说明 | -|------|------|------| -| 架构设计 | 9/10 | 流程清晰,分层策略合理 | -| 实现细节 | 8/10 | 代码示例完整,但提取逻辑可优化 | -| 测试覆盖 | 8/10 | 单元测试和集成测试设计充分 | -| 风险控制 | 7/10 | 识别了主要风险,但降级策略可加强 | -| **平均分** | **8.0/10** | 设计文档非常完整 | - -### 1.2 技术可行性:⭐⭐⭐⭐⭐ 高 - -**可以直接实施的部分**: -- ✅ `DocstringQuality` 枚举和评分逻辑(基于长度和结构) -- ✅ `HybridEnhancer` 的三种策略分支 -- ✅ 成本统计和监控模块 -- ✅ Python docstring解析(Google/NumPy风格) - -**需要优化的部分**: -- ⚠️ **Docstring提取** (`_extract_from_code`):当前基于行号搜索较脆弱 - - **改进建议**:使用tree-sitter AST精确定位函数体内的第一个字符串表达式 - ```python - # 改进后的提取逻辑 - body_node = func_node.child_by_field_name('body') - if body_node and len(body_node.children) > 0: - first_stmt = body_node.children[0] - if first_stmt.type == 'expression_statement': - expr = first_stmt.children[0] - if expr.type in ['string', 'string_literal']: - return extract_string_content(expr) - ``` - -**需要原型验证的模块**: -- 🔬 **质量评估器准确性**:在3-5个真实项目上验证评估准确率 - - 目标:与人工标注对比,准确率达到85%+ - - 方法:收集100个docstring样本,人工标注质量等级,调整阈值 - -### 1.3 性能与效果预测 - -| 指标 | 预测值 | 依据 | -|------|--------|------| -| 搜索质量提升 | +15-25% | docstring保留作者意图,准确性接近100% | -| 成本降低 | 40-60% | 高质量docstring占比越高,节省越多 | -| 索引速度提升 | +30-50% | 跳过完整LLM生成步骤 | -| 元数据准确率 | 95%+ | 使用docstring的符号达到近完美准确性 | - -**成本计算示例**(1000个函数): -``` -假设docstring分布:High 30% | Medium 40% | Low 30% - -纯LLM模式:1000 × 100% = 1000 units -混合模式:300×20% + 400×60% + 300×100% = 600 units -节省:40% - -如果High质量达到50%: -混合模式:500×20% + 300×60% + 200×100% = 480 units -节省:52% -``` - -### 1.4 关键设计盲点 - -#### 盲点1:Docstring与代码不同步 -**问题描述**:代码已修改,docstring未更新,导致元数据不准确。 - -**影响程度**:🔴 高(可能误导用户) - -**改进建议**: -```python -class DocstringFreshnessChecker: - def check_parameter_consistency(self, signature, docstring_params): - """检查参数列表是否匹配""" - actual_params = extract_params_from_signature(signature) - documented_params = set(docstring_params.keys()) - - missing = actual_params - documented_params - extra = documented_params - actual_params - - if missing or extra: - return QualityDowngrade( - 
from_level='HIGH', - to_level='MEDIUM', - reason=f'Parameter mismatch: missing={missing}, extra={extra}' - ) - - def check_return_type_consistency(self, signature, docstring_returns): - """检查返回值类型注解是否与docstring匹配""" - if has_return_annotation(signature) and docstring_returns: - annotation = get_return_annotation(signature) - # 简单的字符串匹配检查 - if annotation.lower() not in docstring_returns.lower(): - return QualityWarning('Return type mismatch') -``` - -#### 盲点2:结构化信息丢失 -**问题描述**:`_use_docstring_with_llm_keywords` 只使用了summary,丢失了参数、返回值、示例等信息。 - -**影响程度**:🟡 中(影响搜索结果展示的丰富性) - -**改进建议**:扩展 `SemanticMetadata` 数据结构: -```python -@dataclass -class EnhancedSemanticMetadata(SemanticMetadata): - """扩展的语义元数据""" - parameters: Optional[Dict[str, str]] = None # {param_name: description} - returns: Optional[str] = None - raises: Optional[List[str]] = None - examples: Optional[str] = None - - # 搜索结果展示时可以显示更丰富的信息 -``` - -#### 盲点3:多语言docstring提取差异 -**问题描述**:不同语言的docstring格式和位置不同,单一提取器无法通用。 - -**影响程度**:🟡 中(影响多语言支持) - -**改进建议**:语言特定提取器: -```python -class LanguageSpecificExtractor: - EXTRACTORS = { - 'python': PythonDocstringExtractor, - 'javascript': JSDocExtractor, - 'typescript': TSDocExtractor, - 'java': JavadocExtractor, - } - - def extract(self, language, code, symbol): - extractor_class = self.EXTRACTORS.get(language, GenericExtractor) - return extractor_class().extract(code, symbol) - -class JSDocExtractor: - """JavaScript/TypeScript JSDoc在函数定义之前""" - def extract(self, code, symbol): - lines = code.splitlines() - start_line = symbol.range[0] - 1 - - # 向上查找 /** ... */ - for i in range(start_line - 1, max(0, start_line - 20), -1): - if '*/' in lines[i]: - return self._extract_jsdoc_block(lines, i) -``` - -### 1.5 时间估算校准 - -**原估算**:6-8周 -**校准后**:✅ 6-8周(合理) - -**分阶段时间表**: -- Week 1-2: 核心`DocstringExtractor` + `QualityEvaluator` -- Week 3-4: `HybridEnhancer` + 三种策略 -- Week 5-6: 真实项目测试 + 评估器调优 -- Week 7-8: 多语言支持 + CLI集成 - ---- - -## 2. 
多层次分词器评估 - -### 2.1 完善性评分 - -| 维度 | 评分 | 说明 | -|------|------|------| -| 架构设计 | 9/10 | 分层思想清晰,数据结构设计合理 | -| 实现细节 | 8/10 | AST遍历逻辑详细,但边界情况处理可加强 | -| 测试覆盖 | 7/10 | 单元测试设计充分,缺少大规模集成测试 | -| 风险控制 | 8/10 | 提出了降级策略和性能优化方案 | -| **平均分** | **8.0/10** | 技术方案完整且可行 | - -### 2.2 技术可行性:⭐⭐⭐⭐ 中高 - -**可以直接实施的部分**: -- ✅ `MacroChunker`(符号级分词)- 复用现有`code_extractor` -- ✅ 数据库schema设计(层级关系存储) -- ✅ 基础的`MicroChunker`(for/while/if/try块提取) - -**需要原型验证的部分**: -- 🔬 **层级化检索权重**:`search_hierarchical`中的`level_weights={1:1.0, 2:0.8}`较主观 - - **验证方法**:构建测试集,对比不同权重策略的搜索结果相关性 - - **实验参数**: - ```python - weight_strategies = [ - {'macro': 1.0, 'micro': 0.5}, # 强调宏观 - {'macro': 1.0, 'micro': 0.8}, # 原设计 - {'macro': 1.0, 'micro': 1.0}, # 平等对待 - {'macro': 0.8, 'micro': 1.0}, # 强调细节 - ] - ``` - -- 🔬 **逻辑块粒度控制**:何时需要二次划分?当前阈值`max_lines=50`需验证 - - **数据收集**:统计真实项目中函数长度分布 - - **A/B测试**:对比阈值30/50/100的搜索效果 - -**技术挑战**: -1. **上下文冗余问题**:父chunk和子chunk的摘要如何避免重复? - - **解决方案**:子chunk的LLM prompt应强调**角色定位** - ``` - # Bad Prompt - "Summarize this for loop" - - # Good Prompt - "This for loop is part of function authenticate_user(). - Describe its specific role in the authentication process." - ``` - -2. **结果聚合与展示**:搜索同时匹配父子chunk时如何展示? 
- - **UI设计建议**: - ``` - [Match 1] ▼ function authenticate_user() - Score: 0.92 - ├─ Line 45-52: Password validation loop - Score: 0.88 - └─ Line 67-75: Token generation block - Score: 0.85 - - [Match 2] function login_handler() - Score: 0.81 - ``` - -### 2.3 性能与效果预测 - -| 指标 | 预测值 | 说明 | -|------|--------|------| -| 搜索质量提升 | +30-40% | 大函数中精确定位逻辑块 | -| 索引时间增加 | +50-100% | AST深度遍历 + 更多LLM调用 | -| 存储空间增加 | +40-80% | 取决于micro-chunk数量 | -| 检索速度 | ±5% | 精确目标可能更快 | - -**存储空间计算**: -``` -假设平均每个文件10个函数 -每个函数生成1个macro chunk + 平均3个micro chunks -总chunk数:10 × (1 + 3) = 40 chunks/文件 - -相比现有(10 chunks/文件)增长:4倍 - -但使用选择性向量化(只对50%的micro chunks生成向量): -向量索引增长:10 × (1 + 1.5) = 2.5倍 -``` - -### 2.4 关键设计盲点 - -#### 盲点1:选择性向量化的风险 -**问题描述**:基于行数(<5行)跳过向量化,可能遗漏重要的简短逻辑。 - -**影响程度**:🟡 中(影响搜索覆盖率) - -**改进建议**:智能选择策略 -```python -class IntelligentVectorizationSelector: - def should_vectorize(self, chunk: HierarchicalChunk) -> bool: - # 规则1: Level 1总是向量化 - if chunk.metadata.level == 1: - return True - - # 规则2: 复杂度判断(圈复杂度) - complexity = calculate_cyclomatic_complexity(chunk.content) - if complexity >= 3: # 有多个分支 - return True - - # 规则3: 关键词判断 - critical_keywords = ['critical', 'security', 'auth', 'payment'] - if any(kw in chunk.content.lower() for kw in critical_keywords): - return True - - # 规则4: LLM快速判断重要性 - if chunk.metadata.level == 2 and len(chunk.content) < 5: - importance = quick_llm_importance_check(chunk) - return importance > 0.7 - - return False -``` - -#### 盲点2:LLM增强的上下文设计不足 -**问题描述**:文档中micro chunk的prompt未充分利用父chunk信息。 - -**影响程度**:🟡 中(影响元数据质量) - -**改进建议**:上下文感知的prompt模板 -```python -MICRO_CHUNK_PROMPT = """ -PARENT CONTEXT: -- Function: {parent_symbol_name} -- Purpose: {parent_purpose} -- Summary: {parent_summary} - -THIS CODE BLOCK ({chunk_type} at lines {start_line}-{end_line}): -```{language} -{chunk_content} -``` - -TASK: Describe this block's SPECIFIC ROLE in the parent function. -Focus on: -- What does it do within the larger logic flow? -- What intermediate result does it produce? 
-- How does it contribute to the parent function's goal? - -OUTPUT: 1 sentence describing its role + 3-5 keywords -""" -``` - -#### 盲点3:增量更新的复杂性 -**问题描述**:文件修改后,如何高效地重新索引? - -**影响程度**:🟡 中(影响实用性) - -**改进建议**:智能增量更新 -```python -class IncrementalHierarchicalIndexer: - def update_file(self, file_path: Path): - new_content = file_path.read_text() - new_hash = hashlib.sha256(new_content.encode()).hexdigest() - - # 检查文件级别的变化 - old_hash = self.get_file_hash(file_path) - if new_hash == old_hash: - return # 文件未变化 - - # 提取新的chunks - new_chunks = self.chunker.chunk_file(new_content, file_path) - - # 与旧chunks对比(基于内容hash) - old_chunks = self.get_chunks_by_file(file_path) - - for new_chunk in new_chunks: - new_chunk_hash = hash_chunk_content(new_chunk) - matching_old = find_by_hash(old_chunks, new_chunk_hash) - - if matching_old: - # chunk内容未变,保留旧的embedding和metadata - new_chunk.embedding = matching_old.embedding - new_chunk.metadata = matching_old.metadata - else: - # 新chunk或内容已变,需要重新处理 - self.process_new_chunk(new_chunk) - - # 删除不再存在的旧chunks - self.delete_obsolete_chunks(old_chunks, new_chunks) -``` - -### 2.5 时间估算校准 - -**原估算**:7-10周 -**校准后**:✅ 7-10周(合理) - -**关键里程碑**: -- Week 3: 完成数据库迁移和基础chunker -- Week 6: 完成层级化检索逻辑 -- Week 8: 完成LLM增强集成 -- Week 10: 性能优化和发布 - ---- - -## 3. 
静态分析语义图谱评估 - -### 3.1 完善性评分 - -| 维度 | 评分 | 说明 | -|------|------|------| -| 架构设计 | 8/10 | 图模型设计合理,但实现路径模糊 | -| 实现细节 | 6/10 | 核心难点(名称解析)实现过于简化 | -| 测试覆盖 | 5/10 | 测试策略不足,缺少复杂场景覆盖 | -| 风险控制 | 5/10 | 对动态语言的限制和性能瓶颈认识不足 | -| **平均分** | **6.0/10** | 愿景宏大但技术风险极高 | - -### 3.2 技术可行性:⭐⭐ 低(短期完全实现) - -**阿喀琉斯之踵:名称解析 (`NameResolver`)** - -文档中的实现**严重低估了难度**: -```python -# 文档中的简化实现 -def resolve_call_target(self, call_edge, caller_context): - # 策略1: 本地调用 - # 策略2: 方法调用 - # 策略3: 导入的函数(TODO) -``` - -**真实世界的复杂性**: -```python -# Case 1: 复杂导入 -from package.submodule import func as f -from package import * # 星号导入 -import package.module # 模块导入 - -result = f(x) # 需要解析f -> package.submodule.func - -# Case 2: 动态调用 -handler = getattr(module, 'process_' + request_type) -handler() # 静态分析无法确定目标 - -# Case 3: 装饰器包装 -@cache -@retry(max_attempts=3) -def expensive_operation(): - pass - -# 调用时需要解析到原始函数,而非装饰器 - -# Case 4: 类型变量 -processor: Callable = get_processor(config) -processor() # 需要类型推断 - -# Case 5: 上下文管理器 -with get_connection() as conn: - conn.execute(...) # 需要理解__enter__返回值类型 -``` - -**技术债务评估**: -- 完整实现需要一个接近 `pyright` 或 `mypy` 级别的类型推断引擎 -- 这些工具历经多年开发,代码量数十万行 -- 不现实在12-15周内从零实现 - -**建议的务实路径**: -1. **集成现有工具**:调研 `jedi` 或 `pyright` 的API是否可用 -2. **限定范围**:V1只处理简单的本地调用和直接导入 -3. 
**明确边界**:对无法解析的调用,标记为"动态"并降低置信度 - -### 3.3 性能与效果预测 - -**前提假设**:名称解析能达到70%+的准确率 - -| 指标 | 预测值 | 说明 | -|------|--------|------| -| 搜索维度 | 全新维度 | 支持"影响分析"、"调用链追踪" | -| 开发时间 | **24-30周** | 原估算12-15周过于乐观 | -| 索引时间增加 | +300% | 全量静态分析 + 图构建 | -| 存储空间 | +200-500% | 图数据庞大 | -| 查询速度 | <100ms | 简单调用关系查询 | -| 影响分析 | 数秒 | 全代码库范围的图遍历 | - -**名称解析准确率影响**: -``` -如果准确率只有50%: -- 调用图充满噪音和缺失边 -- 影响分析结果不可信 -- 整个图谱价值大打折扣 - -如果准确率达到85%+: -- 可以支撑实用的影响分析 -- 结合LLM语义,能回答复杂问题 -- 成为代码理解的核心基础设施 -``` - -### 3.4 关键设计盲点 - -#### 盲点1:动态语言的静态分析极限 -**问题描述**:Python高度动态,大量调用关系在运行时才确定。 - -**影响程度**:🔴 极高(根本性限制) - -**改进建议**:混合静态+运行时分析 -```python -class HybridCallGraphBuilder: - def build_graph(self, codebase): - # 阶段1: 静态分析(确定性的调用) - static_graph = self.static_analyzer.build_call_graph(codebase) - - # 阶段2: 运行时数据补充(可选) - if self.config.enable_runtime_profiling: - runtime_data = self.collect_runtime_traces() - static_graph.merge(runtime_data, confidence=0.7) - - # 阶段3: LLM推断(低置信度) - for dynamic_call in static_graph.get_unresolved_calls(): - possible_targets = self.llm_infer_call_target(dynamic_call) - static_graph.add_edges(dynamic_call, possible_targets, confidence=0.5) - - return static_graph -``` - -**运行时数据来源**: -- 集成现有APM工具(如Sentry, DataDog) -- 代码覆盖率报告(如coverage.py) -- 自定义的轻量级tracer - -#### 盲点2:跨语言支持的工程量 -**问题描述**:文档轻描淡写"支持JS/Java",实际上需要为每种语言重写整个分析引擎。 - -**影响程度**:🔴 极高(时间成本巨大) - -**改进建议**:分阶段语言支持 -``` -V1 (6个月): 只支持Python - - 专注于将Python分析做到80%+准确率 - - 建立完整的图存储、查询、LLM增强基础设施 - -V2 (再6个月): 添加JavaScript/TypeScript - - 复用图基础设施 - - 开发JS特定的AST分析器 - -V3 (再6个月): 添加Java - - Java的静态类型使分析更容易 - - 但生态复杂(Maven, Gradle, Spring框架) -``` - -#### 盲点3:增量更新的复杂性 -**问题描述**:当一个核心函数签名改变时,图中所有调用它的边都需要更新。 - -**影响程度**:🟡 中(影响可用性) - -**改进建议**:变更传播队列 -```python -class GraphIncrementalUpdater: - def update_function(self, function_id: str, new_code: str): - old_signature = self.graph.get_node(function_id).signature - new_signature = extract_signature(new_code) - - if old_signature != new_signature: - # 签名变化,需要级联更新 - affected_edges = 
self.graph.get_edges_targeting(function_id) - - for edge in affected_edges: - # 标记为待更新 - self.update_queue.add(UpdateTask( - edge_id=edge.edge_id, - reason='target_signature_changed', - priority='high' - )) - - # 重新分析函数内部的调用 - new_callees = self.analyzer.extract_calls(new_code) - self.graph.update_edges_from(function_id, new_callees) - - # 后台任务:LLM重新生成语义 - self.llm_queue.add(LLMTask(node_id=function_id)) -``` - -### 3.5 时间估算校准 - -**原估算**:12-15周 -**校准后**:🔴 **24-30周到达可用的V1** - -**现实的里程碑**: -``` -Phase 0: 前置验证 (4-6周) - - NameResolver原型开发和测试 - - 决策点:如果准确率<70%,暂停项目或调整范围 - -Phase 1: 基础图构建 (8周) - - 简单的调用图提取(本地调用+直接导入) - - SQLite图存储和基础查询 - -Phase 2: LLM语义增强 (4周) - - 为节点和边生成语义描述 - - 批量处理优化 - -Phase 3: 高级查询 (6周) - - 影响分析 - - 调用链追踪 - - 数据流基础支持 - -Phase 4: 优化与稳定 (6周) - - 性能优化 - - 增量更新 - - 大规模测试 -``` - -### 3.6 必须的前置验证 - -**NameResolver原型验证 (P0优先级)**: -```python -# 原型验证目标 -class NameResolverPrototype: - """ - 目标:在一个真实的中等复杂度Python项目(~10k行代码,20-30个文件)上测试 - - 成功标准: - 1. 本地函数调用解析准确率 > 95% - 2. 跨文件导入解析准确率 > 80% - 3. 类方法调用解析准确率 > 75% - 4. 整体准确率 > 70% - - 如果失败: - - 调研集成jedi/pyright的可行性 - - 或调整图谱范围(只做本地调用图) - - 或推迟项目,投入更多资源 - """ - - def validate(self, test_project_path: Path): - # 手动标注ground truth - ground_truth = self.load_manual_annotations(test_project_path) - - # 运行原型 - resolved_calls = self.resolve_all_calls(test_project_path) - - # 计算准确率 - metrics = self.calculate_metrics(resolved_calls, ground_truth) - - return ValidationReport( - accuracy=metrics.accuracy, - precision=metrics.precision, - recall=metrics.recall, - false_positives=metrics.fp_examples, - false_negatives=metrics.fn_examples, - ) -``` - ---- - -## 4. 方案间协同分析 - -### 4.1 依赖关系图 - -``` -Docstring混合策略 ──(提供高质量元数据)──> 语义图谱 - │ │ - │ │ - (共享docstring (共享AST分析) - 解析能力) │ - │ │ - v v - 多层次分词器 ────(提供细粒度节点)────> 语义图谱 -``` - -**关键依赖**: -1. **图谱依赖混合策略**:高质量的节点摘要和purpose标签来自混合策略 -2. **图谱和分词器共享AST能力**:可以开发一个统一的`ASTAnalyzer`模块 -3. 
**分词器增强图谱**:micro chunks可以作为图谱的更细粒度节点 - -### 4.2 协同效应(1+1+1 > 3) - -**场景1:精确代码导航** -``` -用户查询: "Find the password hashing logic in authentication" - -Step 1: 向量搜索(分词器) - -> 定位到 authenticate_user() 函数的 micro chunk (lines 45-52) - -Step 2: 图谱上下文 - -> 显示该函数的所有调用者:login_api(), register_api() - -> 追踪数据流:password变量的传递路径 - -Step 3: 语义元数据(混合策略) - -> 展示函数的docstring:"使用bcrypt进行密码哈希,salt轮数为12" - -> 关联的security标签和注意事项 -``` - -**场景2:影响分析** -``` -用户问题: "If I change User.email validation, what breaks?" - -Step 1: 图谱查询 - -> 找到所有调用 User.email setter的函数 - -> 构建影响树:validate_email() -> update_profile() -> profile_api() - -Step 2: 分词器展示 - -> 对每个受影响的函数,展示具体的调用位置(micro chunk) - -> 用户可以快速review每个调用点的上下文 - -Step 3: 混合策略提供摘要 - -> 每个函数的docstring说明其业务意图 - -> LLM生成的"此函数在email验证中的角色"描述 -``` - -### 4.3 组合实施的量化效果预测 - -**假设场景**:一个10万行的Python代码库 - -| 指标 | 当前 | +混合策略 | +分词器 | +图谱(全部) | -|------|------|----------|---------|------------| -| 搜索准确率 | 70% | 80% (+10%) | 92% (+12%) | 95% (+3%) | -| 索引时间 | 10min | 7min (-30%) | 12min (+20%) | 50min (+300%) | -| 存储空间 | 1GB | 0.8GB (-20%) | 2GB (+100%) | 6GB (+200%) | -| 查询延迟 | 50ms | 50ms | 60ms (+20%) | 100ms (+100%) | -| 能力维度 | 搜索 | 搜索 | 搜索 | 搜索+理解+分析 | - -**关键洞察**: -- 混合策略是"降本增效",提升质量同时降低成本 -- 分词器是"增效",显著提升搜索精度,但有成本 -- 图谱是"开新维度",不只是优化,而是全新能力 - ---- - -## 5. 优先级重排与实施路线图 - -### 5.1 重排后的优先级 - -**P0 - 立即启动(Q1)**:Docstring与LLM混合策略 -- ✅ ROI最高(成本-40%,质量+15%) -- ✅ 风险最低 -- ✅ 6-8周可见效 -- ✅ 为后续方案铺路(提供高质量元数据) - -**P1 - Q2启动**:多层次分词器 -- ✅ 投入产出比高 -- ✅ 技术可行性已验证 -- ✅ 7-10周实现核心功能 -- ⚠️ 依赖P0完成后的稳定基础 - -**P2 - 需原型验证后决定**:静态分析语义图谱 -- 🔬 **前置条件**:NameResolver原型验证通过(4-6周) -- ⚠️ 如果验证失败,调整范围或推迟 -- ✅ 如果验证成功,Q3-Q4启动正式开发(24-30周) - -### 5.2 详细实施路线图 - -``` -Q1 2024 (Week 1-13) -├─ Week 1-8: 实施Docstring混合策略 -│ ├─ Week 1-2: DocstringExtractor + QualityEvaluator -│ ├─ Week 3-4: HybridEnhancer核心逻辑 -│ ├─ Week 5-6: 真实项目测试 + 调优 -│ └─ Week 7-8: 多语言支持 + 发布 -│ -├─ Week 4-10: (并行) NameResolver原型验证 -│ ├─ Week 4-6: 原型开发 -│ ├─ Week 7-8: 在3个真实项目上测试 -│ ├─ Week 9-10: 评估报告 + 决策 -│ └─ 决策点:图谱项目是否继续? 
-│ -└─ Week 9-13: 分词器Phase 0 (准备工作) - ├─ 数据库设计和迁移脚本 - ├─ 基础AST分析模块 - └─ 测试环境搭建 - -Q2 2024 (Week 14-26) -├─ Week 14-23: 实施多层次分词器 -│ ├─ Week 14-16: MacroChunker + MicroChunker -│ ├─ Week 17-19: HierarchicalVectorStore -│ ├─ Week 20-21: LLM分层增强集成 -│ └─ Week 22-23: 性能优化 + 发布 -│ -└─ Week 24-26: 评估和规划 - ├─ 收集用户反馈 - ├─ 调整图谱计划(如果原型通过) - └─ 制定Q3-Q4详细计划 - -Q3-Q4 2024 (Week 27-52) - 条件性启动图谱 -├─ 如果NameResolver原型通过: -│ ├─ Week 27-34: 基础调用图构建 -│ ├─ Week 35-38: LLM语义增强 -│ ├─ Week 39-44: 高级查询功能 -│ └─ Week 45-52: 优化与稳定 -│ -└─ 如果原型失败: - ├─ 调研集成现有工具(jedi/pyright) - ├─ 或调整范围(只做本地调用图) - └─ 或推迟到2025,投入更多资源 -``` - ---- - -## 6. 具体行动建议 - -### 6.1 立即可执行(本周) - -**行动1**:启动Docstring混合策略开发 -```bash -# 创建开发分支 -git checkout -b feature/docstring-hybrid-strategy - -# 目录结构 -src/codexlens/semantic/ - ├── docstring_extractor.py # NEW - ├── quality_evaluator.py # NEW - ├── hybrid_enhancer.py # NEW (替代llm_enhancer.py) - └── llm_enhancer.py # 保留作为后端 - -# 第一周任务 -- [ ] 实现PythonDocstringExtractor (基于tree-sitter) -- [ ] 实现DocstringQuality评估器 -- [ ] 编写单元测试(覆盖率>80%) -``` - -**行动2**:建立评估基准 -```python -# scripts/evaluate_docstring_quality.py -""" -在3个真实项目上评估docstring质量分布 - -目标项目: -1. 内部项目A (高质量docstring, Google style) -2. 开源项目B (中等质量docstring, NumPy style) -3. 遗留代码C (低质量或无docstring) - -输出: -- 质量分布统计(HIGH/MEDIUM/LOW/MISSING百分比) -- 评估器准确率(vs 人工标注) -- 潜在节省成本估算 -""" -``` - -### 6.2 需要调研(2周内) - -**调研1**:NameResolver技术选型 -``` -目标:评估集成现有工具的可行性 - -方案A:集成jedi - - API文档:https://jedi.readthedocs.io/ - - 评估点:能否获取函数调用的目标定义? - - 实验:写一个100行的测试脚本,调用jedi API - -方案B:集成pyright (通过CLI) - - pyright --verifytypes可以输出类型信息 - - 评估点:能否解析其输出构建调用图? - - 实验:在测试项目上运行pyright,分析输出 - -方案C:自研(退路) - - 只处理简单场景(本地调用+直接导入) - - 明确标注"不支持复杂导入" -``` - -**调研2**:图数据库选型 -``` -目标:对比SQLite vs Neo4j vs NetworkX - -测试场景: -- 1000个节点,5000条边的调用图 -- 查询1: 找到函数A的所有调用者(广度优先,深度3) -- 查询2: 找到函数A和函数B之间的最短路径 -- 查询3: 找到所有孤立的节点(未被调用的函数) - -评估指标: -- 查询性能(<100ms?) 
-- 存储空间 -- 维护复杂度 -- 是否支持事务 -``` - -### 6.3 必须做的原型验证(4-6周) - -**原型1**:NameResolver验证原型 -```python -# prototypes/name_resolver_validation/ - -测试项目:选择一个中等复杂度的开源项目 - - requests库 (约10k行,30+文件) 或 - - flask库 (约15k行,50+文件) - -验证步骤: -1. 手动标注100个函数调用关系(ground truth) -2. 运行原型,提取调用图 -3. 对比结果,计算准确率/召回率 - -成功标准: -- 准确率 > 70% -- 召回率 > 60% -- 假阳性率 < 20% - -失败后续: -- 如果< 50%准确率:暂停图谱项目,调研集成方案 -- 如果50-70%:调整范围,只做高置信度的简单调用 -- 如果> 70%:继续,但投入更多资源优化 -``` - -**原型2**:层级化检索权重实验 -```python -# prototypes/hierarchical_search_weights/ - -实验设计: -1. 手动构建一个包含10个函数的测试代码库 -2. 为每个函数创建macro chunk + micro chunks -3. 准备20个搜索查询,人工标注期望结果 -4. 测试不同的权重策略: - - Strategy 1: {macro: 1.0, micro: 0.5} - - Strategy 2: {macro: 1.0, micro: 0.8} - - Strategy 3: {macro: 1.0, micro: 1.0} - - Strategy 4: {macro: 0.8, micro: 1.0} - -评估指标: -- NDCG@10 (Normalized Discounted Cumulative Gain) -- MRR (Mean Reciprocal Rank) -- User preference survey (if possible) - -输出: -- 最佳权重策略 -- 权重参数的敏感性分析 -``` - ---- - -## 7. 风险评估与缓解 - -### 7.1 高风险项 - -| 风险 | 方案 | 影响 | 概率 | 缓解措施 | -|------|------|------|------|----------| -| NameResolver准确率<50% | 图谱 | 🔴 极高 | 40% | 前置原型验证;准备集成jedi的备选方案 | -| 分词器micro chunks过多 | 分词器 | 🟡 中 | 30% | 自适应阈值;选择性向量化 | -| LLM成本超预算 | 全部 | 🟡 中 | 25% | 混合策略优先;批量处理优化 | -| 图谱增量更新复杂度 | 图谱 | 🟡 中 | 50% | V1不支持增量,全量重建;V2再优化 | - -### 7.2 缓解策略矩阵 - -**对于NameResolver风险**: -``` -Plan A (理想): 自研达到70%+准确率 - - 投入: 1名高级工程师 × 6周 - - 成功率: 40% - -Plan B (务实): 集成jedi或pyright - - 投入: 2周调研 + 4周集成 - - 成功率: 70% - - 限制: 依赖外部工具,可能有版本兼容问题 - -Plan C (保底): 限定范围(只做本地调用图) - - 投入: 4周 - - 成功率: 95% - - 限制: 功能大幅缩水,但仍有价值 -``` - -**对于成本控制风险**: -``` -成本监控dashboard: - - 实时显示LLM调用次数和费用 - - 按策略分类(full-gen / refine / keywords-only) - - 告警阈值:日费用>$50 或 月费用>$1000 - -成本优化开关: - - 在配置中设置每日预算上限 - - 超过后自动降级(跳过micro chunks的LLM增强) - - 批量处理大小动态调整 -``` - ---- - -## 8. 总结与最终建议 - -### 8.1 核心结论 - -1. **Docstring混合策略**:✅ **立即启动** - - 完善性最高(8.0/10) - - 技术风险最低 - - ROI最高(成本-40%,质量+15%) - - 6-8周可见效 - -2. 
**多层次分词器**:✅ **Q2启动** - - 完善性高(8.0/10) - - 技术可行性已验证 - - 搜索质量提升30%+ - - 需在P0完成后启动 - -3. **静态分析语义图谱**:⚠️ **需原型验证** - - 完善性中等(6.0/10) - - 技术风险极高(名称解析难度) - - 潜力巨大(全新能力维度) - - **必须先验证NameResolver可行性** - -### 8.2 最终建议的实施顺序 - -``` -Stage 1 (立即): Docstring混合策略 (6-8周) - ├─ 快速降低成本 - ├─ 提升元数据质量 - └─ 为后续打基础 - -Stage 2 (并行): NameResolver原型 (4-6周) - ├─ 决定图谱项目的命运 - ├─ 如果失败,调整或推迟 - └─ 如果成功,Q3正式启动 - -Stage 3 (Q2): 多层次分词器 (7-10周) - ├─ 显著提升搜索精度 - ├─ 为图谱提供细粒度节点 - └─ 用户体验质的飞跃 - -Stage 4 (Q3-Q4, 条件性): 静态分析图谱 (24-30周) - ├─ 如果Stage 2成功,则启动 - ├─ 从简单做起(本地调用图) - └─ 逐步增强(跨文件、LLM语义) -``` - -### 8.3 成功的关键 - -1. **风险前置**:不要盲目启动图谱,必须先验证核心技术假设 -2. **迭代交付**:每个方案都要尽早发布可用版本,收集反馈 -3. **成本控制**:实时监控LLM费用,设置预算上限和降级机制 -4. **数据驱动**:用真实项目数据验证假设,不要依赖理论推导 -5. **务实落地**:完美是优秀的敌人,先做到70分可用,再优化到90分 - -### 8.4 量化预期(全部实施后) - -**假设**:所有三个方案都成功实施 - -| 指标 | 当前基线 | 预期目标 | 提升幅度 | -|------|---------|---------|---------| -| 搜索准确率 | 70% | **95%** | +25% | -| 搜索覆盖率 | 80% | **98%** | +18% | -| 元数据质量 | 75% | **92%** | +17% | -| LLM成本 | $1000/月 | **$600/月** | -40% | -| 索引时间 | 10min | **15min** | +50% (可接受) | -| 新能力 | 搜索 | **搜索+理解+分析** | 质的飞跃 | - ---- - -**报告生成耗时**: 81.2秒 -**评估工具**: Gemini 2.5 Pro -**建议复审周期**: 每个阶段结束后进行复盘和调整 diff --git a/codex-lens/docs/HYBRID_SEARCH_ARCHITECTURE.md b/codex-lens/docs/HYBRID_SEARCH_ARCHITECTURE.md deleted file mode 100644 index 8073b6d7..00000000 --- a/codex-lens/docs/HYBRID_SEARCH_ARCHITECTURE.md +++ /dev/null @@ -1,540 +0,0 @@ -# Hybrid Search Architecture for CodexLens - -> Embedding + Real-time LSP + Clustering + Reranking Pipeline - -## Overview - -This document describes the architecture for a hybrid intelligent code search system that combines: -1. **Low-dimensional embedding model** for semantic search -2. **Real-time LSP integration** for code structure analysis -3. **Graph-based clustering** for result organization -4. **Multi-factor reranking** for intelligent sorting - -**Key Constraint**: Must use real-time LSP servers, NOT pre-indexed data. 
- -## Architecture Diagram - -``` -┌─────────────────────────────────────────────────────────────────────────────┐ -│ HybridSearchEngine │ -│ ┌─────────────────────────────────────────────────────────────────────┐ │ -│ │ 5-Stage Search Pipeline │ │ -│ │ │ │ -│ │ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌────┐│ │ -│ │ │ Stage 1 │──▶│ Stage 2 │──▶│ Stage 3 │──▶│ Stage 4 │──▶│ S5 ││ │ -│ │ │ Vector │ │ LSP │ │ Graph │ │Clustering│ │Rank││ │ -│ │ │ Search │ │Expansion │ │ Building │ │ +Filter │ │ ││ │ -│ │ └──────────┘ └──────────┘ └──────────┘ └──────────┘ └────┘│ │ -│ └─────────────────────────────────────────────────────────────────────┘ │ -│ │ -│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────────────────┐ │ -│ │VectorSearchSvc │ │ LspBridge │ │ GraphBuilder │ │ -│ │ │ │ │ │ │ │ -│ │ • Embedding │ │ • get_refs() │ │ • build_from_seeds() │ │ -│ │ • FAISS/HNSW │ │ • get_def() │ │ • add_relationships() │ │ -│ │ • search() │ │ • get_calls() │ │ • CodeAssociationGraph │ │ -│ └────────┬────────┘ └────────┬────────┘ └─────────────────────────────┘ │ -│ │ │ │ -└───────────┼────────────────────┼────────────────────────────────────────────┘ - │ │ - ▼ ▼ - ┌───────────────┐ ┌───────────────────────────────────────┐ - │ Embedding │ │ LanguageServerMultiplexer │ - │ Model (local) │ │ (from REAL_LSP_SERVER_PLAN.md) │ - │ │ │ │ - │ sentence- │ │ ┌─────┐ ┌─────┐ ┌─────┐ ┌──────────┐│ - │ transformers │ │ │pylsp│ │gopls│ │tssvr│ │rust-anlzr││ - │ │ │ └─────┘ └─────┘ └─────┘ └──────────┘│ - └───────────────┘ └───────────────────────────────────────┘ -``` - -## Core Components - -### 1. 
HybridSearchEngine (`hybrid_search/engine.py`) - -**Role**: Main orchestrator coordinating all services - -```python -class HybridSearchEngine: - def __init__(self): - self.vector_service: VectorSearchService - self.lsp_bridge: LspBridge - self.graph_builder: GraphBuilder - self.clustering_service: ClusteringService - self.ranking_service: RankingService - - async def search(self, query: str, top_k: int = 10) -> List[SearchResultCluster]: - # Stage 1: Vector search for seeds - seeds = await self.vector_service.search(query, top_k=top_k * 2) - - # Stage 2-3: LSP expansion + Graph building - graph = await self.graph_builder.build_from_seeds(seeds, self.lsp_bridge) - - # Stage 4: Clustering + Filtering - clusters = self.clustering_service.cluster(graph) - clusters = self.clustering_service.filter_noise(clusters) - - # Stage 5: Reranking - ranked = self.ranking_service.rerank(clusters, seeds, query) - - return ranked[:top_k] -``` - -### 2. Data Structures (`hybrid_search/data_structures.py`) - -```python -@dataclass -class CodeSymbolNode: - """Graph node representing a code symbol""" - id: str # Unique: file_path:name:line - name: str # Symbol name - kind: str # function, class, method, variable - file_path: str # Absolute file path - range: Range # Start/end line and character - embedding: Optional[List[float]] = None - raw_code: str = "" - docstring: str = "" - -@dataclass -class CodeAssociationGraph: - """Graph of code relationships""" - nodes: Dict[str, CodeSymbolNode] - edges: List[Tuple[str, str, str]] # (from_id, to_id, relationship_type) - # relationship_type: 'calls', 'references', 'inherits', 'imports' - - def to_networkx(self) -> nx.DiGraph: - """Convert to NetworkX for algorithms""" - ... - -@dataclass -class SearchResultCluster: - """Clustered search result""" - cluster_id: str - score: float - title: str # AI-generated summary (optional) - symbols: List[CodeSymbolNode] - metadata: Dict[str, Any] -``` - -### 3. 
VectorSearchService (`services/vector_search.py`) - -**Role**: Semantic search using embeddings - -```python -class VectorSearchService: - def __init__(self, model_name: str = "all-MiniLM-L6-v2"): - self.model = SentenceTransformer(model_name) # 384-dim, fast - self.index: faiss.IndexFlatIP # or hnswlib for larger scale - self.id_to_symbol: Dict[str, CodeSymbolNode] - - async def index_codebase(self, symbols: List[CodeSymbolNode]): - """Build/update vector index from symbols""" - texts = [f"{s.name} {s.docstring} {s.raw_code[:500]}" for s in symbols] - embeddings = self.model.encode(texts, normalize_embeddings=True) - self.index.add(embeddings) - - async def search(self, query: str, top_k: int) -> List[CodeSymbolNode]: - """Find semantically similar symbols""" - query_vec = self.model.encode([query], normalize_embeddings=True) - scores, indices = self.index.search(query_vec, top_k) - return [self.id_to_symbol[i] for i in indices[0]] -``` - -**Embedding Model Selection**: -| Model | Dimensions | Speed | Quality | -|-------|-----------|-------|---------| -| all-MiniLM-L6-v2 | 384 | Fast | Good | -| all-mpnet-base-v2 | 768 | Medium | Better | -| CodeBERT | 768 | Medium | Code-optimized | - -### 4. 
LspBridge (`services/lsp_bridge.py`) - -**Role**: Interface to real-time language servers via LanguageServerMultiplexer - -```python -class LspBridge: - def __init__(self, multiplexer_url: str = "http://localhost:3458"): - self.multiplexer_url = multiplexer_url - self.cache: Dict[str, CacheEntry] = {} # file_path -> (mtime, data) - self.session = aiohttp.ClientSession() - - async def get_references(self, symbol: CodeSymbolNode) -> List[Location]: - """Get all references to a symbol (real-time LSP)""" - cache_key = f"refs:{symbol.id}" - if self._is_cached(cache_key, symbol.file_path): - return self.cache[cache_key].data - - response = await self._lsp_request("textDocument/references", { - "textDocument": {"uri": f"file://{symbol.file_path}"}, - "position": {"line": symbol.range.start.line, - "character": symbol.range.start.character}, - "context": {"includeDeclaration": True} - }) - - locations = self._parse_locations(response) - self._cache(cache_key, symbol.file_path, locations) - return locations - - async def get_call_hierarchy(self, symbol: CodeSymbolNode) -> List[CallHierarchyItem]: - """Get incoming/outgoing calls (if supported by language server)""" - try: - # Prepare call hierarchy - items = await self._lsp_request("textDocument/prepareCallHierarchy", {...}) - if not items: - # Fallback to references if callHierarchy not supported - return await self._fallback_to_references(symbol) - - # Get incoming calls - incoming = await self._lsp_request("callHierarchy/incomingCalls", - {"item": items[0]}) - return incoming - except LspCapabilityNotSupported: - return await self._fallback_to_references(symbol) - - async def get_definition(self, symbol: CodeSymbolNode) -> Optional[Location]: - """Get symbol definition location""" - ... - - async def get_hover(self, symbol: CodeSymbolNode) -> Optional[str]: - """Get hover documentation""" - ... 
-``` - -**Caching Strategy**: -- Cache key: `{operation}:{symbol_id}` -- Invalidation: Check file modification time -- TTL: 5 minutes for frequently accessed files - -**Concurrency Control**: -- Max concurrent LSP requests: 10 -- Request timeout: 2 seconds -- Batch requests where possible - -### 5. GraphBuilder (`graph/builder.py`) - -**Role**: Build code association graph from seeds using LSP - -```python -class GraphBuilder: - def __init__(self, max_depth: int = 2, max_nodes: int = 100): - self.max_depth = max_depth - self.max_nodes = max_nodes - - async def build_from_seeds( - self, - seeds: List[CodeSymbolNode], - lsp_bridge: LspBridge - ) -> CodeAssociationGraph: - """Build association graph by expanding from seed nodes""" - graph = CodeAssociationGraph() - visited: Set[str] = set() - queue: List[Tuple[CodeSymbolNode, int]] = [(s, 0) for s in seeds] - - # Parallel expansion with semaphore - sem = asyncio.Semaphore(10) - - async def expand_node(node: CodeSymbolNode, depth: int): - if node.id in visited or depth > self.max_depth: - return - if len(graph.nodes) >= self.max_nodes: - return - - visited.add(node.id) - graph.add_node(node) - - async with sem: - # Get relationships in parallel - refs, calls = await asyncio.gather( - lsp_bridge.get_references(node), - lsp_bridge.get_call_hierarchy(node), - return_exceptions=True - ) - - # Add edges - for ref in refs: - ref_node = await self._location_to_node(ref, lsp_bridge) - graph.add_edge(node.id, ref_node.id, "references") - queue.append((ref_node, depth + 1)) - - for call in calls: - call_node = await self._call_to_node(call, lsp_bridge) - graph.add_edge(call_node.id, node.id, "calls") - queue.append((call_node, depth + 1)) - - # BFS expansion - while queue and len(graph.nodes) < self.max_nodes: - batch = queue[:10] - queue = queue[10:] - await asyncio.gather(*[expand_node(n, d) for n, d in batch]) - - return graph -``` - -### 6. 
ClusteringService (`clustering/algorithms.py`) - -**Role**: Group related code symbols and filter noise - -```python -class ClusteringService: - def __init__(self, resolution: float = 1.0): - self.resolution = resolution # Higher = smaller clusters - - def cluster(self, graph: CodeAssociationGraph) -> List[SearchResultCluster]: - """Apply Louvain community detection""" - nx_graph = graph.to_networkx() - - # Louvain algorithm - communities = community_louvain.best_partition( - nx_graph, - resolution=self.resolution - ) - - # Group nodes by community - clusters: Dict[int, List[CodeSymbolNode]] = defaultdict(list) - for node_id, community_id in communities.items(): - clusters[community_id].append(graph.nodes[node_id]) - - return [ - SearchResultCluster( - cluster_id=f"cluster_{cid}", - symbols=nodes, - score=0.0, # Will be set by RankingService - title="", - metadata={"size": len(nodes)} - ) - for cid, nodes in clusters.items() - ] - - def filter_noise(self, clusters: List[SearchResultCluster]) -> List[SearchResultCluster]: - """Remove noisy clusters and symbols""" - filtered = [] - for cluster in clusters: - # Filter high-degree generic nodes - cluster.symbols = [ - s for s in cluster.symbols - if not self._is_generic_symbol(s) - ] - - # Keep clusters with minimum size - if len(cluster.symbols) >= 2: - filtered.append(cluster) - - return filtered - - def _is_generic_symbol(self, symbol: CodeSymbolNode) -> bool: - """Check if symbol is too generic (log, print, etc.)""" - generic_names = {'log', 'print', 'debug', 'error', 'warn', - 'get', 'set', 'init', '__init__', 'toString'} - return symbol.name.lower() in generic_names -``` - -### 7. 
RankingService (`ranking/service.py`) - -**Role**: Multi-factor intelligent reranking - -```python -@dataclass -class RankingWeights: - text_relevance: float = 0.4 # w1 - graph_centrality: float = 0.35 # w2 - structural_proximity: float = 0.25 # w3 - -class RankingService: - def __init__(self, weights: RankingWeights = None): - self.weights = weights or RankingWeights() - - def rerank( - self, - clusters: List[SearchResultCluster], - seeds: List[CodeSymbolNode], - query: str - ) -> List[SearchResultCluster]: - """Rerank clusters using multi-factor scoring""" - seed_ids = {s.id for s in seeds} - - for cluster in clusters: - # Build cluster subgraph for centrality - subgraph = self._build_subgraph(cluster) - pagerank = nx.pagerank(subgraph) - - for symbol in cluster.symbols: - # Factor 1: Text relevance (from vector search) - text_score = self._compute_text_relevance(symbol, query) - - # Factor 2: Graph centrality (PageRank in cluster) - centrality_score = pagerank.get(symbol.id, 0.0) - - # Factor 3: Structural proximity to seeds - proximity_score = self._compute_proximity(symbol, seed_ids, subgraph) - - # Combined score - symbol.score = ( - self.weights.text_relevance * text_score + - self.weights.graph_centrality * centrality_score + - self.weights.structural_proximity * proximity_score - ) - - # Cluster score = max symbol score - cluster.score = max(s.score for s in cluster.symbols) - cluster.symbols.sort(key=lambda s: s.score, reverse=True) - - # Sort clusters by score - clusters.sort(key=lambda c: c.score, reverse=True) - return clusters - - def _compute_proximity( - self, - symbol: CodeSymbolNode, - seed_ids: Set[str], - graph: nx.DiGraph - ) -> float: - """Compute proximity score based on shortest path to seeds""" - if symbol.id in seed_ids: - return 1.0 - - min_distance = float('inf') - for seed_id in seed_ids: - try: - distance = nx.shortest_path_length(graph, seed_id, symbol.id) - min_distance = min(min_distance, distance) - except nx.NetworkXNoPath: - 
continue - - if min_distance == float('inf'): - return 0.0 - - # Inverse distance scoring (closer = higher) - return 1.0 / (1.0 + min_distance) -``` - -## API Design - -### Endpoint: `POST /api/v1/hybrid-search` - -**Request**: -```json -{ - "query": "user authentication flow", - "top_k": 10, - "config_overrides": { - "ranking_weights": {"w1": 0.5, "w2": 0.3, "w3": 0.2}, - "max_graph_depth": 2, - "clustering_resolution": 1.0 - } -} -``` - -**Response**: -```json -{ - "query_id": "hs-20250120-001", - "execution_time_ms": 1250, - "results": [ - { - "cluster_id": "cluster_0", - "score": 0.92, - "title": "User Authentication Handler", - "symbols": [ - { - "id": "src/auth/handler.py:authenticate:45", - "name": "authenticate", - "kind": "function", - "file_path": "src/auth/handler.py", - "range": {"start": {"line": 45, "char": 0}, "end": {"line": 78, "char": 0}}, - "score": 0.95, - "raw_code": "async def authenticate(request: Request):\n ..." - }, - { - "id": "src/auth/handler.py:validate_token:80", - "name": "validate_token", - "kind": "function", - "file_path": "src/auth/handler.py", - "score": 0.88, - "raw_code": "def validate_token(token: str) -> bool:\n ..." - } - ] - } - ] -} -``` - -## Implementation Priorities - -### P0 - Core Infrastructure (Week 1-2) -1. **HybridSearchEngine skeleton** - Basic orchestration without all features -2. **LspBridge with caching** - Connect to LanguageServerMultiplexer -3. **GraphBuilder basic** - Seed expansion with references only -4. **Integration test** - Verify LSP communication works - -### P1 - Search Pipeline (Week 2-3) -1. **VectorSearchService** - Embedding model + FAISS index -2. **ClusteringService** - Louvain algorithm + noise filtering -3. **End-to-end pipeline** - Query to clustered results - -### P2 - Ranking & API (Week 3-4) -1. **RankingService** - Multi-factor scoring -2. **API endpoint** - FastAPI integration -3. **Performance optimization** - Caching, parallelization, timeouts -4. 
**Configuration system** - Dynamic weight adjustment - -## Performance Targets - -| Metric | Target | Strategy | -|--------|--------|----------| -| End-to-end latency | < 2s | Parallel LSP calls, aggressive caching | -| Vector search | < 100ms | FAISS with GPU (optional) | -| LSP expansion | < 1s | Max 10 concurrent requests, 2s timeout | -| Clustering | < 200ms | Limit graph size to 100 nodes | -| Reranking | < 100ms | Pre-computed embeddings | - -## Dependencies - -### External -- LanguageServerMultiplexer (from REAL_LSP_SERVER_PLAN.md) -- Language servers: pylsp, tsserver, gopls, rust-analyzer - -### Python Libraries -- `sentence-transformers` - Embedding models -- `faiss-cpu` or `hnswlib` - Vector indexing -- `networkx` - Graph algorithms -- `python-louvain` - Community detection -- `aiohttp` - Async HTTP client - -## File Structure - -``` -src/codexlens/ -├── hybrid_search/ -│ ├── __init__.py -│ ├── engine.py # HybridSearchEngine -│ ├── pipeline.py # Pipeline stage definitions -│ └── data_structures.py # CodeSymbolNode, Graph, Cluster -├── services/ -│ ├── vector_search.py # VectorSearchService -│ └── lsp_bridge.py # LspBridge -├── graph/ -│ └── builder.py # GraphBuilder -├── clustering/ -│ └── algorithms.py # ClusteringService -├── ranking/ -│ └── service.py # RankingService -├── api/ -│ └── endpoints.py # API routes -└── configs/ - └── hybrid_search_config.py -``` - -## Risk Mitigation - -| Risk | Impact | Mitigation | -|------|--------|------------| -| LSP timeout | High | Fallback to vector-only results | -| LSP not available | High | Graceful degradation to CodexLens index | -| Large codebases | Medium | Limit graph expansion, pagination | -| Language server crash | Medium | Auto-restart, circuit breaker | -| Clustering quality | Low | Tunable resolution parameter | - ---- - -*Generated from Gemini analysis (Session: 1768836775699-gemini)* -*Date: 2025-01-20* diff --git a/codex-lens/docs/IMPLEMENTATION_SUMMARY.md 
b/codex-lens/docs/IMPLEMENTATION_SUMMARY.md deleted file mode 100644 index 08dd775d..00000000 --- a/codex-lens/docs/IMPLEMENTATION_SUMMARY.md +++ /dev/null @@ -1,363 +0,0 @@ -# CodexLens Real LSP Implementation - Summary - -> **Date**: 2026-01-19 -> **Status**: Planning Complete, Implementation Ready -> **Focus**: Real LSP Server + VSCode Bridge Integration - ---- - -## ✅ Completed Work - -### 1. Planning Documents - -#### a. Main Implementation Plan -**File**: `docs/REAL_LSP_SERVER_PLAN.md` - -**Content**: -- Complete architecture design for real LSP server -- 5-phase implementation plan -- Multi-language support strategy (TypeScript, Python, Go, Rust, Java, C/C++) -- Language server multiplexer design -- Position tolerance feature (cclsp-like) -- MCP integration layer - -**Key Decisions**: -- Use `pygls` library for LSP implementation -- Support 6+ language servers via multiplexer -- Implement position tolerance for fuzzy AI-generated positions -- Three integration paths: Standalone LSP, VSCode Bridge, Index-based fallback - -#### b. VSCode Bridge Implementation (Appendix A) -**Included in**: `docs/REAL_LSP_SERVER_PLAN.md` - -**Content**: -- HTTP-based VSCode extension bridge -- MCP tool integration (vscode_lsp) -- Complete architecture diagram -- API endpoint specifications -- Comparison with standalone LSP approach - -### 2. VSCode Bridge Extension - -#### Created Files: -1. **`ccw-vscode-bridge/package.json`** - - VSCode extension manifest - - Dependencies: @types/node, @types/vscode, typescript - -2. **`ccw-vscode-bridge/tsconfig.json`** - - TypeScript compilation configuration - - Target: ES2020, CommonJS modules - -3. **`ccw-vscode-bridge/src/extension.ts`** - - HTTP server on port 3457 - - 4 API endpoints: - - `POST /get_definition` - - `POST /get_references` - - `POST /get_hover` - - `POST /get_document_symbols` - - VSCode API integration via `vscode.commands.executeCommand` - -4. 
**`ccw-vscode-bridge/.vscodeignore`** - - Build artifact exclusion rules - -5. **`ccw-vscode-bridge/README.md`** - - Installation & usage instructions - - API endpoint documentation - -#### Features: -- ✅ Real-time VSCode LSP integration -- ✅ HTTP REST API for external tools -- ✅ CORS support -- ✅ Error handling -- ✅ Automatic VSCode feature detection - -### 3. CCW MCP Tool - -#### Created File: -**`ccw/src/tools/vscode-lsp.ts`** - -**Features**: -- ✅ 4 LSP actions: get_definition, get_references, get_hover, get_document_symbols -- ✅ Zod schema validation -- ✅ HTTP client with timeout (10s) -- ✅ Connection retry logic -- ✅ Comprehensive error messages - -**Parameters**: -- `action` (required): LSP action type -- `file_path` (required): Absolute file path -- `line` (optional): Line number (1-based) -- `character` (optional): Character position (1-based) - -#### Integration: -**Modified File**: `ccw/src/tools/index.ts` - -- ✅ Imported `vscodeLspMod` -- ✅ Registered tool via `registerTool(toLegacyTool(vscodeLspMod))` -- ✅ Available in MCP server tool list - ---- - -## 📋 Implementation Architecture - -### Three Integration Paths - -``` -Path 1: VSCode Bridge (✅ Implemented) -───────────────────────────────────── -Claude Code → vscode_lsp MCP tool → HTTP → ccw-vscode-bridge → VSCode API → Language Servers - -Path 2: Standalone LSP Server (📝 Planned) -────────────────────────────────────────── -Any LSP Client → codexlens-lsp → Language Server Multiplexer → Language Servers - -Path 3: Index-Based (✅ Existing) -───────────────────────────────── -Claude Code → codex_lens_lsp → Python API → SQLite Index → Cached Results -``` - -### Smart Routing Strategy - -```javascript -// Priority: VSCode Bridge → Standalone LSP → Index-based -if (vscodeBridgeAvailable) { - return useVSCodeBridge(); -} else if (standaloneLSPAvailable) { - return useStandaloneLSP(); -} else { - return useIndexBased(); -} -``` - ---- - -## 🎯 Next Steps - -### Immediate Actions (Phase 1) - -1. 
**Test VSCode Bridge** - ```bash - cd ccw-vscode-bridge - npm install - npm run compile - # Press F5 in VSCode to launch extension - ``` - -2. **Test vscode_lsp Tool** - ```bash - # Start CCW MCP server - cd ccw - npm run mcp - - # Test via MCP client - { - "tool": "vscode_lsp", - "arguments": { - "action": "get_definition", - "file_path": "/path/to/file.ts", - "line": 10, - "character": 5 - } - } - ``` - -3. **Document Testing Results** - - Create test reports - - Benchmark latency - - Validate accuracy - -### Medium-Term Goals (Phase 2-3) - -1. **Implement Standalone LSP Server** - - Setup `codexlens-lsp` project structure - - Implement language server multiplexer - - Add core LSP handlers - -2. **Add Position Tolerance** - - Implement fuzzy position matching - - Test with AI-generated positions - -3. **Create Integration Tests** - - Unit tests for each component - - E2E tests with real language servers - - Performance benchmarks - -### Long-Term Goals (Phase 4-5) - -1. **MCP Context Enhancement** - - Integrate LSP results into MCP context - - Hook system for Claude Code - -2. **Advanced Features** - - Code actions - - Formatting - - Rename support - -3. 
**Production Deployment** - - Package VSCode extension to .vsix - - Publish to VS Code marketplace - - Create installation scripts - ---- - -## 📊 Project Status Matrix - -| Component | Status | Files | Tests | Docs | -|-----------|--------|-------|-------|------| -| VSCode Bridge Extension | ✅ Complete | 5/5 | ⏳ Pending | ✅ Complete | -| vscode_lsp MCP Tool | ✅ Complete | 1/1 | ⏳ Pending | ✅ Complete | -| Tool Registration | ✅ Complete | 1/1 | N/A | N/A | -| Planning Documents | ✅ Complete | 2/2 | N/A | ✅ Complete | -| Standalone LSP Server | 📝 Planned | 0/8 | 0/12 | ✅ Complete | -| Integration Tests | 📝 Planned | 0/3 | 0/15 | ⏳ Pending | - ---- - -## 🔧 Development Environment - -### Prerequisites - -**For VSCode Bridge**: -- Node.js ≥ 18 -- VSCode ≥ 1.80 -- TypeScript ≥ 5.0 - -**For Standalone LSP**: -- Python ≥ 3.8 -- pygls ≥ 1.3.0 -- Language servers: - - TypeScript: `npm i -g typescript-language-server` - - Python: `pip install python-lsp-server` - - Go: `go install golang.org/x/tools/gopls@latest` - - Rust: `rustup component add rust-analyzer` - -### Installation Commands - -```bash -# VSCode Bridge -cd ccw-vscode-bridge -npm install -npm run compile - -# CCW MCP (already setup) -cd ccw -npm install - -# Future: Standalone LSP -cd codex-lens -pip install -e ".[lsp]" -``` - ---- - -## 📖 Documentation Index - -| Document | Purpose | Status | -|----------|---------|--------| -| `REAL_LSP_SERVER_PLAN.md` | Complete implementation plan | ✅ | -| `LSP_INTEGRATION_PLAN.md` | Original integration strategy | ✅ | -| `MCP_ENDPOINT_DESIGN.md` | MCP endpoint specifications | ✅ | -| `IMPLEMENTATION_SUMMARY.md` | This document | ✅ | -| `ccw-vscode-bridge/README.md` | Bridge usage guide | ✅ | -| `TESTING_GUIDE.md` | Testing procedures | ⏳ TODO | -| `DEPLOYMENT_GUIDE.md` | Production deployment | ⏳ TODO | - ---- - -## 💡 Key Design Decisions - -### 1. Why Three Integration Paths? 
- -- **VSCode Bridge**: Easiest setup, leverages VSCode's built-in language servers -- **Standalone LSP**: IDE-agnostic, works with any LSP client -- **Index-based**: Fallback for offline or cached queries - -### 2. Why HTTP for VSCode Bridge? - -- ✅ Simplest cross-process communication -- ✅ No complex IPC/socket management -- ✅ Easy to debug with curl/Postman -- ✅ CORS support for web-based tools - -### 3. Why Port 3457? - -- Unique port unlikely to conflict -- Easy to remember (345-7) -- Differs from cclsp, which uses stdio rather than a network port - -### 4. Why Not Modify smart_search? - -User feedback: -> "第一种跟当前的符号搜索没区别哎" -> (Method 1 has no difference from current symbol search) - -**Solution**: Implement real LSP server that connects to live language servers, not pre-indexed data. - ---- - -## 🚀 Quick Start Guide - -### Test VSCode Bridge Now - -1. **Install Extension**: - ```bash - cd ccw-vscode-bridge - npm install && npm run compile - code --install-extension . - ``` - -2. **Reload VSCode**: - - Press `Cmd+Shift+P` (Mac) or `Ctrl+Shift+P` (Windows) - - Type "Reload Window" - -3. **Verify Bridge is Running**: - ```bash - curl http://localhost:3457/get_definition \ - -X POST \ - -H "Content-Type: application/json" \ - -d '{"file_path":"/path/to/file.ts","line":10,"character":5}' - ``` - -4. **Test via CCW**: - ```javascript - // In Claude Code or MCP client - await executeTool('vscode_lsp', { - action: 'get_definition', - file_path: '/absolute/path/to/file.ts', - line: 10, - character: 5 - }); - ``` - ---- - -## 📞 Support & Troubleshooting - -### Common Issues - -**Issue**: "Could not connect to VSCode Bridge" -**Solution**: -1. Ensure VSCode is running -2. Check if extension is activated: `Cmd+Shift+P` → "CCW VSCode Bridge" -3.
Verify port 3457 is not in use: `lsof -i :3457` - -**Issue**: "No LSP server available" -**Solution**: Open the file in VSCode workspace first - -**Issue**: "File not found" -**Solution**: Use absolute paths, not relative - ---- - -## 📝 Change Log - -### 2026-01-19 - Initial Implementation -- Created VSCode Bridge extension (5 files) -- Implemented vscode_lsp MCP tool -- Registered tool in CCW registry -- Completed planning documentation -- Added comprehensive architecture diagrams - ---- - -**Document End** diff --git a/codex-lens/docs/LLM_REMOVAL_SUMMARY.md b/codex-lens/docs/LLM_REMOVAL_SUMMARY.md deleted file mode 100644 index 30b090d0..00000000 --- a/codex-lens/docs/LLM_REMOVAL_SUMMARY.md +++ /dev/null @@ -1,342 +0,0 @@ -# LLM增强功能移除总结 - -**移除日期**: 2025-12-16 -**执行者**: 用户请求 -**状态**: ✅ 完成 - ---- - -## 📋 移除清单 - -### ✅ 已删除的源代码文件 - -| 文件 | 说明 | -|------|------| -| `src/codexlens/semantic/llm_enhancer.py` | LLM增强核心模块 (900+ lines) | - -### ✅ 已修改的源代码文件 - -| 文件 | 修改内容 | -|------|---------| -| `src/codexlens/cli/commands.py` | 删除 `enhance` 命令 (lines 1050-1227) | -| `src/codexlens/semantic/__init__.py` | 删除LLM相关导出 (lines 35-69) | - -### ✅ 已修改的前端文件(CCW Dashboard) - -| 文件 | 修改内容 | -|------|---------| -| `ccw/src/templates/dashboard-js/components/cli-status.js` | 删除LLM增强设置 (8行)、Semantic Settings Modal (615行)、Metadata Viewer (326行) | -| `ccw/src/templates/dashboard-js/i18n.js` | 删除英文LLM翻译 (26行)、中文LLM翻译 (26行) | -| `ccw/src/templates/dashboard-js/views/cli-manager.js` | 移除LLM badge和设置modal调用 (3行) | - -### ✅ 已删除的测试文件 - -| 文件 | 说明 | -|------|------| -| `tests/test_llm_enhancer.py` | LLM增强单元测试 | -| `tests/test_llm_enhanced_search.py` | LLM vs 纯向量对比测试 (550+ lines) | - -### ✅ 已删除的脚本文件 - -| 文件 | 说明 | -|------|------| -| `scripts/compare_search_methods.py` | 纯向量 vs LLM增强对比脚本 (460+ lines) | -| `scripts/test_misleading_comments.py` | 误导性注释测试脚本 (490+ lines) | -| `scripts/show_llm_analysis.py` | LLM分析展示工具 | -| `scripts/inspect_llm_summaries.py` | LLM摘要检查工具 | - -### ✅ 已删除的文档文件 - -| 文件 | 说明 
| -|------|------| -| `docs/LLM_ENHANCED_SEARCH_GUIDE.md` | LLM增强使用指南 (460+ lines) | -| `docs/LLM_ENHANCEMENT_TEST_RESULTS.md` | LLM测试结果文档 | -| `docs/MISLEADING_COMMENTS_TEST_RESULTS.md` | 误导性注释测试结果 | -| `docs/CLI_INTEGRATION_SUMMARY.md` | CLI集成文档(包含enhance命令) | -| `docs/DOCSTRING_LLM_HYBRID_DESIGN.md` | Docstring与LLM混合策略设计 | - -### ✅ 已更新的文档 - -| 文件 | 修改内容 | -|------|---------| -| `docs/IMPLEMENTATION_SUMMARY.md` | 添加LLM移除说明,列出已删除内容 | - -### 📚 保留的设计文档(作为历史参考) - -| 文件 | 说明 | -|------|------| -| `docs/DESIGN_EVALUATION_REPORT.md` | 包含LLM混合策略的技术评估报告 | -| `docs/SEMANTIC_GRAPH_DESIGN.md` | 语义图谱设计(可能提及LLM) | -| `docs/MULTILEVEL_CHUNKER_DESIGN.md` | 多层次分词器设计(可能提及LLM) | - -*这些文档保留作为技术历史参考,不影响当前功能。* - ---- - -## 🔒 移除的功能 - -### CLI命令 - -```bash -# 已移除 - 不再可用 -codexlens enhance [PATH] --tool gemini --batch-size 5 - -# 说明:此命令用于通过CCW CLI调用Gemini/Qwen生成代码摘要 -# 移除原因:减少外部依赖,简化维护 -``` - -### Python API - -```python -# 已移除 - 不再可用 -from codexlens.semantic import ( - LLMEnhancer, - LLMConfig, - SemanticMetadata, - FileData, - EnhancedSemanticIndexer, - create_enhancer, - create_enhanced_indexer, -) - -# 移除的类和函数: -# - LLMEnhancer: LLM增强器主类 -# - LLMConfig: LLM配置类 -# - SemanticMetadata: 语义元数据结构 -# - FileData: 文件数据结构 -# - EnhancedSemanticIndexer: LLM增强索引器 -# - create_enhancer(): 创建增强器的工厂函数 -# - create_enhanced_indexer(): 创建增强索引器的工厂函数 -``` - ---- - -## ✅ 保留的功能 - -### 完全保留的核心功能 - -| 功能 | 状态 | -|------|------| -| **纯向量搜索** | ✅ 完整保留 | -| **语义嵌入生成** | ✅ 完整保留 (`codexlens embeddings-generate`) | -| **语义嵌入状态检查** | ✅ 完整保留 (`codexlens embeddings-status`) | -| **混合搜索引擎** | ✅ 完整保留(exact + fuzzy + vector) | -| **向量存储** | ✅ 完整保留 | -| **语义分块** | ✅ 完整保留 | -| **fastembed集成** | ✅ 完整保留 | - -### 可用的CLI命令 - -```bash -# 生成纯向量嵌入(无需LLM) -codexlens embeddings-generate [PATH] - -# 检查嵌入状态 -codexlens embeddings-status [PATH] - -# 所有搜索命令 -codexlens search [QUERY] --index [PATH] - -# 所有索引管理命令 -codexlens init [PATH] -codexlens update [PATH] -codexlens clean [PATH] -``` - -### 可用的Python API - -```python -# 完全可用 - 纯向量搜索 
-from codexlens.semantic import SEMANTIC_AVAILABLE, SEMANTIC_BACKEND -from codexlens.semantic.embedder import Embedder -from codexlens.semantic.vector_store import VectorStore -from codexlens.semantic.chunker import Chunker, ChunkConfig -from codexlens.search.hybrid_search import HybridSearchEngine - -# 示例:纯向量搜索 -engine = HybridSearchEngine() -results = engine.search( - index_path, - query="your search query", - enable_vector=True, - pure_vector=True, # 纯向量模式 -) -``` - ---- - -## 🎯 移除原因 - -### 1. 简化依赖 - -**移除的外部依赖**: -- CCW CLI (npm package) -- Gemini API (需要API密钥) -- Qwen API (可选) - -**保留的依赖**: -- fastembed (ONNX-based,轻量级) -- numpy -- Python标准库 - -### 2. 减少复杂性 - -- **前**: 两种搜索方式(纯向量 + LLM增强) -- **后**: 一种搜索方式(纯向量) -- 移除了900+ lines的LLM增强代码 -- 移除了CLI命令和相关配置 -- 移除了测试和文档 - -### 3. 性能考虑 - -| 方面 | LLM增强 | 纯向量 | -|------|---------|--------| -| **索引速度** | 慢75倍 | 基准 | -| **查询速度** | 相同 | 相同 | -| **准确率** | 相同* | 基准 | -| **成本** | API费用 | 免费 | - -*在测试数据集上准确率相同(5/5),但LLM增强理论上在更复杂场景下可能更好 - -### 4. 维护负担 - -**移除前**: -- 需要维护CCW CLI集成 -- 需要处理API限流和错误 -- 需要测试多个LLM后端 -- 需要维护批处理逻辑 - -**移除后**: -- 单一嵌入引擎(fastembed) -- 无外部API依赖 -- 更简单的错误处理 -- 更容易测试 - ---- - -## 🔍 验证结果 - -### 导入测试 - -```bash -# ✅ 通过 - 语义模块正常 -python -c "from codexlens.semantic import SEMANTIC_AVAILABLE; print(SEMANTIC_AVAILABLE)" -# Output: True - -# ✅ 通过 - 搜索引擎正常 -python -c "from codexlens.search.hybrid_search import HybridSearchEngine; print('OK')" -# Output: OK -``` - -### 代码清洁度验证 - -```bash -# ✅ 通过 - 无遗留LLM引用 -grep -r "llm_enhancer\|LLMEnhancer\|LLMConfig" src/ --include="*.py" -# Output: (空) -``` - -### 测试结果 - -```bash -# ✅ 5/7通过 - 纯向量搜索基本功能正常 -pytest tests/test_pure_vector_search.py -v -# 通过: 5个基本测试 -# 失败: 2个嵌入测试(已知的模型维度不匹配问题,与LLM移除无关) -``` - ---- - -## 📊 统计 - -### 代码删除统计 - -| 类型 | 删除文件数 | 删除行数(估计) | -|------|-----------|-----------------| -| **源代码** | 1 | ~900 lines | -| **CLI命令** | 1 command | ~180 lines | -| **导出清理** | 1 section | ~35 lines | -| **前端代码** | 3 files | ~1000 lines | -| **测试文件** | 2 | ~600 lines | -| 
**脚本工具** | 4 | ~1500 lines | -| **文档** | 5 | ~2000 lines | -| **总计** | 16 files/sections | ~6200 lines | - -### 依赖简化 - -| 方面 | 移除前 | 移除后 | -|------|--------|--------| -| **外部工具依赖** | CCW CLI, Gemini/Qwen | 无 | -| **Python包依赖** | fastembed, numpy | fastembed, numpy | -| **API依赖** | Gemini/Qwen API | 无 | -| **配置复杂度** | 高(tool, batch_size, API keys) | 低(model profile) | - ---- - -## 🚀 后续建议 - -### 如果需要LLM增强功能 - -1. **从git历史恢复** - ```bash - # 查看删除前的提交 - git log --all --full-history -- "*llm_enhancer*" - - # 恢复特定文件 - git checkout <commit-hash> -- src/codexlens/semantic/llm_enhancer.py - ``` - -2. **或使用外部工具** - - 在索引前使用独立脚本生成摘要 - - 将摘要作为注释添加到代码中 - - 然后使用纯向量索引(会包含摘要) - -3. **或考虑轻量级替代方案** - - 使用本地小模型(llama.cpp, ggml) - - 使用docstring提取(无需LLM) - - 使用静态分析生成摘要 - -### 代码库维护建议 - -1. ✅ **保持简单** - 继续使用纯向量搜索 -2. ✅ **优化现有功能** - 改进向量搜索准确性 -3. ✅ **增量改进** - 优化分块策略和嵌入质量 -4. ⚠️ **避免重复** - 如需LLM,先评估是否真正必要 - ---- - -## 📝 文件清单 - -### 删除的文件完整列表 - -``` -src/codexlens/semantic/llm_enhancer.py -tests/test_llm_enhancer.py -tests/test_llm_enhanced_search.py -scripts/compare_search_methods.py -scripts/test_misleading_comments.py -scripts/show_llm_analysis.py -scripts/inspect_llm_summaries.py -docs/LLM_ENHANCED_SEARCH_GUIDE.md -docs/LLM_ENHANCEMENT_TEST_RESULTS.md -docs/MISLEADING_COMMENTS_TEST_RESULTS.md -docs/CLI_INTEGRATION_SUMMARY.md -docs/DOCSTRING_LLM_HYBRID_DESIGN.md -``` - -### 修改的文件 - -``` -src/codexlens/cli/commands.py (删除enhance命令) -src/codexlens/semantic/__init__.py (删除LLM导出) -ccw/src/templates/dashboard-js/components/cli-status.js (删除LLM配置、Settings Modal、Metadata Viewer) -ccw/src/templates/dashboard-js/i18n.js (删除LLM翻译字符串) -ccw/src/templates/dashboard-js/views/cli-manager.js (移除LLM badge和modal调用) -docs/IMPLEMENTATION_SUMMARY.md (添加移除说明) -``` - ---- - -**移除完成时间**: 2025-12-16 -**文档版本**: 1.0 -**验证状态**: ✅ 通过 diff --git a/codex-lens/docs/LSP_INTEGRATION_CHECKLIST.md b/codex-lens/docs/LSP_INTEGRATION_CHECKLIST.md deleted file mode 100644 index 838d5b3a..00000000 ---
a/codex-lens/docs/LSP_INTEGRATION_CHECKLIST.md +++ /dev/null @@ -1,316 +0,0 @@ -# codex-lens LSP Integration Execution Checklist - -> Generated: 2026-01-15 -> Based on: Gemini multi-round deep analysis -> Status: Ready for implementation - ---- - -## Phase 1: LSP Server Foundation (Priority: HIGH) - -### 1.1 Create LSP Server Entry Point -- [ ] **Install pygls dependency** - ```bash - pip install pygls - ``` -- [ ] **Create `src/codexlens/lsp/__init__.py`** - - Export: `CodexLensServer`, `start_server` -- [ ] **Create `src/codexlens/lsp/server.py`** - - Class: `CodexLensServer(LanguageServer)` - - Initialize: `ChainSearchEngine`, `GlobalSymbolIndex`, `WatcherManager` - - Lifecycle: Start `WatcherManager` on `initialize` request - -### 1.2 Implement Core LSP Handlers -- [ ] **`textDocument/definition`** handler - - Source: `GlobalSymbolIndex.search()` exact match - - Reference: `storage/global_index.py:173` - - Return: `Location(uri, Range)` - -- [ ] **`textDocument/completion`** handler - - Source: `GlobalSymbolIndex.search(prefix_mode=True)` - - Reference: `storage/global_index.py:173` - - Return: `CompletionItem[]` - -- [ ] **`workspace/symbol`** handler - - Source: `ChainSearchEngine.search_symbols()` - - Reference: `search/chain_search.py:618` - - Return: `SymbolInformation[]` - -### 1.3 Wire File Watcher to LSP Events -- [ ] **`workspace/didChangeWatchedFiles`** handler - - Delegate to: `WatcherManager.process_changes()` - - Reference: `watcher/manager.py:53` - -- [ ] **`textDocument/didSave`** handler - - Trigger: `IncrementalIndexer` for single file - - Reference: `watcher/incremental_indexer.py` - -### 1.4 Deliverables -- [ ] Unit tests for LSP handlers -- [ ] Integration test: definition lookup -- [ ] Integration test: completion prefix search -- [ ] Benchmark: query latency < 50ms - ---- - -## Phase 2: Find References Implementation (Priority: MEDIUM) - -### 2.1 Create `search_references` Method -- [ ] **Add to `src/codexlens/search/chain_search.py`** - 
```python - def search_references( - self, - symbol_name: str, - source_path: Path, - depth: int = -1 - ) -> List[ReferenceResult]: - """Find all references to a symbol across the project.""" - ``` - -### 2.2 Implement Parallel Query Orchestration -- [ ] **Collect index paths** - - Use: `_collect_index_paths()` existing method - -- [ ] **Parallel query execution** - - ThreadPoolExecutor across all `_index.db` - - SQL: `SELECT * FROM code_relationships WHERE target_qualified_name = ?` - - Reference: `storage/sqlite_store.py:348` - -- [ ] **Result aggregation** - - Deduplicate by file:line - - Sort by file path, then line number - -### 2.3 LSP Handler -- [ ] **`textDocument/references`** handler - - Call: `ChainSearchEngine.search_references()` - - Return: `Location[]` - -### 2.4 Deliverables -- [ ] Unit test: single-index reference lookup -- [ ] Integration test: cross-directory references -- [ ] Benchmark: < 200ms for 10+ index files - ---- - -## Phase 3: Enhanced Hover Information (Priority: MEDIUM) - -### 3.1 Implement Hover Data Extraction -- [ ] **Create `src/codexlens/lsp/hover_provider.py`** - ```python - class HoverProvider: - def get_hover_info(self, symbol: Symbol) -> HoverInfo: - """Extract hover information for a symbol.""" - ``` - -### 3.2 Data Sources -- [ ] **Symbol metadata** - - Source: `GlobalSymbolIndex.search()` - - Fields: `kind`, `name`, `file_path`, `range` - -- [ ] **Source code extraction** - - Source: `SQLiteStore.files` table - - Reference: `storage/sqlite_store.py:284` - - Extract: Lines from `range[0]` to `range[1]` - -### 3.3 LSP Handler -- [ ] **`textDocument/hover`** handler - - Return: `Hover(contents=MarkupContent)` - - Format: Markdown with code fence - -### 3.4 Deliverables -- [ ] Unit test: hover for function/class/variable -- [ ] Integration test: multi-line function signature - ---- - -## Phase 4: MCP Bridge for Claude Code (Priority: HIGH VALUE) - -### 4.1 Define MCP Schema -- [ ] **Create `src/codexlens/mcp/__init__.py`** -- 
[ ] **Create `src/codexlens/mcp/schema.py`** - ```python - @dataclass - class MCPContext: - version: str = "1.0" - context_type: str - symbol: Optional[SymbolInfo] - definition: Optional[str] - references: List[ReferenceInfo] - related_symbols: List[SymbolInfo] - ``` - -### 4.2 Create MCP Provider -- [ ] **Create `src/codexlens/mcp/provider.py`** - ```python - class MCPProvider: - def build_context( - self, - symbol_name: str, - context_type: str = "symbol_explanation" - ) -> MCPContext: - """Build structured context for LLM consumption.""" - ``` - -### 4.3 Context Building Logic -- [ ] **Symbol lookup** - - Use: `GlobalSymbolIndex.search()` - -- [ ] **Definition extraction** - - Use: `SQLiteStore` file content - -- [ ] **References collection** - - Use: `ChainSearchEngine.search_references()` - -- [ ] **Related symbols** - - Use: `code_relationships` for imports/calls - -### 4.4 Hook Integration Points -- [ ] **Document `pre-tool` hook interface** - ```python - def pre_tool_hook(action: str, params: dict) -> MCPContext: - """Called before LLM action to gather context.""" - ``` - -- [ ] **Document `post-tool` hook interface** - ```python - def post_tool_hook(action: str, result: Any) -> None: - """Called after LSP action for proactive caching.""" - ``` - -### 4.5 Deliverables -- [ ] MCP schema JSON documentation -- [ ] Unit test: context building -- [ ] Integration test: hook → MCP → JSON output - ---- - -## Phase 5: Advanced Features (Priority: LOW) - -### 5.1 Custom LSP Commands -- [ ] **`codexlens/hybridSearch`** - - Expose: `HybridSearchEngine.search()` - - Reference: `search/hybrid_search.py` - -- [ ] **`codexlens/symbolGraph`** - - Return: Symbol relationship graph - - Source: `code_relationships` table - -### 5.2 Proactive Context Caching -- [ ] **Implement `post-tool` hook caching** - - After `go-to-definition`: pre-fetch references - - Cache TTL: 5 minutes - - Storage: In-memory LRU - -### 5.3 Performance Optimizations -- [ ] **Connection pooling** - - 
Reference: `storage/sqlite_store.py` thread-local - -- [ ] **Result caching** - - LRU cache for frequent queries - - Invalidate on file change - ---- - -## File Structure After Implementation - -``` -src/codexlens/ -├── lsp/ # NEW -│ ├── __init__.py -│ ├── server.py # Main LSP server -│ ├── handlers.py # LSP request handlers -│ ├── hover_provider.py # Hover information -│ └── utils.py # LSP utilities -│ -├── mcp/ # NEW -│ ├── __init__.py -│ ├── schema.py # MCP data models -│ ├── provider.py # Context builder -│ └── hooks.py # Hook interfaces -│ -├── search/ -│ ├── chain_search.py # MODIFY: add search_references() -│ └── ... -│ -└── ... -``` - ---- - -## Dependencies to Add - -```toml -# pyproject.toml -[project.optional-dependencies] -lsp = [ - "pygls>=1.3.0", -] -``` - ---- - -## Testing Strategy - -### Unit Tests -``` -tests/ -├── lsp/ -│ ├── test_definition.py -│ ├── test_completion.py -│ ├── test_references.py -│ └── test_hover.py -│ -└── mcp/ - ├── test_schema.py - └── test_provider.py -``` - -### Integration Tests -- [ ] Full LSP handshake test -- [ ] Multi-file project navigation -- [ ] Incremental index update via didSave - -### Performance Benchmarks -| Operation | Target | Acceptable | -|-----------|--------|------------| -| Definition lookup | < 30ms | < 50ms | -| Completion (100 items) | < 50ms | < 100ms | -| Find references (10 files) | < 150ms | < 200ms | -| Initial indexing (1000 files) | < 60s | < 120s | - ---- - -## Execution Order - -``` -Week 1: Phase 1.1 → 1.2 → 1.3 → 1.4 -Week 2: Phase 2.1 → 2.2 → 2.3 → 2.4 -Week 3: Phase 3 + Phase 4.1 → 4.2 -Week 4: Phase 4.3 → 4.4 → 4.5 -Week 5: Phase 5 (optional) + Polish -``` - ---- - -## Quick Start Commands - -```bash -# Install LSP dependencies -pip install pygls - -# Run LSP server (after implementation) -python -m codexlens.lsp --stdio - -# Test LSP connection -echo '{"jsonrpc":"2.0","id":1,"method":"initialize","params":{}}' | python -m codexlens.lsp --stdio -``` - ---- - -## Reference Links - -- 
pygls Documentation: https://pygls.readthedocs.io/ -- LSP Specification: https://microsoft.github.io/language-server-protocol/ -- codex-lens GlobalSymbolIndex: `storage/global_index.py:173` -- codex-lens ChainSearchEngine: `search/chain_search.py:618` -- codex-lens WatcherManager: `watcher/manager.py:53` diff --git a/codex-lens/docs/LSP_INTEGRATION_PLAN.md b/codex-lens/docs/LSP_INTEGRATION_PLAN.md deleted file mode 100644 index 764ec3cd..00000000 --- a/codex-lens/docs/LSP_INTEGRATION_PLAN.md +++ /dev/null @@ -1,2588 +0,0 @@ -# codex-lens LSP Integration - Complete Execution Plan - -> Version: 1.0 -> Created: 2026-01-15 -> Based on: Gemini Multi-Round Deep Analysis -> Status: Ready for Execution - ---- - -## Table of Contents - -1. [Executive Summary](#1-executive-summary) -2. [Claude Code LSP Implementation Reference](#2-claude-code-lsp-implementation-reference) -3. [Architecture Overview](#3-architecture-overview) -4. [Phase 1: LSP Server Foundation](#4-phase-1-lsp-server-foundation) -5. [Phase 2: Find References](#5-phase-2-find-references) -6. [Phase 3: Hover Information](#6-phase-3-hover-information) -7. [Phase 4: MCP Bridge](#7-phase-4-mcp-bridge) -8. [Phase 5: Advanced Features](#8-phase-5-advanced-features) -9. [Testing Strategy](#9-testing-strategy) -10. [Deployment Guide](#10-deployment-guide) -11. [Risk Mitigation](#11-risk-mitigation) - ---- - -## 1. 
Executive Summary - -### 1.1 Project Goal - -将 codex-lens 的代码索引和搜索能力通过 LSP (Language Server Protocol) 暴露,使其能够: -- 为 IDE/编辑器提供代码导航功能 -- 与 Claude Code 的 hook 系统集成 -- 通过 MCP (Model Context Protocol) 为 LLM 提供结构化代码上下文 - -### 1.2 Value Proposition - -| Capability | Before | After | -|------------|--------|-------| -| Code Navigation | CLI only | IDE integration via LSP | -| Context for LLM | Manual copy-paste | Automated MCP injection | -| Real-time Updates | Batch re-index | Incremental on save | -| Cross-project Search | Per-directory | Unified global index | - -### 1.3 Success Criteria - -- [ ] All 5 core LSP methods implemented and tested -- [ ] Query latency < 100ms for 95th percentile -- [ ] MCP context generation working with Claude Code hooks -- [ ] Documentation and examples complete - ---- - -## 2. Claude Code LSP Implementation Reference - -> 本章节记录 Claude Code 当前 LSP 实现方式,作为 codex-lens 集成的技术参考。 - -### 2.1 Claude Code LSP 实现方式概览 - -Claude Code 实现 LSP 功能有 **三种方式**: - -| 方式 | 描述 | 适用场景 | -|------|------|----------| -| **内置 LSP 工具** | v2.0.74+ 原生支持 | 快速启用,基础功能 | -| **MCP Server (cclsp)** | 第三方 MCP 桥接 | 高级功能,位置容错 | -| **Plugin Marketplace** | 插件市场安装 | 多语言扩展支持 | - -### 2.2 方式一:内置 LSP 工具 (v2.0.74+) - -Claude Code 从 v2.0.74 版本开始内置 LSP 支持。 - -#### 启用方式 - -```bash -# 设置环境变量启用 LSP -export ENABLE_LSP_TOOL=1 -claude - -# 永久启用 (添加到 shell 配置) -echo 'export ENABLE_LSP_TOOL=1' >> ~/.bashrc -``` - -#### 内置 LSP 工具清单 - -| 工具名 | 功能 | 对应 LSP 方法 | 性能 | -|--------|------|---------------|------| -| `goToDefinition` | 跳转到符号定义 | `textDocument/definition` | ~50ms | -| `findReferences` | 查找所有引用 | `textDocument/references` | ~100ms | -| `documentSymbol` | 获取文件符号结构 | `textDocument/documentSymbol` | ~30ms | -| `hover` | 显示类型签名和文档 | `textDocument/hover` | ~50ms | -| `getDiagnostics` | 获取诊断信息 | `textDocument/diagnostic` | ~100ms | - -#### 性能对比 - -``` -传统文本搜索: ~45,000ms (45秒) -LSP 语义搜索: ~50ms -性能提升: 约 900 倍 -``` - -#### 当前限制 - -- 部分语言返回 "No LSP server available" -- 需要额外安装语言服务器插件 -- 不支持重命名等高级操作 
- -### 2.3 方式二:MCP Server 方式 (cclsp) - -[cclsp](https://github.com/ktnyt/cclsp) 是一个 MCP Server,将 LSP 能力暴露给 Claude Code。 - -#### 架构图 - -``` -┌─────────────────────────────────────────────────────────────────┐ -│ Claude Code │ -│ (MCP Client) │ -└───────────────────────────┬─────────────────────────────────────┘ - │ - │ MCP Protocol (JSON-RPC over stdio) - │ -┌───────────────────────────▼─────────────────────────────────────┐ -│ cclsp │ -│ (MCP Server) │ -│ ┌─────────────────────────────────────────────────────────┐ │ -│ │ Position Tolerance Layer │ │ -│ │ (自动尝试多个位置组合,解决 AI 行号不精确问题) │ │ -│ └─────────────────────────────────────────────────────────┘ │ -│ │ │ -│ ┌──────────────────┼──────────────────┐ │ -│ │ │ │ │ -│ ▼ ▼ ▼ │ -│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ -│ │ pylsp │ │ gopls │ │rust-analyzer│ │ -│ │ (Python) │ │ (Go) │ │ (Rust) │ │ -│ └─────────────┘ └─────────────┘ └─────────────┘ │ -└─────────────────────────────────────────────────────────────────┘ -``` - -#### 安装与配置 - -```bash -# 一次性运行 (无需安装) -npx cclsp@latest setup - -# 用户级配置 -npx cclsp@latest setup --user -``` - -#### 配置文件格式 - -**位置**: `.claude/cclsp.json` 或 `~/.config/claude/cclsp.json` - -```json -{ - "servers": [ - { - "extensions": ["py", "pyi"], - "command": ["pylsp"], - "rootDir": ".", - "restartInterval": 5, - "initializationOptions": {} - }, - { - "extensions": ["ts", "tsx", "js", "jsx"], - "command": ["typescript-language-server", "--stdio"], - "rootDir": "." - }, - { - "extensions": ["go"], - "command": ["gopls"], - "rootDir": "." - }, - { - "extensions": ["rs"], - "command": ["rust-analyzer"], - "rootDir": "." 
- } - ] -} -``` - -#### cclsp 暴露的 MCP 工具 - -| MCP 工具 | 功能 | 特性 | -|----------|------|------| -| `find_definition` | 按名称和类型查找定义 | 支持模糊匹配 | -| `find_references` | 查找所有引用位置 | 跨文件搜索 | -| `rename_symbol` | 重命名符号 | 创建 .bak 备份 | -| `rename_symbol_strict` | 精确位置重命名 | 处理同名歧义 | -| `get_diagnostics` | 获取诊断信息 | 错误/警告/提示 | -| `restart_server` | 重启 LSP 服务器 | 解决内存泄漏 | - -#### 核心特性:位置容错 - -```python -# AI 生成的代码位置常有偏差 -# cclsp 自动尝试多个位置组合 - -positions_to_try = [ - (line, column), # 原始位置 - (line - 1, column), # 上一行 - (line + 1, column), # 下一行 - (line, 0), # 行首 - (line, len(line_content)) # 行尾 -] - -for pos in positions_to_try: - result = lsp_server.definition(pos) - if result: - return result -``` - -#### 支持的语言服务器 - -| 语言 | 服务器 | 安装命令 | -|------|--------|----------| -| Python | pylsp | `pip install python-lsp-server` | -| TypeScript | typescript-language-server | `npm i -g typescript-language-server` | -| Go | gopls | `go install golang.org/x/tools/gopls@latest` | -| Rust | rust-analyzer | `rustup component add rust-analyzer` | -| C/C++ | clangd | `apt install clangd` | -| Ruby | solargraph | `gem install solargraph` | -| PHP | intelephense | `npm i -g intelephense` | -| Java | jdtls | Eclipse JDT Language Server | - -### 2.4 方式三:Plugin Marketplace 插件 - -Claude Code 官方插件市场提供语言支持扩展。 - -#### 添加插件市场 - -```bash -/plugin marketplace add boostvolt/claude-code-lsps -``` - -#### 安装语言支持 - -```bash -# Python (Pyright) -/plugin install pyright@claude-code-lsps - -# TypeScript/JavaScript -/plugin install vtsls@claude-code-lsps - -# Go -/plugin install gopls@claude-code-lsps - -# Rust -/plugin install rust-analyzer@claude-code-lsps - -# Java -/plugin install jdtls@claude-code-lsps - -# C/C++ -/plugin install clangd@claude-code-lsps - -# C# -/plugin install omnisharp@claude-code-lsps - -# PHP -/plugin install intelephense@claude-code-lsps - -# Kotlin -/plugin install kotlin-language-server@claude-code-lsps - -# Ruby -/plugin install solargraph@claude-code-lsps -``` - -#### 支持的 11 种语言 - -Python, 
TypeScript, Go, Rust, Java, C/C++, C#, PHP, Kotlin, Ruby, HTML/CSS - -### 2.5 三种方式对比 - -| 特性 | 内置 LSP | cclsp (MCP) | Plugin Marketplace | -|------|----------|-------------|-------------------| -| 安装复杂度 | 低 (环境变量) | 中 (npx) | 低 (/plugin) | -| 功能完整性 | 基础 5 个操作 | 完整 + 重命名 | 完整 | -| 位置容错 | 无 | 有 | 无 | -| 重命名支持 | 无 | 有 | 有 | -| 自定义配置 | 无 | 完整 JSON | 有限 | -| 多语言支持 | 需插件 | 任意 LSP | 11 种 | -| 生产稳定性 | 高 | 中 | 高 | - -### 2.6 codex-lens 集成策略 - -基于 Claude Code LSP 实现方式分析,推荐以下集成策略: - -#### 策略 A:作为 MCP Server (推荐) - -``` -┌─────────────────────────────────────────────────────────────────┐ -│ Claude Code │ -└───────────────────────────┬─────────────────────────────────────┘ - │ MCP Protocol - ▼ -┌─────────────────────────────────────────────────────────────────┐ -│ codex-lens MCP Server │ -│ ┌─────────────────────────────────────────────────────────┐ │ -│ │ MCP Tools │ │ -│ │ • find_definition → GlobalSymbolIndex.search() │ │ -│ │ • find_references → ChainSearchEngine.search_refs() │ │ -│ │ • get_context → MCPProvider.build_context() │ │ -│ │ • hybrid_search → HybridSearchEngine.search() │ │ -│ └─────────────────────────────────────────────────────────┘ │ -│ │ │ -│ ┌─────────────────────────▼───────────────────────────────┐ │ -│ │ codex-lens Core │ │ -│ │ GlobalSymbolIndex │ SQLiteStore │ WatcherManager │ │ -│ └─────────────────────────────────────────────────────────┘ │ -└─────────────────────────────────────────────────────────────────┘ -``` - -**优势**: -- 直接复用 codex-lens 索引 -- 无需启动额外 LSP 进程 -- 支持 MCP 上下文注入 - -**实现文件**: `src/codexlens/mcp/server.py` - -```python -"""codex-lens MCP Server for Claude Code integration.""" - -import json -import sys -from typing import Any, Dict - -from codexlens.mcp.provider import MCPProvider -from codexlens.search.chain_search import ChainSearchEngine -from codexlens.storage.global_index import GlobalSymbolIndex - - -class CodexLensMCPServer: - """MCP Server exposing codex-lens capabilities.""" - - def __init__(self, workspace_path: str): - 
self.global_index = GlobalSymbolIndex(workspace_path) - self.search_engine = ChainSearchEngine(...) - self.mcp_provider = MCPProvider(...) - - def handle_request(self, request: Dict[str, Any]) -> Dict[str, Any]: - """Handle MCP tool call.""" - method = request.get("method") - params = request.get("params", {}) - - handlers = { - "find_definition": self._find_definition, - "find_references": self._find_references, - "get_context": self._get_context, - "hybrid_search": self._hybrid_search, - } - - handler = handlers.get(method) - if handler: - return handler(params) - return {"error": f"Unknown method: {method}"} - - def _find_definition(self, params: Dict) -> Dict: - """Find symbol definition.""" - symbol_name = params.get("symbol") - symbols = self.global_index.search(symbol_name, exact=True, limit=1) - if symbols: - s = symbols[0] - return { - "file": s.file_path, - "line": s.range[0], - "column": 0, - "kind": s.kind, - } - return {"error": "Symbol not found"} - - def _find_references(self, params: Dict) -> Dict: - """Find all references.""" - symbol_name = params.get("symbol") - refs = self.search_engine.search_references(symbol_name) - return { - "references": [ - {"file": r.file_path, "line": r.line, "context": r.context} - for r in refs - ] - } - - def _get_context(self, params: Dict) -> Dict: - """Get MCP context for LLM.""" - symbol_name = params.get("symbol") - context = self.mcp_provider.build_context(symbol_name) - return context.to_dict() if context else {"error": "Context not found"} - - def _hybrid_search(self, params: Dict) -> Dict: - """Execute hybrid search.""" - query = params.get("query") - # ... 
implementation -``` - -#### 策略 B:作为独立 LSP Server - -通过 cclsp 配置接入 codex-lens LSP Server。 - -**cclsp 配置** (`.claude/cclsp.json`): - -```json -{ - "servers": [ - { - "extensions": ["py", "ts", "go", "rs", "java"], - "command": ["codexlens-lsp", "--stdio"], - "rootDir": ".", - "restartInterval": 0 - } - ] -} -``` - -**优势**: -- 兼容标准 LSP 协议 -- 可被任意 LSP 客户端使用 -- cclsp 提供位置容错 - -#### 策略 C:混合模式 (最佳实践) - -``` -┌───────────────────────────────────────────────────────────────────┐ -│ Claude Code │ -│ ┌──────────────────┐ ┌──────────────────────────┐ │ -│ │ 内置 LSP 工具 │ │ MCP Client │ │ -│ │ (基础导航) │ │ (上下文注入) │ │ -│ └────────┬─────────┘ └────────────┬─────────────┘ │ -└───────────┼───────────────────────────────────┼──────────────────┘ - │ │ - │ LSP Protocol │ MCP Protocol - │ │ -┌───────────▼───────────────────────────────────▼──────────────────┐ -│ codex-lens Unified Server │ -│ ┌─────────────────────────┐ ┌─────────────────────────────┐ │ -│ │ LSP Handlers │ │ MCP Handlers │ │ -│ │ • definition │ │ • get_context │ │ -│ │ • references │ │ • enrich_prompt │ │ -│ │ • hover │ │ • hybrid_search │ │ -│ │ • completion │ │ • semantic_query │ │ -│ └────────────┬────────────┘ └──────────────┬──────────────┘ │ -│ │ │ │ -│ └───────────────┬───────────────┘ │ -│ ▼ │ -│ ┌─────────────────────────────────────────────────────────────┐ │ -│ │ codex-lens Core │ │ -│ │ GlobalSymbolIndex │ HybridSearch │ VectorStore │ Watcher │ │ -│ └─────────────────────────────────────────────────────────────┘ │ -└──────────────────────────────────────────────────────────────────┘ -``` - -**优势**: -- LSP 提供标准代码导航 -- MCP 提供 LLM 上下文增强 -- 统一索引,避免重复计算 - -### 2.7 参考资源 - -| 资源 | 链接 | -|------|------| -| Claude Code LSP 设置指南 | https://www.aifreeapi.com/en/posts/claude-code-lsp | -| cclsp GitHub | https://github.com/ktnyt/cclsp | -| Claude Code Plugins | https://code.claude.com/docs/en/plugins-reference | -| claude-code-lsps 市场 | https://github.com/Piebald-AI/claude-code-lsps | -| LSP 规范 | 
https://microsoft.github.io/language-server-protocol/ | -| MCP 规范 | https://modelcontextprotocol.io/ | - ---- - -## 3. Architecture Overview - -### 3.1 Target Architecture - -``` -┌─────────────────────────────────────────────────────────────────────────────┐ -│ Client Layer │ -├─────────────────────────────────────────────────────────────────────────────┤ -│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────┐ │ -│ │ VS Code │ │ Neovim │ │ Sublime │ │ Claude Code │ │ -│ │ (LSP Client)│ │ (LSP Client)│ │ (LSP Client)│ │ (Hook + MCP Client) │ │ -│ └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ └──────────┬──────────┘ │ -│ │ │ │ │ │ -└─────────┼────────────────┼────────────────┼─────────────────────┼───────────┘ - │ │ │ │ - └────────────────┴────────────────┴──────────┬──────────┘ - │ - (JSON-RPC / stdio) - │ -┌──────────────────────────────────────────────────────┴──────────────────────┐ -│ codex-lens LSP Server │ -├─────────────────────────────────────────────────────────────────────────────┤ -│ │ -│ ┌─────────────────────────────────────────────────────────────────────┐ │ -│ │ LSP Layer (NEW) │ │ -│ │ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ │ -│ │ │ Handlers │ │ Providers │ │ Protocol │ │ │ -│ │ │ definition │ │ hover │ │ messages │ │ │ -│ │ │ references │ │ completion │ │ lifecycle │ │ │ -│ │ │ symbols │ │ │ │ │ │ │ -│ │ └──────┬───────┘ └──────┬───────┘ └──────┬───────┘ │ │ -│ └─────────┼─────────────────┼─────────────────┼───────────────────────┘ │ -│ │ │ │ │ -│ ┌─────────┴─────────────────┴─────────────────┴───────────────────────┐ │ -│ │ MCP Layer (NEW) │ │ -│ │ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ │ -│ │ │ Schema │ │ Provider │ │ Hooks │ │ │ -│ │ │ MCPContext │ │ buildContext │ │ pre-tool │ │ │ -│ │ │ SymbolInfo │ │ enrichPrompt │ │ post-tool │ │ │ -│ │ └──────────────┘ └──────────────┘ └──────────────┘ │ │ -│ └─────────────────────────────────────────────────────────────────────┘ │ -│ │ │ -│ 
┌─────────────────────────────────┴───────────────────────────────────┐ │ -│ │ Existing codex-lens Core │ │ -│ │ │ │ -│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ┌────────────┐ │ │ -│ │ │ Search │ │ Storage │ │ Watcher │ │ Parser │ │ │ -│ │ │ ChainSearch │ │ GlobalIndex │ │ Manager │ │ TreeSitter │ │ │ -│ │ │ HybridSearch│ │ SQLiteStore │ │ Incremental │ │ Symbols │ │ │ -│ │ └─────────────┘ └─────────────┘ └─────────────┘ └────────────┘ │ │ -│ └─────────────────────────────────────────────────────────────────────┘ │ -│ │ -└─────────────────────────────────────────────────────────────────────────────┘ -``` - -### 3.2 Data Flow - -``` - LSP Request Flow - ================ - -[Client] ─── textDocument/definition ───> [LSP Server] - │ - v - ┌─────────────────┐ - │ Parse Request │ - │ Extract symbol │ - └────────┬────────┘ - │ - v - ┌─────────────────┐ - │ GlobalSymbolIdx │ - │ .search() │ - └────────┬────────┘ - │ - v - ┌─────────────────┐ - │ Format Result │ - │ as Location │ - └────────┬────────┘ - │ -[Client] <─── Location Response ────────────────┘ - - - MCP Context Flow - ================ - -[Claude Code] ─── pre-tool hook ───> [MCP Provider] - │ - ┌─────────────────────┴─────────────────────┐ - v v v - ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ - │ Definition │ │ References │ │ Related │ - │ Lookup │ │ Lookup │ │ Symbols │ - └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ - │ │ │ - └─────────────────────┴─────────────────────┘ - │ - v - ┌───────────────┐ - │ MCPContext │ - │ Object │ - └───────┬───────┘ - │ -[Claude Code] <─── JSON Context ──────────┘ - │ - v - ┌───────────────────────┐ - │ Inject into LLM Prompt│ - └───────────────────────┘ -``` - -### 3.3 Module Dependencies - -``` - ┌─────────────────────┐ - │ lsp/server.py │ - │ (Entry Point) │ - └──────────┬──────────┘ - │ - ┌───────────────────┼───────────────────┐ - │ │ │ - v v v - ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ - │lsp/handlers │ │lsp/providers│ │ mcp/provider│ - │ .py │ │ .py 
│ │ .py │ - └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ - │ │ │ - └───────────────────┼───────────────────┘ - │ - v - ┌─────────────────────┐ - │ search/chain_search │ - │ .py │ - └──────────┬──────────┘ - │ - ┌───────────────────┼───────────────────┐ - │ │ │ - v v v - ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ - │storage/ │ │storage/ │ │watcher/ │ - │global_index │ │sqlite_store │ │manager.py │ - └─────────────┘ └─────────────┘ └─────────────┘ -``` - ---- - -## 4. Phase 1: LSP Server Foundation - -### 4.1 Overview - -| Attribute | Value | -|-----------|-------| -| Priority | HIGH | -| Complexity | Medium | -| Dependencies | pygls library | -| Deliverables | Working LSP server with 3 core handlers | - -### 4.2 Task Breakdown - -#### Task 1.1: Project Setup - -**File**: `pyproject.toml` (MODIFY) - -```toml -[project.optional-dependencies] -lsp = [ - "pygls>=1.3.0", -] - -[project.scripts] -codexlens-lsp = "codexlens.lsp:main" -``` - -**Acceptance Criteria**: -- [ ] `pip install -e ".[lsp]"` succeeds -- [ ] `codexlens-lsp --help` shows usage - ---- - -#### Task 1.2: LSP Server Core - -**File**: `src/codexlens/lsp/__init__.py` (NEW) - -```python -"""codex-lens Language Server Protocol implementation.""" - -from codexlens.lsp.server import CodexLensLanguageServer, main - -__all__ = ["CodexLensLanguageServer", "main"] -``` - -**File**: `src/codexlens/lsp/server.py` (NEW) - -```python -"""Main LSP server implementation using pygls.""" - -import logging -from pathlib import Path -from typing import Optional - -from lsprotocol import types as lsp -from pygls.server import LanguageServer - -from codexlens.search.chain_search import ChainSearchEngine -from codexlens.storage.global_index import GlobalSymbolIndex -from codexlens.storage.registry import RegistryStore -from codexlens.storage.path_mapper import PathMapper -from codexlens.watcher.manager import WatcherManager - -logger = logging.getLogger(__name__) - - -class CodexLensLanguageServer(LanguageServer): - 
"""Language Server powered by codex-lens indexing.""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.workspace_path: Optional[Path] = None - self.registry: Optional[RegistryStore] = None - self.search_engine: Optional[ChainSearchEngine] = None - self.global_index: Optional[GlobalSymbolIndex] = None - self.watcher: Optional[WatcherManager] = None - - def initialize_codexlens(self, workspace_path: Path) -> None: - """Initialize codex-lens components for the workspace.""" - self.workspace_path = workspace_path - - # Initialize registry and search engine - self.registry = RegistryStore() - self.registry.initialize() - - mapper = PathMapper() - self.search_engine = ChainSearchEngine(self.registry, mapper) - - # Initialize global symbol index - self.global_index = GlobalSymbolIndex(workspace_path) - - # Start file watcher for incremental updates - self.watcher = WatcherManager( - root_path=workspace_path, - on_indexed=self._on_file_indexed - ) - self.watcher.start() - - logger.info(f"Initialized codex-lens for workspace: {workspace_path}") - - def _on_file_indexed(self, file_path: Path) -> None: - """Callback when a file is indexed.""" - logger.debug(f"File indexed: {file_path}") - - def shutdown_codexlens(self) -> None: - """Cleanup codex-lens components.""" - if self.watcher: - self.watcher.stop() - self.watcher = None - logger.info("codex-lens shutdown complete") - - -# Create server instance -server = CodexLensLanguageServer( - name="codex-lens", - version="0.1.0" -) - - -@server.feature(lsp.INITIALIZE) -def on_initialize(params: lsp.InitializeParams) -> lsp.InitializeResult: - """Handle LSP initialize request.""" - if params.root_uri: - workspace_path = Path(params.root_uri.replace("file://", "")) - server.initialize_codexlens(workspace_path) - - return lsp.InitializeResult( - capabilities=lsp.ServerCapabilities( - text_document_sync=lsp.TextDocumentSyncOptions( - open_close=True, - change=lsp.TextDocumentSyncKind.Incremental, - 
save=lsp.SaveOptions(include_text=False), - ), - definition_provider=True, - references_provider=True, - completion_provider=lsp.CompletionOptions( - trigger_characters=[".", "_"], - ), - hover_provider=True, - workspace_symbol_provider=True, - ), - server_info=lsp.ServerInfo( - name="codex-lens", - version="0.1.0", - ), - ) - - -@server.feature(lsp.SHUTDOWN) -def on_shutdown(params: None) -> None: - """Handle LSP shutdown request.""" - server.shutdown_codexlens() - - -def main(): - """Entry point for the LSP server.""" - import argparse - - parser = argparse.ArgumentParser(description="codex-lens Language Server") - parser.add_argument("--stdio", action="store_true", help="Use stdio transport") - parser.add_argument("--tcp", action="store_true", help="Use TCP transport") - parser.add_argument("--host", default="127.0.0.1", help="TCP host") - parser.add_argument("--port", type=int, default=2087, help="TCP port") - - args = parser.parse_args() - - if args.tcp: - server.start_tcp(args.host, args.port) - else: - server.start_io() - - -if __name__ == "__main__": - main() -``` - -**Acceptance Criteria**: -- [ ] Server starts without errors -- [ ] Handles initialize/shutdown lifecycle -- [ ] WatcherManager starts on workspace open - ---- - -#### Task 1.3: Definition Handler - -**File**: `src/codexlens/lsp/handlers.py` (NEW) - -```python -"""LSP request handlers.""" - -import logging -from pathlib import Path -from typing import List, Optional, Union - -from lsprotocol import types as lsp - -from codexlens.lsp.server import server -from codexlens.entities import Symbol - -logger = logging.getLogger(__name__) - - -def symbol_to_location(symbol: Symbol) -> lsp.Location: - """Convert codex-lens Symbol to LSP Location.""" - return lsp.Location( - uri=f"file://{symbol.file_path}", - range=lsp.Range( - start=lsp.Position( - line=symbol.range[0] - 1, # LSP is 0-indexed - character=0, - ), - end=lsp.Position( - line=symbol.range[1] - 1, - character=0, - ), - ), - ) - - 
-@server.feature(lsp.TEXT_DOCUMENT_DEFINITION) -def on_definition( - params: lsp.DefinitionParams, -) -> Optional[Union[lsp.Location, List[lsp.Location]]]: - """Handle textDocument/definition request.""" - if not server.global_index: - return None - - # Get the word at cursor position - document = server.workspace.get_text_document(params.text_document.uri) - word = _get_word_at_position(document, params.position) - - if not word: - return None - - logger.debug(f"Definition lookup for: {word}") - - # Search in global symbol index - symbols = server.global_index.search(word, exact=True, limit=10) - - if not symbols: - return None - - if len(symbols) == 1: - return symbol_to_location(symbols[0]) - - return [symbol_to_location(s) for s in symbols] - - -def _get_word_at_position(document, position: lsp.Position) -> Optional[str]: - """Extract the word at the given position.""" - try: - lines = document.source.split("\n") - if position.line >= len(lines): - return None - - line = lines[position.line] - - # Find word boundaries - start = position.character - end = position.character - - # Expand left - while start > 0 and _is_identifier_char(line[start - 1]): - start -= 1 - - # Expand right - while end < len(line) and _is_identifier_char(line[end]): - end += 1 - - word = line[start:end] - return word if word else None - except Exception as e: - logger.error(f"Error extracting word: {e}") - return None - - -def _is_identifier_char(char: str) -> bool: - """Check if character is valid in an identifier.""" - return char.isalnum() or char == "_" -``` - -**Acceptance Criteria**: -- [ ] Returns Location for known symbols -- [ ] Returns None for unknown symbols -- [ ] Handles multiple definitions (overloads) - ---- - -#### Task 1.4: Completion Handler - -**File**: `src/codexlens/lsp/handlers.py` (APPEND) - -```python -@server.feature(lsp.TEXT_DOCUMENT_COMPLETION) -def on_completion( - params: lsp.CompletionParams, -) -> Optional[lsp.CompletionList]: - """Handle 
textDocument/completion request.""" - if not server.global_index: - return None - - # Get partial word at cursor - document = server.workspace.get_text_document(params.text_document.uri) - prefix = _get_prefix_at_position(document, params.position) - - if not prefix or len(prefix) < 2: - return None - - logger.debug(f"Completion lookup for prefix: {prefix}") - - # Search with prefix mode - symbols = server.global_index.search(prefix, prefix_mode=True, limit=50) - - if not symbols: - return None - - items = [] - for symbol in symbols: - kind = _symbol_kind_to_completion_kind(symbol.kind) - items.append( - lsp.CompletionItem( - label=symbol.name, - kind=kind, - detail=f"{symbol.kind} in {Path(symbol.file_path).name}", - documentation=lsp.MarkupContent( - kind=lsp.MarkupKind.Markdown, - value=f"Defined at line {symbol.range[0]}", - ), - ) - ) - - return lsp.CompletionList(is_incomplete=len(items) >= 50, items=items) - - -def _get_prefix_at_position(document, position: lsp.Position) -> Optional[str]: - """Extract the incomplete word prefix at position.""" - try: - lines = document.source.split("\n") - if position.line >= len(lines): - return None - - line = lines[position.line] - - # Find prefix start - start = position.character - while start > 0 and _is_identifier_char(line[start - 1]): - start -= 1 - - return line[start:position.character] if start < position.character else None - except Exception: - return None - - -def _symbol_kind_to_completion_kind(kind: str) -> lsp.CompletionItemKind: - """Map symbol kind to LSP completion kind.""" - mapping = { - "function": lsp.CompletionItemKind.Function, - "method": lsp.CompletionItemKind.Method, - "class": lsp.CompletionItemKind.Class, - "variable": lsp.CompletionItemKind.Variable, - "constant": lsp.CompletionItemKind.Constant, - "module": lsp.CompletionItemKind.Module, - "property": lsp.CompletionItemKind.Property, - "interface": lsp.CompletionItemKind.Interface, - "enum": lsp.CompletionItemKind.Enum, - } - return 
mapping.get(kind.lower(), lsp.CompletionItemKind.Text) -``` - -**Acceptance Criteria**: -- [ ] Returns completion items for valid prefixes -- [ ] Respects minimum prefix length (2 chars) -- [ ] Maps symbol kinds correctly - ---- - -#### Task 1.5: Workspace Symbol Handler - -**File**: `src/codexlens/lsp/handlers.py` (APPEND) - -```python -@server.feature(lsp.WORKSPACE_SYMBOL) -def on_workspace_symbol( - params: lsp.WorkspaceSymbolParams, -) -> Optional[List[lsp.SymbolInformation]]: - """Handle workspace/symbol request.""" - if not server.search_engine or not server.workspace_path: - return None - - query = params.query - if not query or len(query) < 2: - return None - - logger.debug(f"Workspace symbol search: {query}") - - # Use chain search engine's symbol search - result = server.search_engine.search_symbols( - query=query, - source_path=server.workspace_path, - limit=100, - ) - - if not result: - return None - - items = [] - for symbol in result: - kind = _symbol_kind_to_symbol_kind(symbol.kind) - items.append( - lsp.SymbolInformation( - name=symbol.name, - kind=kind, - location=symbol_to_location(symbol), - container_name=Path(symbol.file_path).parent.name, - ) - ) - - return items - - -def _symbol_kind_to_symbol_kind(kind: str) -> lsp.SymbolKind: - """Map symbol kind string to LSP SymbolKind.""" - mapping = { - "function": lsp.SymbolKind.Function, - "method": lsp.SymbolKind.Method, - "class": lsp.SymbolKind.Class, - "variable": lsp.SymbolKind.Variable, - "constant": lsp.SymbolKind.Constant, - "module": lsp.SymbolKind.Module, - "property": lsp.SymbolKind.Property, - "interface": lsp.SymbolKind.Interface, - "enum": lsp.SymbolKind.Enum, - "struct": lsp.SymbolKind.Struct, - "namespace": lsp.SymbolKind.Namespace, - } - return mapping.get(kind.lower(), lsp.SymbolKind.Variable) -``` - -**Acceptance Criteria**: -- [ ] Returns symbols matching query -- [ ] Respects result limit -- [ ] Includes container information - ---- - -#### Task 1.6: File Watcher Integration - 
-**File**: `src/codexlens/lsp/handlers.py` (APPEND) - -```python -@server.feature(lsp.TEXT_DOCUMENT_DID_SAVE) -def on_did_save(params: lsp.DidSaveTextDocumentParams) -> None: - """Handle textDocument/didSave notification.""" - if not server.watcher: - return - - file_path = Path(params.text_document.uri.replace("file://", "")) - logger.debug(f"File saved: {file_path}") - - # Trigger incremental indexing - server.watcher.trigger_index(file_path) - - -@server.feature(lsp.TEXT_DOCUMENT_DID_OPEN) -def on_did_open(params: lsp.DidOpenTextDocumentParams) -> None: - """Handle textDocument/didOpen notification.""" - logger.debug(f"File opened: {params.text_document.uri}") - - -@server.feature(lsp.TEXT_DOCUMENT_DID_CLOSE) -def on_did_close(params: lsp.DidCloseTextDocumentParams) -> None: - """Handle textDocument/didClose notification.""" - logger.debug(f"File closed: {params.text_document.uri}") -``` - -**Acceptance Criteria**: -- [ ] didSave triggers incremental index -- [ ] No blocking on save -- [ ] Proper logging - ---- - -### 4.3 Phase 1 Test Plan - -**File**: `tests/lsp/test_server.py` (NEW) - -```python -"""Tests for LSP server.""" - -import pytest -from pathlib import Path -from unittest.mock import Mock, patch - -from lsprotocol import types as lsp - -from codexlens.lsp.server import CodexLensLanguageServer, on_initialize - - -class TestServerInitialization: - """Test server lifecycle.""" - - def test_initialize_creates_components(self, tmp_path): - """Server creates all components on initialize.""" - server = CodexLensLanguageServer("test", "0.1.0") - - params = lsp.InitializeParams( - root_uri=f"file://{tmp_path}", - capabilities=lsp.ClientCapabilities(), - ) - - result = on_initialize(params) - - assert result.capabilities.definition_provider - assert result.capabilities.completion_provider - assert result.capabilities.workspace_symbol_provider - - -class TestDefinitionHandler: - """Test textDocument/definition handler.""" - - def 
test_definition_returns_location(self): - """Definition returns valid Location.""" - # Setup mock global index - mock_symbol = Mock() - mock_symbol.file_path = "/test/file.py" - mock_symbol.range = (10, 15) - - with patch.object(server, 'global_index') as mock_index: - mock_index.search.return_value = [mock_symbol] - - # Call handler - result = on_definition(Mock( - text_document=Mock(uri="file:///test/file.py"), - position=lsp.Position(line=5, character=10), - )) - - assert isinstance(result, lsp.Location) - assert result.uri == "file:///test/file.py" - - -class TestCompletionHandler: - """Test textDocument/completion handler.""" - - def test_completion_returns_items(self): - """Completion returns CompletionList.""" - # Test implementation - pass -``` - -**Acceptance Criteria**: -- [ ] All unit tests pass -- [ ] Coverage > 80% for LSP module -- [ ] Integration test with real workspace - ---- - -## 5. Phase 2: Find References - -### 5.1 Overview - -| Attribute | Value | -|-----------|-------| -| Priority | MEDIUM | -| Complexity | High | -| Dependencies | Phase 1 complete | -| Deliverables | `search_references()` method + LSP handler | - -### 5.2 Task Breakdown - -#### Task 2.1: Add `search_references` to ChainSearchEngine - -**File**: `src/codexlens/search/chain_search.py` (MODIFY) - -```python -# Add to ChainSearchEngine class - -from dataclasses import dataclass -from typing import List -from concurrent.futures import ThreadPoolExecutor, as_completed - - -@dataclass -class ReferenceResult: - """Result from reference search.""" - file_path: str - line: int - column: int - context: str # Surrounding code snippet - relationship_type: str # "call", "import", "inheritance", etc. - - -def search_references( - self, - symbol_name: str, - source_path: Optional[Path] = None, - depth: int = -1, - limit: int = 100, -) -> List[ReferenceResult]: - """Find all references to a symbol across the project. 
- - Args: - symbol_name: Fully qualified or simple name of the symbol - source_path: Starting path for search (default: workspace root) - depth: Search depth (-1 = unlimited) - limit: Maximum results to return - - Returns: - List of ReferenceResult objects sorted by file path and line - """ - source = source_path or self._workspace_path - - # Collect all index paths - index_paths = self._collect_index_paths(source, depth) - - if not index_paths: - logger.warning(f"No indexes found for reference search: {source}") - return [] - - # Parallel query across all indexes - all_results: List[ReferenceResult] = [] - - with ThreadPoolExecutor(max_workers=self._options.max_workers) as executor: - futures = { - executor.submit( - self._search_references_single, - idx_path, - symbol_name, - ): idx_path - for idx_path in index_paths - } - - for future in as_completed(futures): - try: - results = future.result(timeout=10) - all_results.extend(results) - except Exception as e: - logger.error(f"Reference search failed: {e}") - - # Sort and limit - all_results.sort(key=lambda r: (r.file_path, r.line)) - return all_results[:limit] - - -def _search_references_single( - self, - index_path: Path, - symbol_name: str, -) -> List[ReferenceResult]: - """Search for references in a single index.""" - results = [] - - try: - store = DirIndexStore(index_path.parent) - - # Query code_relationships table - query = """ - SELECT - cr.source_file, - cr.source_line, - cr.source_column, - cr.relationship_type, - f.content - FROM code_relationships cr - JOIN files f ON f.full_path = cr.source_file - WHERE cr.target_qualified_name LIKE ? - OR cr.target_name = ? 
- ORDER BY cr.source_file, cr.source_line - """ - - rows = store.execute_query( - query, - (f"%{symbol_name}", symbol_name), - ) - - for row in rows: - # Extract context (3 lines around reference) - content_lines = row["content"].split("\n") - line_idx = row["source_line"] - 1 - start = max(0, line_idx - 1) - end = min(len(content_lines), line_idx + 2) - context = "\n".join(content_lines[start:end]) - - results.append(ReferenceResult( - file_path=row["source_file"], - line=row["source_line"], - column=row["source_column"] or 0, - context=context, - relationship_type=row["relationship_type"], - )) - except Exception as e: - logger.error(f"Failed to search references in {index_path}: {e}") - - return results -``` - -**Acceptance Criteria**: -- [ ] Searches all index files in parallel -- [ ] Returns properly formatted ReferenceResult -- [ ] Handles missing indexes gracefully - ---- - -#### Task 2.2: LSP References Handler - -**File**: `src/codexlens/lsp/handlers.py` (APPEND) - -```python -@server.feature(lsp.TEXT_DOCUMENT_REFERENCES) -def on_references( - params: lsp.ReferenceParams, -) -> Optional[List[lsp.Location]]: - """Handle textDocument/references request.""" - if not server.search_engine or not server.workspace_path: - return None - - # Get the word at cursor - document = server.workspace.get_text_document(params.text_document.uri) - word = _get_word_at_position(document, params.position) - - if not word: - return None - - logger.debug(f"References lookup for: {word}") - - # Search for references - references = server.search_engine.search_references( - symbol_name=word, - source_path=server.workspace_path, - limit=200, - ) - - if not references: - return None - - # Convert to LSP Locations - locations = [] - for ref in references: - locations.append( - lsp.Location( - uri=f"file://{ref.file_path}", - range=lsp.Range( - start=lsp.Position(line=ref.line - 1, character=ref.column), - end=lsp.Position(line=ref.line - 1, character=ref.column + len(word)), - ), - ) 
- ) - - return locations -``` - -**Acceptance Criteria**: -- [ ] Returns all references across project -- [ ] Includes definition if `params.context.include_declaration` -- [ ] Performance < 200ms for typical project - ---- - -### 5.3 Phase 2 Test Plan - -```python -class TestReferencesSearch: - """Test reference search functionality.""" - - def test_finds_function_calls(self, indexed_project): - """Finds all calls to a function.""" - results = search_engine.search_references("my_function") - assert len(results) > 0 - assert all(r.relationship_type == "call" for r in results) - - def test_finds_imports(self, indexed_project): - """Finds all imports of a module.""" - results = search_engine.search_references("my_module") - assert any(r.relationship_type == "import" for r in results) - - def test_parallel_search_performance(self, large_project): - """Parallel search completes within time limit.""" - import time - start = time.time() - results = search_engine.search_references("common_symbol") - elapsed = time.time() - start - assert elapsed < 0.2 # 200ms -``` - ---- - -## 6. 
Phase 3: Hover Information - -### 6.1 Overview - -| Attribute | Value | -|-----------|-------| -| Priority | MEDIUM | -| Complexity | Low | -| Dependencies | Phase 1 complete | -| Deliverables | Hover provider + LSP handler | - -### 6.2 Task Breakdown - -#### Task 3.1: Hover Provider - -**File**: `src/codexlens/lsp/providers.py` (NEW) - -```python -"""LSP feature providers.""" - -import logging -from dataclasses import dataclass -from pathlib import Path -from typing import Optional - -from codexlens.entities import Symbol -from codexlens.storage.sqlite_store import SQLiteStore - -logger = logging.getLogger(__name__) - - -@dataclass -class HoverInfo: - """Hover information for a symbol.""" - name: str - kind: str - signature: str - documentation: Optional[str] - file_path: str - line_range: tuple - - -class HoverProvider: - """Provides hover information for symbols.""" - - def __init__(self, global_index, registry): - self.global_index = global_index - self.registry = registry - - def get_hover_info(self, symbol_name: str) -> Optional[HoverInfo]: - """Get hover information for a symbol. 
- - Args: - symbol_name: Name of the symbol to look up - - Returns: - HoverInfo or None if symbol not found - """ - # Look up symbol in global index - symbols = self.global_index.search(symbol_name, exact=True, limit=1) - - if not symbols: - return None - - symbol = symbols[0] - - # Extract signature from source - signature = self._extract_signature(symbol) - - return HoverInfo( - name=symbol.name, - kind=symbol.kind, - signature=signature, - documentation=symbol.docstring, - file_path=symbol.file_path, - line_range=symbol.range, - ) - - def _extract_signature(self, symbol: Symbol) -> str: - """Extract function/class signature from source.""" - try: - # Find the index for this file - index_path = self.registry.find_index_path( - Path(symbol.file_path).parent - ) - - if not index_path: - return f"{symbol.kind} {symbol.name}" - - store = SQLiteStore(index_path.parent) - - # Get file content - rows = store.execute_query( - "SELECT content FROM files WHERE full_path = ?", - (symbol.file_path,), - ) - - if not rows: - return f"{symbol.kind} {symbol.name}" - - content = rows[0]["content"] - lines = content.split("\n") - - # Extract signature lines - start_line = symbol.range[0] - 1 - signature_lines = [] - - # Get first line (def/class declaration) - if start_line < len(lines): - first_line = lines[start_line] - signature_lines.append(first_line) - - # Continue if line ends with backslash or doesn't have closing paren - i = start_line + 1 - while i < len(lines) and i < start_line + 5: - if "):" in signature_lines[-1] or ":" in signature_lines[-1]: - break - signature_lines.append(lines[i]) - i += 1 - - return "\n".join(signature_lines) - except Exception as e: - logger.error(f"Failed to extract signature: {e}") - return f"{symbol.kind} {symbol.name}" - - def format_hover_markdown(self, info: HoverInfo) -> str: - """Format hover info as Markdown.""" - parts = [] - - # Code block with signature - parts.append(f"```python\n{info.signature}\n```") - - # Documentation if 
available - if info.documentation: - parts.append(f"\n---\n\n{info.documentation}") - - # Location info - parts.append( - f"\n---\n\n*{info.kind}* defined in " - f"`{Path(info.file_path).name}` " - f"(line {info.line_range[0]})" - ) - - return "\n".join(parts) -``` - ---- - -#### Task 3.2: LSP Hover Handler - -**File**: `src/codexlens/lsp/handlers.py` (APPEND) - -```python -from codexlens.lsp.providers import HoverProvider - - -@server.feature(lsp.TEXT_DOCUMENT_HOVER) -def on_hover(params: lsp.HoverParams) -> Optional[lsp.Hover]: - """Handle textDocument/hover request.""" - if not server.global_index or not server.registry: - return None - - # Get word at cursor - document = server.workspace.get_text_document(params.text_document.uri) - word = _get_word_at_position(document, params.position) - - if not word: - return None - - logger.debug(f"Hover lookup for: {word}") - - # Get hover info - provider = HoverProvider(server.global_index, server.registry) - info = provider.get_hover_info(word) - - if not info: - return None - - # Format as markdown - content = provider.format_hover_markdown(info) - - return lsp.Hover( - contents=lsp.MarkupContent( - kind=lsp.MarkupKind.Markdown, - value=content, - ), - ) -``` - -**Acceptance Criteria**: -- [ ] Shows function signature -- [ ] Shows documentation if available -- [ ] Shows file location - ---- - -## 7. 
Phase 4: MCP Bridge - -### 7.1 Overview - -| Attribute | Value | -|-----------|-------| -| Priority | HIGH VALUE | -| Complexity | Medium | -| Dependencies | Phase 1-2 complete | -| Deliverables | MCP schema + provider + hook interfaces | - -### 7.2 Task Breakdown - -#### Task 4.1: MCP Schema Definition - -**File**: `src/codexlens/mcp/__init__.py` (NEW) - -```python -"""Model Context Protocol implementation for Claude Code integration.""" - -from codexlens.mcp.schema import ( - MCPContext, - SymbolInfo, - ReferenceInfo, - RelatedSymbol, -) -from codexlens.mcp.provider import MCPProvider - -__all__ = [ - "MCPContext", - "SymbolInfo", - "ReferenceInfo", - "RelatedSymbol", - "MCPProvider", -] -``` - -**File**: `src/codexlens/mcp/schema.py` (NEW) - -```python -"""MCP data models.""" - -from dataclasses import dataclass, field, asdict -from typing import List, Optional -import json - - -@dataclass -class SymbolInfo: - """Information about a code symbol.""" - name: str - kind: str - file_path: str - line_start: int - line_end: int - signature: Optional[str] = None - documentation: Optional[str] = None - - def to_dict(self) -> dict: - return asdict(self) - - -@dataclass -class ReferenceInfo: - """Information about a symbol reference.""" - file_path: str - line: int - column: int - context: str - relationship_type: str - - def to_dict(self) -> dict: - return asdict(self) - - -@dataclass -class RelatedSymbol: - """Related symbol (import, call target, etc.).""" - name: str - kind: str - relationship: str # "imports", "calls", "inherits", "uses" - file_path: Optional[str] = None - - def to_dict(self) -> dict: - return asdict(self) - - -@dataclass -class MCPContext: - """Model Context Protocol context object. - - This is the structured context that gets injected into - LLM prompts to provide code understanding. 
- """ - version: str = "1.0" - context_type: str = "code_context" - symbol: Optional[SymbolInfo] = None - definition: Optional[str] = None - references: List[ReferenceInfo] = field(default_factory=list) - related_symbols: List[RelatedSymbol] = field(default_factory=list) - metadata: dict = field(default_factory=dict) - - def to_dict(self) -> dict: - """Convert to dictionary for JSON serialization.""" - result = { - "version": self.version, - "context_type": self.context_type, - "metadata": self.metadata, - } - - if self.symbol: - result["symbol"] = self.symbol.to_dict() - - if self.definition: - result["definition"] = self.definition - - if self.references: - result["references"] = [r.to_dict() for r in self.references] - - if self.related_symbols: - result["related_symbols"] = [s.to_dict() for s in self.related_symbols] - - return result - - def to_json(self, indent: int = 2) -> str: - """Serialize to JSON string.""" - return json.dumps(self.to_dict(), indent=indent) - - def to_prompt_injection(self) -> str: - """Format for injection into LLM prompt.""" - parts = [""] - - if self.symbol: - parts.append(f"## Symbol: {self.symbol.name}") - parts.append(f"Type: {self.symbol.kind}") - parts.append(f"Location: {self.symbol.file_path}:{self.symbol.line_start}") - - if self.definition: - parts.append("\n## Definition") - parts.append(f"```\n{self.definition}\n```") - - if self.references: - parts.append(f"\n## References ({len(self.references)} found)") - for i, ref in enumerate(self.references[:5]): # Limit to 5 - parts.append(f"- {ref.file_path}:{ref.line} ({ref.relationship_type})") - parts.append(f" ```\n {ref.context}\n ```") - - if self.related_symbols: - parts.append("\n## Related Symbols") - for sym in self.related_symbols[:10]: # Limit to 10 - parts.append(f"- {sym.name} ({sym.relationship})") - - parts.append("") - - return "\n".join(parts) -``` - ---- - -#### Task 4.2: MCP Provider - -**File**: `src/codexlens/mcp/provider.py` (NEW) - -```python -"""MCP context 
provider.""" - -import logging -from pathlib import Path -from typing import Optional, List - -from codexlens.mcp.schema import ( - MCPContext, - SymbolInfo, - ReferenceInfo, - RelatedSymbol, -) -from codexlens.search.chain_search import ChainSearchEngine -from codexlens.storage.global_index import GlobalSymbolIndex -from codexlens.storage.sqlite_store import SQLiteStore -from codexlens.storage.registry import RegistryStore - -logger = logging.getLogger(__name__) - - -class MCPProvider: - """Builds MCP context objects from codex-lens data.""" - - def __init__( - self, - global_index: GlobalSymbolIndex, - search_engine: ChainSearchEngine, - registry: RegistryStore, - ): - self.global_index = global_index - self.search_engine = search_engine - self.registry = registry - - def build_context( - self, - symbol_name: str, - context_type: str = "symbol_explanation", - include_references: bool = True, - include_related: bool = True, - max_references: int = 10, - ) -> Optional[MCPContext]: - """Build comprehensive context for a symbol. 
- - Args: - symbol_name: Name of the symbol to contextualize - context_type: Type of context being requested - include_references: Whether to include reference locations - include_related: Whether to include related symbols - max_references: Maximum number of references to include - - Returns: - MCPContext object or None if symbol not found - """ - # Look up symbol - symbols = self.global_index.search(symbol_name, exact=True, limit=1) - - if not symbols: - logger.warning(f"Symbol not found for MCP context: {symbol_name}") - return None - - symbol = symbols[0] - - # Build SymbolInfo - symbol_info = SymbolInfo( - name=symbol.name, - kind=symbol.kind, - file_path=symbol.file_path, - line_start=symbol.range[0], - line_end=symbol.range[1], - signature=getattr(symbol, 'signature', None), - documentation=getattr(symbol, 'docstring', None), - ) - - # Extract definition source code - definition = self._extract_definition(symbol) - - # Get references - references = [] - if include_references: - refs = self.search_engine.search_references( - symbol_name, - limit=max_references, - ) - references = [ - ReferenceInfo( - file_path=r.file_path, - line=r.line, - column=r.column, - context=r.context, - relationship_type=r.relationship_type, - ) - for r in refs - ] - - # Get related symbols - related_symbols = [] - if include_related: - related_symbols = self._get_related_symbols(symbol) - - return MCPContext( - context_type=context_type, - symbol=symbol_info, - definition=definition, - references=references, - related_symbols=related_symbols, - metadata={ - "source": "codex-lens", - "indexed_at": symbol.indexed_at if hasattr(symbol, 'indexed_at') else None, - }, - ) - - def _extract_definition(self, symbol) -> Optional[str]: - """Extract source code for symbol definition.""" - try: - index_path = self.registry.find_index_path( - Path(symbol.file_path).parent - ) - - if not index_path: - return None - - store = SQLiteStore(index_path.parent) - rows = store.execute_query( - "SELECT 
content FROM files WHERE full_path = ?", - (symbol.file_path,), - ) - - if not rows: - return None - - content = rows[0]["content"] - lines = content.split("\n") - - # Extract symbol lines - start = symbol.range[0] - 1 - end = symbol.range[1] - - return "\n".join(lines[start:end]) - except Exception as e: - logger.error(f"Failed to extract definition: {e}") - return None - - def _get_related_symbols(self, symbol) -> List[RelatedSymbol]: - """Get symbols related to the given symbol.""" - related = [] - - try: - index_path = self.registry.find_index_path( - Path(symbol.file_path).parent - ) - - if not index_path: - return related - - store = SQLiteStore(index_path.parent) - - # Query relationships where this symbol is the source - rows = store.execute_query( - """ - SELECT target_name, target_qualified_name, relationship_type - FROM code_relationships - WHERE source_qualified_name LIKE ? - LIMIT 20 - """, - (f"%{symbol.name}%",), - ) - - for row in rows: - related.append(RelatedSymbol( - name=row["target_name"], - kind="unknown", # Would need another lookup - relationship=row["relationship_type"], - )) - except Exception as e: - logger.error(f"Failed to get related symbols: {e}") - - return related - - def build_context_for_file( - self, - file_path: Path, - context_type: str = "file_overview", - ) -> MCPContext: - """Build context for an entire file.""" - # Get all symbols in file - symbols = self.global_index.search_by_file(str(file_path)) - - related = [ - RelatedSymbol( - name=s.name, - kind=s.kind, - relationship="defines", - ) - for s in symbols - ] - - return MCPContext( - context_type=context_type, - related_symbols=related, - metadata={ - "file_path": str(file_path), - "symbol_count": len(symbols), - }, - ) -``` - ---- - -#### Task 4.3: Hook Interfaces - -**File**: `src/codexlens/mcp/hooks.py` (NEW) - -```python -"""Hook interfaces for Claude Code integration.""" - -import logging -from pathlib import Path -from typing import Any, Dict, Optional, Callable - 
-from codexlens.mcp.provider import MCPProvider -from codexlens.mcp.schema import MCPContext - -logger = logging.getLogger(__name__) - - -class HookManager: - """Manages hook registration and execution.""" - - def __init__(self, mcp_provider: MCPProvider): - self.mcp_provider = mcp_provider - self._pre_hooks: Dict[str, Callable] = {} - self._post_hooks: Dict[str, Callable] = {} - - # Register default hooks - self._register_default_hooks() - - def _register_default_hooks(self): - """Register built-in hooks.""" - self._pre_hooks["explain"] = self._pre_explain_hook - self._pre_hooks["refactor"] = self._pre_refactor_hook - self._pre_hooks["document"] = self._pre_document_hook - - def execute_pre_hook( - self, - action: str, - params: Dict[str, Any], - ) -> Optional[MCPContext]: - """Execute pre-tool hook to gather context. - - Args: - action: The action being performed (e.g., "explain", "refactor") - params: Parameters for the action - - Returns: - MCPContext to inject into prompt, or None - """ - hook = self._pre_hooks.get(action) - - if not hook: - logger.debug(f"No pre-hook for action: {action}") - return None - - try: - return hook(params) - except Exception as e: - logger.error(f"Pre-hook failed for {action}: {e}") - return None - - def execute_post_hook( - self, - action: str, - result: Any, - ) -> None: - """Execute post-tool hook for proactive caching. 
- - Args: - action: The action that was performed - result: Result of the action - """ - hook = self._post_hooks.get(action) - - if not hook: - return - - try: - hook(result) - except Exception as e: - logger.error(f"Post-hook failed for {action}: {e}") - - def _pre_explain_hook(self, params: Dict[str, Any]) -> Optional[MCPContext]: - """Pre-hook for 'explain' action.""" - symbol_name = params.get("symbol") - - if not symbol_name: - return None - - return self.mcp_provider.build_context( - symbol_name=symbol_name, - context_type="symbol_explanation", - include_references=True, - include_related=True, - ) - - def _pre_refactor_hook(self, params: Dict[str, Any]) -> Optional[MCPContext]: - """Pre-hook for 'refactor' action.""" - symbol_name = params.get("symbol") - - if not symbol_name: - return None - - return self.mcp_provider.build_context( - symbol_name=symbol_name, - context_type="refactor_context", - include_references=True, # Important for refactoring - include_related=True, - max_references=20, # More references for refactoring - ) - - def _pre_document_hook(self, params: Dict[str, Any]) -> Optional[MCPContext]: - """Pre-hook for 'document' action.""" - symbol_name = params.get("symbol") - file_path = params.get("file_path") - - if symbol_name: - return self.mcp_provider.build_context( - symbol_name=symbol_name, - context_type="documentation_context", - include_references=False, - include_related=True, - ) - elif file_path: - return self.mcp_provider.build_context_for_file( - Path(file_path), - context_type="file_documentation", - ) - - return None - - def register_pre_hook( - self, - action: str, - hook: Callable[[Dict[str, Any]], Optional[MCPContext]], - ) -> None: - """Register a custom pre-tool hook.""" - self._pre_hooks[action] = hook - - def register_post_hook( - self, - action: str, - hook: Callable[[Any], None], - ) -> None: - """Register a custom post-tool hook.""" - self._post_hooks[action] = hook - - -# Convenience function for Claude Code 
integration -def create_context_for_prompt( - mcp_provider: MCPProvider, - action: str, - params: Dict[str, Any], -) -> str: - """Create context string for prompt injection. - - This is the main entry point for Claude Code hook integration. - - Args: - mcp_provider: The MCP provider instance - action: Action being performed - params: Action parameters - - Returns: - Formatted context string for prompt injection - """ - manager = HookManager(mcp_provider) - context = manager.execute_pre_hook(action, params) - - if context: - return context.to_prompt_injection() - - return "" -``` - ---- - -## 8. Phase 5: Advanced Features - -### 8.1 Custom LSP Commands - -**File**: `src/codexlens/lsp/handlers.py` (APPEND) - -```python -# Custom commands for advanced features - -@server.command("codexlens.hybridSearch") -def cmd_hybrid_search(params: List[Any]) -> dict: - """Execute hybrid search combining FTS and semantic.""" - if len(params) < 1: - return {"error": "Query required"} - - query = params[0] - limit = params[1] if len(params) > 1 else 20 - - from codexlens.search.hybrid_search import HybridSearchEngine - - engine = HybridSearchEngine(server.search_engine.store) - results = engine.search(query, limit=limit) - - return { - "results": [ - { - "path": r.path, - "score": r.score, - "excerpt": r.excerpt, - } - for r in results - ] - } - - -@server.command("codexlens.getMCPContext") -def cmd_get_mcp_context(params: List[Any]) -> dict: - """Get MCP context for a symbol.""" - if len(params) < 1: - return {"error": "Symbol name required"} - - symbol_name = params[0] - context_type = params[1] if len(params) > 1 else "symbol_explanation" - - from codexlens.mcp.provider import MCPProvider - - provider = MCPProvider( - server.global_index, - server.search_engine, - server.registry, - ) - - context = provider.build_context(symbol_name, context_type) - - if context: - return context.to_dict() - - return {"error": "Symbol not found"} -``` - -### 8.2 Performance Optimizations - 
-**File**: `src/codexlens/lsp/cache.py` (NEW) - -```python -"""Caching layer for LSP performance.""" - -import time -from functools import lru_cache -from typing import Any, Dict, Optional -from threading import Lock - - -class LRUCacheWithTTL: - """LRU cache with time-to-live expiration.""" - - def __init__(self, maxsize: int = 1000, ttl_seconds: int = 300): - self.maxsize = maxsize - self.ttl = ttl_seconds - self._cache: Dict[str, tuple] = {} # key -> (value, timestamp) - self._lock = Lock() - - def get(self, key: str) -> Optional[Any]: - """Get value from cache if not expired.""" - with self._lock: - if key not in self._cache: - return None - - value, timestamp = self._cache[key] - - if time.time() - timestamp > self.ttl: - del self._cache[key] - return None - - return value - - def set(self, key: str, value: Any) -> None: - """Set value in cache.""" - with self._lock: - # Evict oldest if at capacity - if len(self._cache) >= self.maxsize: - oldest_key = min( - self._cache.keys(), - key=lambda k: self._cache[k][1], - ) - del self._cache[oldest_key] - - self._cache[key] = (value, time.time()) - - def invalidate(self, key: str) -> None: - """Remove key from cache.""" - with self._lock: - self._cache.pop(key, None) - - def invalidate_prefix(self, prefix: str) -> None: - """Remove all keys with given prefix.""" - with self._lock: - keys_to_remove = [ - k for k in self._cache.keys() - if k.startswith(prefix) - ] - for key in keys_to_remove: - del self._cache[key] - - def clear(self) -> None: - """Clear all cache entries.""" - with self._lock: - self._cache.clear() - - -# Global cache instances -definition_cache = LRUCacheWithTTL(maxsize=500, ttl_seconds=300) -references_cache = LRUCacheWithTTL(maxsize=200, ttl_seconds=60) -completion_cache = LRUCacheWithTTL(maxsize=100, ttl_seconds=30) -``` - ---- - -## 9. 
Testing Strategy - -### 9.1 Test Structure - -``` -tests/ -├── lsp/ -│ ├── __init__.py -│ ├── conftest.py # Fixtures -│ ├── test_server.py # Server lifecycle -│ ├── test_definition.py # Definition handler -│ ├── test_references.py # References handler -│ ├── test_completion.py # Completion handler -│ ├── test_hover.py # Hover handler -│ └── test_workspace_symbol.py # Workspace symbol -│ -├── mcp/ -│ ├── __init__.py -│ ├── test_schema.py # MCP schema validation -│ ├── test_provider.py # Context building -│ └── test_hooks.py # Hook execution -│ -└── integration/ - ├── __init__.py - ├── test_lsp_client.py # Full LSP handshake - └── test_mcp_flow.py # End-to-end MCP flow -``` - -### 9.2 Fixtures - -**File**: `tests/lsp/conftest.py` - -```python -"""Test fixtures for LSP tests.""" - -import pytest -from pathlib import Path -import tempfile -import shutil - -from codexlens.lsp.server import CodexLensLanguageServer - - -@pytest.fixture -def temp_workspace(): - """Create temporary workspace with sample files.""" - tmpdir = Path(tempfile.mkdtemp()) - - # Create sample Python files - (tmpdir / "main.py").write_text(""" -def main(): - result = helper_function(42) - print(result) - -def helper_function(x): - return x * 2 -""") - - (tmpdir / "utils.py").write_text(""" -from main import helper_function - -class Calculator: - def add(self, a, b): - return a + b - - def multiply(self, a, b): - return helper_function(a) * b -""") - - yield tmpdir - - shutil.rmtree(tmpdir) - - -@pytest.fixture -def indexed_workspace(temp_workspace): - """Workspace with built indexes.""" - from codexlens.cli.commands import index_directory - - index_directory(temp_workspace) - - return temp_workspace - - -@pytest.fixture -def lsp_server(indexed_workspace): - """Initialized LSP server.""" - server = CodexLensLanguageServer("test", "0.1.0") - server.initialize_codexlens(indexed_workspace) - - yield server - - server.shutdown_codexlens() -``` - -### 9.3 Performance Benchmarks - -**File**: 
`tests/benchmarks/test_performance.py` - -```python -"""Performance benchmarks for LSP operations.""" - -import pytest -import time - - -class TestPerformance: - """Performance benchmark tests.""" - - @pytest.mark.benchmark - def test_definition_latency(self, lsp_server, benchmark): - """Definition lookup should be < 50ms.""" - def lookup(): - return lsp_server.global_index.search("helper_function", exact=True) - - result = benchmark(lookup) - assert benchmark.stats.stats.mean < 0.05 # 50ms - - @pytest.mark.benchmark - def test_completion_latency(self, lsp_server, benchmark): - """Completion should be < 100ms.""" - def complete(): - return lsp_server.global_index.search("help", prefix_mode=True, limit=50) - - result = benchmark(complete) - assert benchmark.stats.stats.mean < 0.1 # 100ms - - @pytest.mark.benchmark - def test_references_latency(self, lsp_server, benchmark): - """References should be < 200ms.""" - def find_refs(): - return lsp_server.search_engine.search_references("helper_function") - - result = benchmark(find_refs) - assert benchmark.stats.stats.mean < 0.2 # 200ms -``` - ---- - -## 10. 
Deployment Guide - -### 10.1 Installation - -```bash -# Install with LSP support -pip install codex-lens[lsp] - -# Or from source -git clone https://github.com/your-org/codex-lens.git -cd codex-lens -pip install -e ".[lsp]" -``` - -### 10.2 VS Code Configuration - -**File**: `.vscode/settings.json` - -```json -{ - "codexlens.enable": true, - "codexlens.serverPath": "codexlens-lsp", - "codexlens.serverArgs": ["--stdio"], - "codexlens.trace.server": "verbose" -} -``` - -### 10.3 Neovim Configuration - -**File**: `~/.config/nvim/lua/lsp/codexlens.lua` - -```lua -local lspconfig = require('lspconfig') -local configs = require('lspconfig.configs') - -configs.codexlens = { - default_config = { - cmd = { 'codexlens-lsp', '--stdio' }, - filetypes = { 'python', 'javascript', 'typescript' }, - root_dir = lspconfig.util.root_pattern('.git', 'pyproject.toml'), - settings = {}, - }, -} - -lspconfig.codexlens.setup{} -``` - -### 10.4 Claude Code Integration - -**File**: `~/.claude/hooks/pre-tool.sh` - -```bash -#!/bin/bash -# Pre-tool hook for Claude Code - -ACTION="$1" -PARAMS="$2" - -# Call codex-lens MCP provider -python -c " -from codexlens.mcp.hooks import create_context_for_prompt -from codexlens.mcp.provider import MCPProvider -from codexlens.storage.global_index import GlobalSymbolIndex -from codexlens.search.chain_search import ChainSearchEngine -from codexlens.storage.registry import RegistryStore -from codexlens.storage.path_mapper import PathMapper -import json - -# Initialize components -registry = RegistryStore() -registry.initialize() -mapper = PathMapper() -search = ChainSearchEngine(registry, mapper) -global_idx = GlobalSymbolIndex(Path.cwd()) - -provider = MCPProvider(global_idx, search, registry) - -params = json.loads('$PARAMS') -context = create_context_for_prompt(provider, '$ACTION', params) -print(context) -" -``` - ---- - -## 11. 
Risk Mitigation - -### 11.1 Risk Matrix - -| Risk | Probability | Impact | Mitigation | -|------|-------------|--------|------------| -| pygls compatibility issues | Low | High | Pin version, test on multiple platforms | -| Performance degradation | Medium | Medium | Implement caching, benchmark tests | -| Index corruption | Low | High | Use WAL mode, implement recovery | -| Memory leaks in long sessions | Medium | Medium | Implement connection pooling, periodic cleanup | -| Hook execution timeout | Medium | Low | Implement timeout limits, async execution | - -### 11.2 Fallback Strategies - -1. **Index not available**: Return empty results, don't block LSP -2. **Search timeout**: Return partial results with warning -3. **WatcherManager crash**: Auto-restart with exponential backoff -4. **MCP generation failure**: Return minimal context, log error - -### 11.3 Monitoring - -```python -# Add to server.py - -import prometheus_client - -# Metrics -DEFINITION_LATENCY = prometheus_client.Histogram( - 'codexlens_definition_latency_seconds', - 'Time to process definition request', -) -REFERENCES_LATENCY = prometheus_client.Histogram( - 'codexlens_references_latency_seconds', - 'Time to process references request', -) -INDEX_SIZE = prometheus_client.Gauge( - 'codexlens_index_symbols_total', - 'Total symbols in index', -) -``` - ---- - -## Appendix: Quick Reference - -### File Creation Summary - -| Phase | File | Type | -|-------|------|------| -| 1 | `src/codexlens/lsp/__init__.py` | NEW | -| 1 | `src/codexlens/lsp/server.py` | NEW | -| 1 | `src/codexlens/lsp/handlers.py` | NEW | -| 2 | `src/codexlens/search/chain_search.py` | MODIFY | -| 3 | `src/codexlens/lsp/providers.py` | NEW | -| 4 | `src/codexlens/mcp/__init__.py` | NEW | -| 4 | `src/codexlens/mcp/schema.py` | NEW | -| 4 | `src/codexlens/mcp/provider.py` | NEW | -| 4 | `src/codexlens/mcp/hooks.py` | NEW | -| 5 | `src/codexlens/lsp/cache.py` | NEW | - -### Command Reference - -```bash -# Start LSP server -codexlens-lsp 
--stdio - -# Start with TCP (for debugging) -codexlens-lsp --tcp --port 2087 - -# Run tests -pytest tests/lsp/ -v - -# Run benchmarks -pytest tests/benchmarks/ --benchmark-only - -# Check coverage -pytest tests/lsp/ --cov=codexlens.lsp --cov-report=html -``` - ---- - -**Document End** diff --git a/codex-lens/docs/MCP_ENDPOINT_DESIGN.md b/codex-lens/docs/MCP_ENDPOINT_DESIGN.md deleted file mode 100644 index 887bbc26..00000000 --- a/codex-lens/docs/MCP_ENDPOINT_DESIGN.md +++ /dev/null @@ -1,284 +0,0 @@ -# CodexLens MCP Endpoint Design - -> Generated by Gemini Analysis | 2026-01-19 -> Document Version: 1.0 - -## Overview - -This document provides the complete MCP endpoint design for exposing codex-lens LSP capabilities through the Model Context Protocol. - -## Related Files -- `src/codexlens/lsp/server.py` - Main LSP server initialization, component management, and capability declaration. -- `src/codexlens/lsp/handlers.py` - Implementation of handlers for core LSP requests (definition, references, completion, hover, workspace symbols). -- `src/codexlens/lsp/providers.py` - Helper classes, specifically `HoverProvider` for generating rich hover information. -- `src/codexlens/storage/global_index.py` - The backing data store (`GlobalSymbolIndex`) that powers most of the symbol lookups. -- `src/codexlens/search/__init__.py` - Exposes the `ChainSearchEngine`, used for advanced reference searching. - -## Summary - -The `codex-lens` LSP implementation exposes five core code navigation and search features: go to definition, find references, code completion, hover information, and workspace symbol search. These features are primarily powered by two components: `GlobalSymbolIndex` for fast, project-wide symbol lookups (used by definition, completion, hover, and workspace symbols) and `ChainSearchEngine` for advanced, relationship-aware reference finding. 
- -The following MCP tool design externalizes these backend capabilities, allowing a client to leverage the same code intelligence features outside of an LSP context. - -## MCP Tool Group: `code.symbol` - -This group provides tools for searching and retrieving information about code symbols (functions, classes, etc.) within an indexed project. - ---- - -### 1. `code.symbol.search` - -**Description**: Searches for symbols across the entire indexed project, supporting prefix or contains matching. Ideal for implementing workspace symbol searches or providing code completion suggestions. - -**Mapped LSP Features**: `workspace/symbol`, `textDocument/completion` - -**Backend Implementation**: This tool directly maps to the `GlobalSymbolIndex.search` method. -- Reference: `src/codexlens/lsp/handlers.py:302` (in `lsp_workspace_symbol`) -- Reference: `src/codexlens/lsp/handlers.py:256` (in `lsp_completion`) - -**Schema**: -```json -{ - "name": "code.symbol.search", - "description": "Searches for symbols across the entire indexed project, supporting prefix or contains matching. Ideal for implementing workspace symbol searches or providing code completion suggestions.", - "inputSchema": { - "type": "object", - "properties": { - "query": { - "type": "string", - "description": "The symbol name or prefix to search for." - }, - "kind": { - "type": "string", - "description": "Optional: Filter results to only include symbols of a specific kind (e.g., 'function', 'class', 'method').", - "nullable": true - }, - "prefix_mode": { - "type": "boolean", - "description": "If true, treats the query as a prefix (name LIKE 'query%'). If false, performs a contains search (name LIKE '%query%'). 
Defaults to true.", - "default": true - }, - "limit": { - "type": "integer", - "description": "The maximum number of symbols to return.", - "default": 50 - } - }, - "required": ["query"] - } -} -``` - -**Returns**: -```typescript -Array<{ - name: string; // The name of the symbol - kind: string; // The kind of the symbol (e.g., 'function', 'class') - file_path: string; // The absolute path to the file containing the symbol - range: { - start_line: number; // The 1-based starting line number - end_line: number; // The 1-based ending line number - } -}> -``` - ---- - -### 2. `code.symbol.findDefinition` - -**Description**: Finds the definition location(s) for a symbol with an exact name match. This corresponds to a 'Go to Definition' feature. - -**Mapped LSP Feature**: `textDocument/definition` - -**Backend Implementation**: This tool uses `GlobalSymbolIndex.search` with `prefix_mode=False` and then filters for an exact name match. -- Reference: `src/codexlens/lsp/handlers.py:180` (in `lsp_definition`) - -**Schema**: -```json -{ - "name": "code.symbol.findDefinition", - "description": "Finds the definition location(s) for a symbol with an exact name match. This corresponds to a 'Go to Definition' feature.", - "inputSchema": { - "type": "object", - "properties": { - "symbol_name": { - "type": "string", - "description": "The exact name of the symbol to find." - }, - "kind": { - "type": "string", - "description": "Optional: Disambiguate by providing the symbol kind (e.g., 'function', 'class').", - "nullable": true - } - }, - "required": ["symbol_name"] - } -} -``` - -**Returns**: -```typescript -Array<{ - name: string; // The name of the symbol - kind: string; // The kind of the symbol - file_path: string; // The absolute path to the file - range: { - start_line: number; // The 1-based starting line number - end_line: number; // The 1-based ending line number - } -}> -``` - ---- - -### 3. 
`code.symbol.findReferences` - -**Description**: Finds all references to a symbol throughout the project. Uses advanced relationship analysis for accuracy where possible, falling back to name-based search. - -**Mapped LSP Feature**: `textDocument/references` - -**Backend Implementation**: This primarily uses `ChainSearchEngine.search_references` for accuracy, which is more powerful than a simple name search. -- Reference: `src/codexlens/lsp/handlers.py:218` (in `lsp_references`) - -**Schema**: -```json -{ - "name": "code.symbol.findReferences", - "description": "Finds all references to a symbol throughout the project. Uses advanced relationship analysis for accuracy where possible.", - "inputSchema": { - "type": "object", - "properties": { - "symbol_name": { - "type": "string", - "description": "The name of the symbol to find references for." - }, - "context_path": { - "type": "string", - "description": "The source path of the current project or workspace root to provide context for the search." - }, - "limit": { - "type": "integer", - "description": "The maximum number of references to return.", - "default": 200 - } - }, - "required": ["symbol_name", "context_path"] - } -} -``` - -**Returns**: -```typescript -Array<{ - file_path: string; // The absolute path to the file containing the reference - line: number; // The 1-based line number of the reference - column: number; // The 0-based starting column of the reference -}> -``` - ---- - -### 4. `code.symbol.getHoverInfo` - -**Description**: Retrieves rich information for a symbol, including its signature and location, suitable for displaying in a hover card. - -**Mapped LSP Feature**: `textDocument/hover` - -**Backend Implementation**: This tool encapsulates the logic from `HoverProvider`, which finds a symbol in `GlobalSymbolIndex` and then reads the source file to extract its signature. 
-- Reference: `src/codexlens/lsp/handlers.py:285` (instantiates `HoverProvider`) -- Reference: `src/codexlens/lsp/providers.py:53` (in `HoverProvider.get_hover_info`) - -**Schema**: -```json -{ - "name": "code.symbol.getHoverInfo", - "description": "Retrieves rich information for a symbol, including its signature and location, suitable for displaying in a hover card.", - "inputSchema": { - "type": "object", - "properties": { - "symbol_name": { - "type": "string", - "description": "The exact name of the symbol to get hover information for." - } - }, - "required": ["symbol_name"] - } -} -``` - -**Returns**: -```typescript -{ - name: string; // The name of the symbol - kind: string; // The kind of the symbol - signature: string; // The full code signature as extracted from source - file_path: string; // The absolute path to the file - start_line: number; // The 1-based starting line number -} | null // null if symbol not found -``` - ---- - -## Integration with CCW MCP Manager - -The `codex-lens-tools` MCP server should be added to the recommended MCP servers list in `ccw/src/templates/dashboard-js/components/mcp-manager.js`: - -```javascript -{ - id: 'codex-lens-tools', - nameKey: 'mcp.codexLens.name', - descKey: 'mcp.codexLens.desc', - icon: 'search-code', - category: 'code-intelligence', - fields: [ - { - key: 'toolSelection', - labelKey: 'mcp.codexLens.field.tools', - type: 'multi-select', - options: [ - { value: 'symbol.search', label: 'Symbol Search' }, - { value: 'symbol.findDefinition', label: 'Find Definition' }, - { value: 'symbol.findReferences', label: 'Find References' }, - { value: 'symbol.getHoverInfo', label: 'Hover Information' } - ], - default: ['symbol.search', 'symbol.findDefinition', 'symbol.findReferences'], - required: true, - descKey: 'mcp.codexLens.field.tools.desc' - } - ], - buildConfig: (values) => { - const tools = values.toolSelection || []; - const env = { CODEXLENS_ENABLED_TOOLS: tools.join(',') }; - return 
buildCrossPlatformMcpConfig('npx', ['-y', 'codex-lens-mcp'], { env }); - } -} -``` - -## Tool Naming Convention - -- **Namespace**: `code.*` for code intelligence tools -- **Category**: `symbol` for symbol-related operations -- **Operation**: Descriptive verb (search, findDefinition, findReferences, getHoverInfo) -- **Full Pattern**: `code.symbol.<operation>` - -This naming scheme aligns with MCP conventions and is easily extensible for future categories (e.g., `code.types.*`, `code.imports.*`). - -## Future Enhancements - -1. **Document Symbol Tool** (`code.symbol.getDocumentSymbols`) - - Maps LSP `textDocument/documentSymbol` - - Returns all symbols in a specific file - -2. **Type Information** (`code.type.*` group) - - Type definitions and relationships - - Generic resolution - -3. **Relationship Analysis** (`code.relation.*` group) - - Call hierarchy - - Inheritance chains - - Import dependencies - ---- - -Generated: 2026-01-19 -Status: Ready for Implementation diff --git a/codex-lens/docs/MIGRATION_005_SUMMARY.md b/codex-lens/docs/MIGRATION_005_SUMMARY.md deleted file mode 100644 index c73cd06d..00000000 --- a/codex-lens/docs/MIGRATION_005_SUMMARY.md +++ /dev/null @@ -1,220 +0,0 @@ -# Migration 005: Database Schema Cleanup - -## Overview - -Migration 005 removes four unused and redundant database fields identified through Gemini analysis. This cleanup improves database efficiency, reduces schema complexity, and eliminates potential data consistency issues. - -## Schema Version - -- **Previous Version**: 4 -- **New Version**: 5 - -## Changes Summary - -### 1. Removed `semantic_metadata.keywords` Column - -**Reason**: Deprecated - replaced by normalized `file_keywords` table in migration 001. 
- -**Impact**: -- Keywords are now exclusively read from the normalized `file_keywords` table -- Prevents data sync issues between JSON column and normalized tables -- No data loss - migration 001 already populated `file_keywords` table - -**Modified Code**: -- `get_semantic_metadata()`: Now reads keywords from `file_keywords` JOIN -- `list_semantic_metadata()`: Updated to query `file_keywords` for each result -- `add_semantic_metadata()`: Stopped writing to `keywords` column (only writes to `file_keywords`) - -### 2. Removed `symbols.token_count` Column - -**Reason**: Unused - always NULL, never populated. - -**Impact**: -- No data loss (column was never used) -- Reduces symbols table size -- Simplifies symbol insertion logic - -**Modified Code**: -- `add_file()`: Removed `token_count` from INSERT statements -- `update_file_symbols()`: Removed `token_count` from INSERT statements -- Schema creation: No longer creates `token_count` column - -### 3. Removed `symbols.symbol_type` Column - -**Reason**: Redundant - duplicates `symbols.kind` field. - -**Impact**: -- No data loss (information preserved in `kind` column) -- Reduces symbols table size -- Eliminates redundant data storage - -**Modified Code**: -- `add_file()`: Removed `symbol_type` from INSERT statements -- `update_file_symbols()`: Removed `symbol_type` from INSERT statements -- Schema creation: No longer creates `symbol_type` column -- Removed `idx_symbols_type` index - -### 4. Removed `subdirs.direct_files` Column - -**Reason**: Unused - never displayed or queried in application logic. 
- -**Impact**: -- No data loss (column was never used) -- Reduces subdirs table size -- Simplifies subdirectory registration - -**Modified Code**: -- `register_subdir()`: Parameter kept for backward compatibility but ignored -- `update_subdir_stats()`: Parameter kept for backward compatibility but ignored -- `get_subdirs()`: No longer retrieves `direct_files` -- `get_subdir()`: No longer retrieves `direct_files` -- `SubdirLink` dataclass: Removed `direct_files` field - -## Migration Process - -### Automatic Migration (v4 → v5) - -When an existing database (version 4) is opened: - -1. **Transaction begins** -2. **Step 1**: Recreate `semantic_metadata` table without `keywords` column - - Data copied from old table (excluding `keywords`) - - Old table dropped, new table renamed -3. **Step 2**: Recreate `symbols` table without `token_count` and `symbol_type` - - Data copied from old table (excluding removed columns) - - Old table dropped, new table renamed - - Indexes recreated (excluding `idx_symbols_type`) -4. **Step 3**: Recreate `subdirs` table without `direct_files` - - Data copied from old table (excluding `direct_files`) - - Old table dropped, new table renamed -5. **Transaction committed** -6. **VACUUM** runs to reclaim space (non-critical, continues if fails) - -### New Database Creation (v5) - -New databases are created directly with the clean schema (no migration needed). - -## Benefits - -1. **Reduced Database Size**: Removed 4 unused columns across 3 tables -2. **Improved Data Consistency**: Single source of truth for keywords (normalized tables) -3. **Simpler Code**: Less maintenance burden for unused fields -4. **Better Performance**: Smaller table sizes, fewer indexes to maintain -5. 
**Cleaner Schema**: Easier to understand and maintain - -## Backward Compatibility - -### API Compatibility - -Public method signatures remain backward compatible, with one breaking change to the `SubdirLink` dataclass: - -- `register_subdir()` and `update_subdir_stats()` still accept `direct_files` parameter (ignored) -- `SubdirLink` dataclass no longer has `direct_files` attribute (breaking change for direct dataclass access) - -### Database Compatibility - -- **v4 databases**: Automatically migrated to v5 on first access -- **v5 databases**: No migration needed -- **Older databases (v0-v3)**: Migrate through chain (v0→v2→v4→v5) - -## Testing - -Comprehensive test suite added: `tests/test_schema_cleanup_migration.py` - -**Test Coverage**: -- ✅ Migration from v4 to v5 -- ✅ New database creation with clean schema -- ✅ Semantic metadata keywords read from normalized table -- ✅ Symbols insert without deprecated fields -- ✅ Subdir operations without `direct_files` - -**Test Results**: All 5 tests passing - -## Verification - -To verify migration success: - -```python -from codexlens.storage.dir_index import DirIndexStore - -store = DirIndexStore("path/to/_index.db") -store.initialize() - -# Check schema version -conn = store._get_connection() -version = conn.execute("PRAGMA user_version").fetchone()[0] -assert version == 5 - -# Check columns removed -cursor = conn.execute("PRAGMA table_info(semantic_metadata)") -columns = {row[1] for row in cursor.fetchall()} -assert "keywords" not in columns - -cursor = conn.execute("PRAGMA table_info(symbols)") -columns = {row[1] for row in cursor.fetchall()} -assert "token_count" not in columns -assert "symbol_type" not in columns - -cursor = conn.execute("PRAGMA table_info(subdirs)") -columns = {row[1] for row in cursor.fetchall()} -assert "direct_files" not in columns - -store.close() -``` - -## Performance Impact - -**Expected Improvements**: -- Database size reduction: ~10-15% (varies by data) -- VACUUM reclaims space immediately after migration -- Slightly faster queries (smaller tables, 
fewer indexes) - -## Rollback - -Migration 005 is **one-way** (no downgrade function). Removed fields contain: -- `keywords`: Already migrated to normalized tables (migration 001) -- `token_count`: Always NULL (no data) -- `symbol_type`: Duplicate of `kind` (no data loss) -- `direct_files`: Never used (no data) - -If rollback is needed, restore from backup before running migration. - -## Files Modified - -1. **Migration File**: - - `src/codexlens/storage/migrations/migration_005_cleanup_unused_fields.py` (NEW) - -2. **Core Storage**: - - `src/codexlens/storage/dir_index.py`: - - Updated `SCHEMA_VERSION` to 5 - - Added migration 005 to `_apply_migrations()` - - Updated `get_semantic_metadata()` to read from `file_keywords` - - Updated `list_semantic_metadata()` to read from `file_keywords` - - Updated `add_semantic_metadata()` to not write `keywords` column - - Updated `add_file()` to not write `token_count`/`symbol_type` - - Updated `update_file_symbols()` to not write `token_count`/`symbol_type` - - Updated `register_subdir()` to not write `direct_files` - - Updated `update_subdir_stats()` to not write `direct_files` - - Updated `get_subdirs()` to not read `direct_files` - - Updated `get_subdir()` to not read `direct_files` - - Updated `SubdirLink` dataclass to remove `direct_files` - - Updated `_create_schema()` to create v5 schema directly - -3. 
**Tests**: - - `tests/test_schema_cleanup_migration.py` (NEW) - -## Deployment Checklist - -- [x] Migration script created and tested -- [x] Schema version updated to 5 -- [x] All code updated to use new schema -- [x] Comprehensive tests added -- [x] Existing tests pass -- [x] Documentation updated -- [x] Backward compatibility verified - -## References - -- Original Analysis: Gemini code review identified unused/redundant fields -- Migration Pattern: Follows SQLite best practices (table recreation) -- Previous Migrations: 001 (keywords normalization), 004 (dual FTS) diff --git a/codex-lens/docs/MULTILEVEL_CHUNKER_DESIGN.md b/codex-lens/docs/MULTILEVEL_CHUNKER_DESIGN.md deleted file mode 100644 index 46cee0a1..00000000 --- a/codex-lens/docs/MULTILEVEL_CHUNKER_DESIGN.md +++ /dev/null @@ -1,973 +0,0 @@ -# 多层次分词器设计方案 - -## 1. 背景与目标 - -### 1.1 当前问题 - -当前 `chunker.py` 的两种分词策略存在明显缺陷: - -**symbol-based 策略**: -- ✅ 优点:保持代码逻辑完整性,每个chunk是完整的函数/类 -- ❌ 缺点:粒度不均,超大函数可能达到数百行,影响LLM处理和搜索精度 - -**sliding-window 策略**: -- ✅ 优点:chunk大小均匀,覆盖全面 -- ❌ 缺点:破坏逻辑结构,可能将完整的循环/条件块切断 - -### 1.2 设计目标 - -实现多层次分词器,同时满足: -1. **语义完整性**:保持代码逻辑边界的完整性 -2. **粒度可控**:支持从粗粒度(函数级)到细粒度(逻辑块级)的灵活划分 -3. **层级关系**:保留chunk之间的父子关系,支持上下文检索 -4. **高效索引**:优化向量化和检索性能 - -## 2. 
技术架构 - -### 2.1 两层分词架构 - -``` -Source Code - ↓ -[Layer 1: Symbol-Level Chunking] ← 使用 tree-sitter AST - ↓ -MacroChunks (Functions/Classes) - ↓ -[Layer 2: Logic-Block Chunking] ← AST深度遍历 - ↓ -MicroChunks (Loops/Conditionals/Blocks) - ↓ -Vector Embedding + Indexing -``` - -### 2.2 核心组件 - -```python -# 新增数据结构 -@dataclass -class ChunkMetadata: - """Chunk元数据""" - chunk_id: str - parent_id: Optional[str] # 父chunk ID - level: int # 层级:1=macro, 2=micro - chunk_type: str # function/class/loop/conditional/try_except - file_path: str - start_line: int - end_line: int - symbol_name: Optional[str] - context_summary: Optional[str] # 继承自父chunk的上下文 - -@dataclass -class HierarchicalChunk: - """层级化的代码块""" - metadata: ChunkMetadata - content: str - embedding: Optional[List[float]] = None - children: List['HierarchicalChunk'] = field(default_factory=list) -``` - -## 3. 详细实现步骤 - -### 3.1 第一层:符号级分词(Macro-Chunking) - -**实现思路**:复用现有 `code_extractor.py` 逻辑,增强元数据提取。 - -```python -class MacroChunker: - """第一层分词器:提取顶层符号""" - - def __init__(self): - self.parser = Parser() - # 加载语言grammar - - def chunk_by_symbols( - self, - content: str, - file_path: str, - language: str - ) -> List[HierarchicalChunk]: - """提取顶层函数和类定义""" - tree = self.parser.parse(bytes(content, 'utf-8')) - root_node = tree.root_node - - chunks = [] - for node in root_node.children: - if node.type in ['function_definition', 'class_definition', - 'method_definition']: - chunk = self._create_macro_chunk(node, content, file_path) - chunks.append(chunk) - - return chunks - - def _create_macro_chunk( - self, - node, - content: str, - file_path: str - ) -> HierarchicalChunk: - """从AST节点创建macro chunk""" - start_line = node.start_point[0] + 1 - end_line = node.end_point[0] + 1 - - # 提取符号名称 - name_node = node.child_by_field_name('name') - symbol_name = content[name_node.start_byte:name_node.end_byte] - - # 提取完整代码(包含docstring和装饰器) - chunk_content = self._extract_with_context(node, content) - - metadata = ChunkMetadata( - 
chunk_id=f"{file_path}:{start_line}", - parent_id=None, - level=1, - chunk_type=node.type, - file_path=file_path, - start_line=start_line, - end_line=end_line, - symbol_name=symbol_name, - ) - - return HierarchicalChunk( - metadata=metadata, - content=chunk_content, - ) - - def _extract_with_context(self, node, content: str) -> str: - """提取代码,包含装饰器和docstring""" - # 向上查找装饰器 - start_byte = node.start_byte - prev_sibling = node.prev_sibling - while prev_sibling and prev_sibling.type == 'decorator': - start_byte = prev_sibling.start_byte - prev_sibling = prev_sibling.prev_sibling - - return content[start_byte:node.end_byte] -``` - -### 3.2 第二层:逻辑块分词(Micro-Chunking) - -**实现思路**:在每个macro chunk内部,按逻辑结构进一步划分。 - -```python -class MicroChunker: - """第二层分词器:提取逻辑块""" - - # 需要划分的逻辑块类型 - LOGIC_BLOCK_TYPES = { - 'for_statement', - 'while_statement', - 'if_statement', - 'try_statement', - 'with_statement', - } - - def chunk_logic_blocks( - self, - macro_chunk: HierarchicalChunk, - content: str, - max_lines: int = 50 # 大于此行数的macro chunk才进行二次划分 - ) -> List[HierarchicalChunk]: - """在macro chunk内部提取逻辑块""" - - # 小函数不需要二次划分 - total_lines = macro_chunk.metadata.end_line - macro_chunk.metadata.start_line - if total_lines <= max_lines: - return [] - - tree = self.parser.parse(bytes(macro_chunk.content, 'utf-8')) - root_node = tree.root_node - - micro_chunks = [] - self._traverse_logic_blocks( - root_node, - macro_chunk, - content, - micro_chunks - ) - - return micro_chunks - - def _traverse_logic_blocks( - self, - node, - parent_chunk: HierarchicalChunk, - content: str, - result: List[HierarchicalChunk] - ): - """递归遍历AST,提取逻辑块""" - - if node.type in self.LOGIC_BLOCK_TYPES: - micro_chunk = self._create_micro_chunk( - node, - parent_chunk, - content - ) - result.append(micro_chunk) - parent_chunk.children.append(micro_chunk) - - # 继续遍历子节点 - for child in node.children: - self._traverse_logic_blocks(child, parent_chunk, content, result) - - def _create_micro_chunk( - self, - node, - 
parent_chunk: HierarchicalChunk, - content: str - ) -> HierarchicalChunk: - """创建micro chunk""" - - # 计算相对于文件的行号 - start_line = parent_chunk.metadata.start_line + node.start_point[0] - end_line = parent_chunk.metadata.start_line + node.end_point[0] - - chunk_content = content[node.start_byte:node.end_byte] - - metadata = ChunkMetadata( - chunk_id=f"{parent_chunk.metadata.chunk_id}:L{start_line}", - parent_id=parent_chunk.metadata.chunk_id, - level=2, - chunk_type=node.type, - file_path=parent_chunk.metadata.file_path, - start_line=start_line, - end_line=end_line, - symbol_name=parent_chunk.metadata.symbol_name, # 继承父符号名 - context_summary=None, # 后续由LLM填充 - ) - - return HierarchicalChunk( - metadata=metadata, - content=chunk_content, - ) -``` - -### 3.3 统一接口:多层次分词器 - -```python -class HierarchicalChunker: - """多层次分词器统一接口""" - - def __init__(self, config: ChunkConfig = None): - self.config = config or ChunkConfig() - self.macro_chunker = MacroChunker() - self.micro_chunker = MicroChunker() - - def chunk_file( - self, - content: str, - file_path: str, - language: str - ) -> List[HierarchicalChunk]: - """对文件进行多层次分词""" - - # 第一层:符号级分词 - macro_chunks = self.macro_chunker.chunk_by_symbols( - content, file_path, language - ) - - # 第二层:逻辑块分词 - all_chunks = [] - for macro_chunk in macro_chunks: - all_chunks.append(macro_chunk) - - # 对大函数进行二次划分 - micro_chunks = self.micro_chunker.chunk_logic_blocks( - macro_chunk, content - ) - all_chunks.extend(micro_chunks) - - return all_chunks - - def chunk_file_with_fallback( - self, - content: str, - file_path: str, - language: str - ) -> List[HierarchicalChunk]: - """带降级策略的分词""" - - try: - return self.chunk_file(content, file_path, language) - except Exception as e: - logger.warning(f"Hierarchical chunking failed: {e}, falling back to sliding window") - # 降级到滑动窗口策略 - return self._fallback_sliding_window(content, file_path, language) -``` - -## 4. 
数据存储设计 - -### 4.1 数据库Schema - -```sql --- chunk表:存储所有层级的chunk -CREATE TABLE chunks ( - chunk_id TEXT PRIMARY KEY, - parent_id TEXT, -- 父chunk ID,NULL表示顶层 - level INTEGER NOT NULL, -- 1=macro, 2=micro - chunk_type TEXT NOT NULL, -- function/class/loop/if/try等 - file_path TEXT NOT NULL, - start_line INTEGER NOT NULL, - end_line INTEGER NOT NULL, - symbol_name TEXT, - content TEXT NOT NULL, - content_hash TEXT, -- 用于检测内容变化 - - -- 语义元数据(由LLM生成) - summary TEXT, - keywords TEXT, -- JSON数组 - purpose TEXT, - - -- 向量嵌入 - embedding BLOB, -- 存储向量 - - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - - FOREIGN KEY (parent_id) REFERENCES chunks(chunk_id) ON DELETE CASCADE -); - --- 索引优化 -CREATE INDEX idx_chunks_file_path ON chunks(file_path); -CREATE INDEX idx_chunks_parent_id ON chunks(parent_id); -CREATE INDEX idx_chunks_level ON chunks(level); -CREATE INDEX idx_chunks_symbol_name ON chunks(symbol_name); -``` - -### 4.2 向量索引 - -使用分层索引策略: - -```python -class HierarchicalVectorStore: - """层级化向量存储""" - - def __init__(self, db_path: Path): - self.db_path = db_path - self.conn = sqlite3.connect(db_path) - - def add_chunk(self, chunk: HierarchicalChunk): - """添加chunk及其向量""" - - cursor = self.conn.cursor() - cursor.execute(""" - INSERT INTO chunks ( - chunk_id, parent_id, level, chunk_type, - file_path, start_line, end_line, symbol_name, - content, embedding - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
- """, ( - chunk.metadata.chunk_id, - chunk.metadata.parent_id, - chunk.metadata.level, - chunk.metadata.chunk_type, - chunk.metadata.file_path, - chunk.metadata.start_line, - chunk.metadata.end_line, - chunk.metadata.symbol_name, - chunk.content, - self._serialize_embedding(chunk.embedding), - )) - - self.conn.commit() - - def search_hierarchical( - self, - query_embedding: List[float], - top_k: int = 10, - level_weights: Dict[int, float] = None - ) -> List[Tuple[HierarchicalChunk, float]]: - """层级化检索""" - - # 默认权重:macro chunk权重更高 - if level_weights is None: - level_weights = {1: 1.0, 2: 0.8} - - # 检索所有chunk - cursor = self.conn.cursor() - cursor.execute("SELECT * FROM chunks WHERE embedding IS NOT NULL") - - results = [] - for row in cursor.fetchall(): - chunk = self._row_to_chunk(row) - similarity = self._cosine_similarity( - query_embedding, - chunk.embedding - ) - - # 根据层级应用权重 - weighted_score = similarity * level_weights.get(chunk.metadata.level, 1.0) - results.append((chunk, weighted_score)) - - # 按分数排序 - results.sort(key=lambda x: x[1], reverse=True) - return results[:top_k] - - def get_chunk_with_context( - self, - chunk_id: str - ) -> Tuple[HierarchicalChunk, Optional[HierarchicalChunk]]: - """获取chunk及其父chunk(提供上下文)""" - - cursor = self.conn.cursor() - - # 获取chunk本身 - cursor.execute("SELECT * FROM chunks WHERE chunk_id = ?", (chunk_id,)) - chunk_row = cursor.fetchone() - chunk = self._row_to_chunk(chunk_row) - - # 获取父chunk - parent = None - if chunk.metadata.parent_id: - cursor.execute( - "SELECT * FROM chunks WHERE chunk_id = ?", - (chunk.metadata.parent_id,) - ) - parent_row = cursor.fetchone() - if parent_row: - parent = self._row_to_chunk(parent_row) - - return chunk, parent -``` - -## 5. LLM集成策略 - -### 5.1 分层生成语义元数据 - -```python -class HierarchicalLLMEnhancer: - """为层级chunk生成语义元数据""" - - def enhance_hierarchical_chunks( - self, - chunks: List[HierarchicalChunk] - ) -> Dict[str, SemanticMetadata]: - """ - 分层处理策略: - 1. 
先处理所有level=1的macro chunks,生成详细摘要 - 2. 再处理level=2的micro chunks,使用父chunk摘要作为上下文 - """ - - results = {} - - # 第一轮:处理macro chunks - macro_chunks = [c for c in chunks if c.metadata.level == 1] - macro_metadata = self.llm_enhancer.enhance_files([ - FileData( - path=c.metadata.chunk_id, - content=c.content, - language=self._detect_language(c.metadata.file_path) - ) - for c in macro_chunks - ]) - results.update(macro_metadata) - - # 第二轮:处理micro chunks(带父上下文) - micro_chunks = [c for c in chunks if c.metadata.level == 2] - for micro_chunk in micro_chunks: - parent_id = micro_chunk.metadata.parent_id - parent_summary = macro_metadata.get(parent_id, {}).get('summary', '') - - # 构建带上下文的prompt - enhanced_prompt = f""" -Parent Function: {micro_chunk.metadata.symbol_name} -Parent Summary: {parent_summary} - -Code Block ({micro_chunk.metadata.chunk_type}): -``` -{micro_chunk.content} -``` - -Generate a concise summary (1 sentence) and keywords for this specific code block. -""" - - metadata = self._call_llm_with_context(enhanced_prompt) - results[micro_chunk.metadata.chunk_id] = metadata - - return results -``` - -### 5.2 Prompt优化 - -针对不同层级使用不同的prompt模板: - -**Macro Chunk Prompt (Level 1)**: -``` -PURPOSE: Generate comprehensive semantic metadata for a complete function/class -TASK: -- Provide a detailed summary (2-3 sentences) covering what the code does and why -- Extract 8-12 relevant keywords including technical terms and domain concepts -- Identify the primary purpose/category -MODE: analysis - -CODE: -```{language} -{content} -``` - -OUTPUT: JSON with summary, keywords, purpose -``` - -**Micro Chunk Prompt (Level 2)**: -``` -PURPOSE: Summarize a specific logic block within a larger function -CONTEXT: -- Parent Function: {symbol_name} -- Parent Purpose: {parent_summary} - -TASK: -- Provide a brief summary (1 sentence) of this specific block's role in the parent function -- Extract 3-5 keywords specific to this block's logic -MODE: analysis - -CODE BLOCK ({chunk_type}): 
-```{language} -{content} -``` - -OUTPUT: JSON with summary, keywords -``` - -## 6. 检索增强 - -### 6.1 上下文扩展检索 - -```python -class ContextualSearchEngine: - """支持上下文扩展的检索引擎""" - - def search_with_context( - self, - query: str, - top_k: int = 10, - expand_context: bool = True - ) -> List[SearchResult]: - """ - 检索并自动扩展上下文 - - 如果匹配到micro chunk,自动返回其父macro chunk作为上下文 - """ - - # 生成查询向量 - query_embedding = self.embedder.embed_single(query) - - # 层级化检索 - raw_results = self.vector_store.search_hierarchical( - query_embedding, - top_k=top_k - ) - - # 扩展上下文 - enriched_results = [] - for chunk, score in raw_results: - result = SearchResult( - path=chunk.metadata.file_path, - score=score, - content=chunk.content, - start_line=chunk.metadata.start_line, - end_line=chunk.metadata.end_line, - symbol_name=chunk.metadata.symbol_name, - ) - - # 如果是micro chunk,获取父chunk作为上下文 - if expand_context and chunk.metadata.level == 2: - parent_chunk, _ = self.vector_store.get_chunk_with_context( - chunk.metadata.chunk_id - ) - if parent_chunk: - result.metadata['parent_context'] = { - 'summary': parent_chunk.metadata.context_summary, - 'symbol_name': parent_chunk.metadata.symbol_name, - 'content': parent_chunk.content, - } - - enriched_results.append(result) - - return enriched_results -``` - -## 7. 
测试策略 - -### 7.1 单元测试 - -```python -import pytest -from codexlens.semantic.hierarchical_chunker import ( - HierarchicalChunker, MacroChunker, MicroChunker -) - -class TestMacroChunker: - """测试第一层分词""" - - def test_extract_functions(self): - """测试提取函数定义""" - code = ''' -def calculate_total(items): - """Calculate total price.""" - total = 0 - for item in items: - total += item.price - return total - -def apply_discount(total, discount): - """Apply discount to total.""" - return total * (1 - discount) -''' - chunker = MacroChunker() - chunks = chunker.chunk_by_symbols(code, 'test.py', 'python') - - assert len(chunks) == 2 - assert chunks[0].metadata.symbol_name == 'calculate_total' - assert chunks[1].metadata.symbol_name == 'apply_discount' - assert chunks[0].metadata.level == 1 - - def test_extract_with_decorators(self): - """测试提取带装饰器的函数""" - code = ''' -@app.route('/api/users') -@auth_required -def get_users(): - return User.query.all() -''' - chunker = MacroChunker() - chunks = chunker.chunk_by_symbols(code, 'test.py', 'python') - - assert len(chunks) == 1 - assert '@app.route' in chunks[0].content - assert '@auth_required' in chunks[0].content - -class TestMicroChunker: - """测试第二层分词""" - - def test_extract_loop_blocks(self): - """测试提取循环块""" - code = ''' -def process_items(items): - results = [] - for item in items: - if item.active: - results.append(process(item)) - return results -''' - macro_chunker = MacroChunker() - macro_chunks = macro_chunker.chunk_by_symbols(code, 'test.py', 'python') - - micro_chunker = MicroChunker() - micro_chunks = micro_chunker.chunk_logic_blocks( - macro_chunks[0], code - ) - - # 应该提取出for循环和if条件块 - assert len(micro_chunks) >= 1 - assert any(c.metadata.chunk_type == 'for_statement' for c in micro_chunks) - - def test_skip_small_functions(self): - """测试小函数跳过二次划分""" - code = ''' -def small_func(x): - return x * 2 -''' - macro_chunker = MacroChunker() - macro_chunks = macro_chunker.chunk_by_symbols(code, 'test.py', 'python') - - 
micro_chunker = MicroChunker() - micro_chunks = micro_chunker.chunk_logic_blocks( - macro_chunks[0], code, max_lines=10 - ) - - # 小函数不应该被二次划分 - assert len(micro_chunks) == 0 - -class TestHierarchicalChunker: - """测试完整的多层次分词""" - - def test_full_hierarchical_chunking(self): - """测试完整的层级分词流程""" - code = ''' -def complex_function(data): - """A complex function with multiple logic blocks.""" - - # Validation - if not data: - raise ValueError("Data is empty") - - # Processing - results = [] - for item in data: - try: - processed = process_item(item) - results.append(processed) - except Exception as e: - logger.error(f"Failed to process: {e}") - continue - - # Aggregation - total = sum(r.value for r in results) - return total -''' - chunker = HierarchicalChunker() - chunks = chunker.chunk_file(code, 'test.py', 'python') - - # 应该有1个macro chunk和多个micro chunks - macro_chunks = [c for c in chunks if c.metadata.level == 1] - micro_chunks = [c for c in chunks if c.metadata.level == 2] - - assert len(macro_chunks) == 1 - assert len(micro_chunks) > 0 - - # 验证父子关系 - for micro in micro_chunks: - assert micro.metadata.parent_id == macro_chunks[0].metadata.chunk_id -``` - -### 7.2 集成测试 - -```python -class TestHierarchicalIndexing: - """测试完整的索引流程""" - - def test_index_and_search(self): - """测试分层索引和检索""" - - # 1. 分词 - chunker = HierarchicalChunker() - chunks = chunker.chunk_file(sample_code, 'sample.py', 'python') - - # 2. LLM增强 - enhancer = HierarchicalLLMEnhancer() - metadata = enhancer.enhance_hierarchical_chunks(chunks) - - # 3. 向量化 - embedder = Embedder() - for chunk in chunks: - text = metadata[chunk.metadata.chunk_id].summary - chunk.embedding = embedder.embed_single(text) - - # 4. 存储 - vector_store = HierarchicalVectorStore(Path('/tmp/test.db')) - for chunk in chunks: - vector_store.add_chunk(chunk) - - # 5. 
检索 - search_engine = ContextualSearchEngine(vector_store, embedder) - results = search_engine.search_with_context( - "find loop that processes items", - top_k=5 - ) - - # 验证结果 - assert len(results) > 0 - assert any(r.metadata.get('parent_context') for r in results) -``` - -## 8. 性能优化 - -### 8.1 批量处理 - -```python -class BatchHierarchicalProcessor: - """批量处理多个文件的层级分词""" - - def process_files_batch( - self, - file_paths: List[Path], - batch_size: int = 10 - ): - """批量处理,优化LLM调用""" - - all_chunks = [] - - # 1. 批量分词 - for file_path in file_paths: - content = file_path.read_text() - chunks = self.chunker.chunk_file( - content, str(file_path), self._detect_language(file_path) - ) - all_chunks.extend(chunks) - - # 2. 批量LLM增强(减少API调用) - macro_chunks = [c for c in all_chunks if c.metadata.level == 1] - for i in range(0, len(macro_chunks), batch_size): - batch = macro_chunks[i:i+batch_size] - self.enhancer.enhance_batch(batch) - - # 3. 批量向量化 - all_texts = [c.content for c in all_chunks] - embeddings = self.embedder.embed_batch(all_texts) - for chunk, embedding in zip(all_chunks, embeddings): - chunk.embedding = embedding - - # 4. 批量存储 - self.vector_store.add_chunks_batch(all_chunks) -``` - -### 8.2 增量更新 - -```python -class IncrementalIndexer: - """增量索引器:只处理变化的文件""" - - def update_file(self, file_path: Path): - """增量更新单个文件""" - - content = file_path.read_text() - content_hash = hashlib.sha256(content.encode()).hexdigest() - - # 检查文件是否变化 - cursor = self.conn.cursor() - cursor.execute(""" - SELECT content_hash FROM chunks - WHERE file_path = ? AND level = 1 - LIMIT 1 - """, (str(file_path),)) - - row = cursor.fetchone() - if row and row[0] == content_hash: - logger.info(f"File {file_path} unchanged, skipping") - return - - # 删除旧chunk - cursor.execute("DELETE FROM chunks WHERE file_path = ?", (str(file_path),)) - - # 重新索引 - chunks = self.chunker.chunk_file(content, str(file_path), 'python') - # ... 继续处理 -``` - -## 9. 
潜在问题与解决方案 - -### 9.1 问题:超大函数的micro chunk过多 - -**现象**:某些遗留代码函数超过1000行,可能产生几十个micro chunks。 - -**解决方案**: -```python -class AdaptiveMicroChunker: - """自适应micro分词:根据函数大小调整策略""" - - def chunk_logic_blocks(self, macro_chunk, content): - total_lines = macro_chunk.metadata.end_line - macro_chunk.metadata.start_line - - if total_lines > 500: - # 超大函数:只提取顶层逻辑块,不递归 - return self._extract_top_level_blocks(macro_chunk, content) - elif total_lines > 100: - # 大函数:递归深度限制为2层 - return self._extract_blocks_with_depth_limit(macro_chunk, content, max_depth=2) - else: - # 正常函数:完全跳过micro chunking - return [] -``` - -### 9.2 问题:tree-sitter解析失败 - -**现象**:对于语法错误的代码,tree-sitter解析可能失败。 - -**解决方案**: -```python -def chunk_file_with_fallback(self, content, file_path, language): - """带降级策略的分词""" - - try: - # 尝试层级分词 - return self.chunk_file(content, file_path, language) - except TreeSitterError as e: - logger.warning(f"Tree-sitter parsing failed: {e}") - - # 降级到基于正则的简单symbol提取 - return self._fallback_regex_chunking(content, file_path) - except Exception as e: - logger.error(f"Chunking failed completely: {e}") - - # 最终降级到滑动窗口 - return self._fallback_sliding_window(content, file_path, language) -``` - -### 9.3 问题:向量存储空间占用 - -**现象**:每个chunk都存储向量,空间占用可能很大。 - -**解决方案**: -- **选择性向量化**:只对macro chunks和重要的micro chunks生成向量 -- **向量压缩**:使用PCA或量化技术减少向量维度 -- **分离存储**:向量存储在专门的向量数据库(如Faiss),SQLite只存元数据 - -```python -class SelectiveVectorization: - """选择性向量化:减少存储开销""" - - VECTORIZE_CHUNK_TYPES = { - 'function_definition', # 总是向量化 - 'class_definition', # 总是向量化 - 'for_statement', # 循环块 - 'try_statement', # 异常处理 - # 'if_statement' 通常不单独向量化,依赖父chunk - } - - def should_vectorize(self, chunk: HierarchicalChunk) -> bool: - """判断是否需要为chunk生成向量""" - - # Level 1总是向量化 - if chunk.metadata.level == 1: - return True - - # Level 2根据类型和大小决定 - if chunk.metadata.chunk_type not in self.VECTORIZE_CHUNK_TYPES: - return False - - # 太小的块(<5行)不向量化 - lines = chunk.metadata.end_line - chunk.metadata.start_line - if lines < 5: - return 
False - - return True -``` - -## 10. 实施路线图 - -### Phase 1: 基础架构(2-3周) -- [x] 设计数据结构(HierarchicalChunk, ChunkMetadata) -- [ ] 实现MacroChunker(复用现有code_extractor) -- [ ] 实现基础的MicroChunker -- [ ] 数据库schema设计和migration -- [ ] 单元测试 - -### Phase 2: LLM集成(1-2周) -- [ ] 实现HierarchicalLLMEnhancer -- [ ] 设计分层prompt模板 -- [ ] 批量处理优化 -- [ ] 集成测试 - -### Phase 3: 向量化与检索(1-2周) -- [ ] 实现HierarchicalVectorStore -- [ ] 实现ContextualSearchEngine -- [ ] 上下文扩展逻辑 -- [ ] 检索性能测试 - -### Phase 4: 优化与完善(2周) -- [ ] 性能优化(批量处理、增量更新) -- [ ] 降级策略完善 -- [ ] 选择性向量化 -- [ ] 全面测试和文档 - -### Phase 5: 生产部署(1周) -- [ ] CLI集成 -- [ ] 配置选项暴露 -- [ ] 生产环境测试 -- [ ] 发布 - -**总计预估时间**:7-10周 - -## 11. 成功指标 - -1. **覆盖率**:95%以上的代码能被正确分词 -2. **准确率**:层级关系准确率>98% -3. **检索质量**:相比单层分词,检索相关性提升30%+ -4. **性能**:单文件分词<100ms,批量处理>100文件/分钟 -5. **存储效率**:相比全向量化,空间占用减少40%+ - -## 12. 参考资料 - -- [Tree-sitter Documentation](https://tree-sitter.github.io/) -- [AST-based Code Analysis](https://en.wikipedia.org/wiki/Abstract_syntax_tree) -- [Hierarchical Text Segmentation](https://arxiv.org/abs/2104.08836) -- 现有代码:`src/codexlens/semantic/chunker.py` diff --git a/codex-lens/docs/PURE_VECTOR_SEARCH_GUIDE.md b/codex-lens/docs/PURE_VECTOR_SEARCH_GUIDE.md deleted file mode 100644 index e4c54f78..00000000 --- a/codex-lens/docs/PURE_VECTOR_SEARCH_GUIDE.md +++ /dev/null @@ -1,417 +0,0 @@ -# Pure Vector Search 使用指南 - -## 概述 - -CodexLens 现在支持纯向量语义搜索!这是一个重要的新功能,允许您使用自然语言查询代码。 - -### 新增搜索模式 - -| 模式 | 描述 | 最佳用途 | 需要嵌入 | -|------|------|----------|---------| -| `exact` | 精确FTS匹配 | 代码标识符搜索 | ✗ | -| `fuzzy` | 模糊FTS匹配 | 容错搜索 | ✗ | -| `vector` | 向量 + FTS后备 | 语义 + 关键词混合 | ✓ | -| **`pure-vector`** | **纯向量搜索** | **纯自然语言查询** | **✓** | -| `hybrid` | 全部融合(RRF) | 最佳召回率 | ✓ | - -### 关键变化 - -**之前**: -```bash -# "vector"模式实际上总是包含exact FTS搜索 -codexlens search "authentication" --mode vector -# 即使没有嵌入,也会返回FTS结果 -``` - -**现在**: -```bash -# "vector"模式仍保持向量+FTS混合(向后兼容) -codexlens search "authentication" --mode vector - -# 新的"pure-vector"模式:仅使用向量搜索 -codexlens search "how to 
authenticate users" --mode pure-vector -# 没有嵌入时返回空列表(明确行为) -``` - -## 快速开始 - -### 步骤1:安装语义搜索依赖 - -```bash -# 方式1:使用可选依赖 -pip install codexlens[semantic] - -# 方式2:手动安装 -pip install fastembed numpy -``` - -### 步骤2:创建索引(如果还没有) - -```bash -# 为项目创建索引 -codexlens init ~/projects/your-project -``` - -### 步骤3:生成向量嵌入 - -```bash -# 为项目生成嵌入(自动查找索引) -codexlens embeddings-generate ~/projects/your-project - -# 为特定索引生成嵌入 -codexlens embeddings-generate ~/.codexlens/indexes/your-project/_index.db - -# 使用特定模型 -codexlens embeddings-generate ~/projects/your-project --model fast - -# 强制重新生成 -codexlens embeddings-generate ~/projects/your-project --force - -# 检查嵌入状态 -codexlens embeddings-status # 检查所有索引 -codexlens embeddings-status ~/projects/your-project # 检查特定项目 -``` - -**可用模型**: -- `fast`: BAAI/bge-small-en-v1.5 (384维, ~80MB) - 快速,轻量级 -- `code`: jinaai/jina-embeddings-v2-base-code (768维, ~150MB) - **代码优化**(推荐,默认) -- `multilingual`: intfloat/multilingual-e5-large (1024维, ~1GB) - 多语言 -- `balanced`: mixedbread-ai/mxbai-embed-large-v1 (1024维, ~600MB) - 高精度 - -### 步骤4:使用纯向量搜索 - -```bash -# 纯向量搜索(自然语言) -codexlens search "how to verify user credentials" --mode pure-vector - -# 向量搜索(带FTS后备) -codexlens search "authentication logic" --mode vector - -# 混合搜索(最佳效果) -codexlens search "user login" --mode hybrid - -# 精确代码搜索 -codexlens search "authenticate_user" --mode exact -``` - -## 使用场景 - -### 场景1:查找实现特定功能的代码 - -**问题**:"我如何在这个项目中处理用户身份验证?" - -```bash -codexlens search "verify user credentials and authenticate" --mode pure-vector -``` - -**优势**:理解查询意图,找到语义相关的代码,而不仅仅是关键词匹配。 - -### 场景2:查找类似的代码模式 - -**问题**:"项目中哪些地方使用了密码哈希?" - -```bash -codexlens search "password hashing with salt" --mode pure-vector -``` - -**优势**:找到即使没有包含"hash"或"password"关键词的相关代码。 - -### 场景3:探索性搜索 - -**问题**:"如何在这个项目中连接数据库?" 
- -```bash -codexlens search "database connection and initialization" --mode pure-vector -``` - -**优势**:发现相关代码,即使使用了不同的术语(如"DB"、"connection pool"、"session")。 - -### 场景4:混合搜索获得最佳效果 - -**问题**:既要关键词匹配,又要语义理解 - -```bash -# 最佳实践:使用hybrid模式 -codexlens search "authentication" --mode hybrid -``` - -**优势**:结合FTS的精确性和向量搜索的语义理解。 - -## 故障排除 - -### 问题1:纯向量搜索返回空结果 - -**原因**:未生成向量嵌入 - -**解决方案**: -```bash -# 检查嵌入状态 -codexlens embeddings-status ~/projects/your-project - -# 生成嵌入 -codexlens embeddings-generate ~/projects/your-project - -# 或者对特定索引 -codexlens embeddings-generate ~/.codexlens/indexes/your-project/_index.db -``` - -### 问题2:ImportError: fastembed not found - -**原因**:未安装语义搜索依赖 - -**解决方案**: -```bash -pip install codexlens[semantic] -``` - -### 问题3:嵌入生成失败 - -**原因**:模型下载失败或磁盘空间不足 - -**解决方案**: -```bash -# 使用更小的模型 -codexlens embeddings-generate ~/projects/your-project --model fast - -# 检查磁盘空间(模型需要~100MB) -df -h ~/.cache/fastembed -``` - -### 问题4:搜索速度慢 - -**原因**:向量搜索比FTS慢(需要计算余弦相似度) - -**优化**: -- 使用`--limit`限制结果数量 -- 考虑使用`vector`模式(带FTS后备)而不是`pure-vector` -- 对于精确标识符搜索,使用`exact`模式 - -## 性能对比 - -基于测试数据(100个文件,~500个代码块): - -| 模式 | 平均延迟 | 召回率 | 精确率 | -|------|---------|--------|--------| -| exact | 5.6ms | 中 | 高 | -| fuzzy | 7.7ms | 高 | 中 | -| vector | 7.4ms | 高 | 中 | -| **pure-vector** | **7.0ms** | **最高** | **中** | -| hybrid | 9.0ms | 最高 | 高 | - -**结论**: -- `exact`: 最快,适合代码标识符 -- `pure-vector`: 与vector类似速度,更明确的语义搜索 -- `hybrid`: 轻微开销,但召回率和精确率最佳 - -## 最佳实践 - -### 1. 选择合适的搜索模式 - -```bash -# 查找函数名/类名/变量名 → exact -codexlens search "UserAuthentication" --mode exact - -# 自然语言问题 → pure-vector -codexlens search "how to hash passwords securely" --mode pure-vector - -# 不确定用哪个 → hybrid -codexlens search "password security" --mode hybrid -``` - -### 2. 
优化查询 - -**不好的查询**(对向量搜索): -```bash -codexlens search "auth" --mode pure-vector # 太模糊 -``` - -**好的查询**: -```bash -codexlens search "authenticate user with username and password" --mode pure-vector -``` - -**原则**: -- 使用完整句子描述意图 -- 包含关键动词和名词 -- 避免过于简短或模糊的查询 - -### 3. 定期更新嵌入 - -```bash -# 当代码更新后,重新生成嵌入 -codexlens embeddings-generate ~/projects/your-project --force -``` - -### 4. 监控嵌入存储空间 - -```bash -# 检查嵌入数据大小 -du -sh ~/.codexlens/indexes/*/ - -# 嵌入通常占用索引大小的2-3倍 -# 100个文件 → ~500个chunks → ~1.5MB (768维向量) -``` - -## API 使用示例 - -### Python API - -```python -from pathlib import Path -from codexlens.search.hybrid_search import HybridSearchEngine - -# 初始化引擎 -engine = HybridSearchEngine() - -# 纯向量搜索 -results = engine.search( - index_path=Path("~/.codexlens/indexes/project/_index.db"), - query="how to authenticate users", - limit=10, - enable_vector=True, - pure_vector=True, # 纯向量模式 -) - -for result in results: - print(f"{result.path}: {result.score:.3f}") - print(f" {result.excerpt}") - -# 向量搜索(带FTS后备) -results = engine.search( - index_path=Path("~/.codexlens/indexes/project/_index.db"), - query="authentication", - limit=10, - enable_vector=True, - pure_vector=False, # 允许FTS后备 -) -``` - -### 链式搜索API - -```python -from codexlens.search.chain_search import ChainSearchEngine, SearchOptions -from codexlens.storage.registry import RegistryStore -from codexlens.storage.path_mapper import PathMapper - -# 初始化 -registry = RegistryStore() -registry.initialize() -mapper = PathMapper() -engine = ChainSearchEngine(registry, mapper) - -# 配置搜索选项 -options = SearchOptions( - depth=-1, # 无限深度 - total_limit=20, - hybrid_mode=True, - enable_vector=True, - pure_vector=True, # 纯向量搜索 -) - -# 执行搜索 -result = engine.search( - query="verify user credentials", - source_path=Path("~/projects/my-app"), - options=options -) - -print(f"Found {len(result.results)} results in {result.stats.time_ms:.1f}ms") -``` - -## 技术细节 - -### 向量存储架构 - -``` -_index.db (SQLite) -├── files # 文件索引表 -├── files_fts # FTS5全文索引 
-├── files_fts_fuzzy # 模糊搜索索引 -└── semantic_chunks # 向量嵌入表 ✓ 新增 - ├── id - ├── file_path - ├── content # 代码块内容 - ├── embedding # 向量嵌入(BLOB, float32) - ├── metadata # JSON元数据 - └── created_at -``` - -### 向量搜索流程 - -``` -1. 查询嵌入化 - └─ query → Embedder → query_embedding (768维向量) - -2. 相似度计算 - └─ VectorStore.search_similar() - ├─ 加载embedding matrix到内存 - ├─ NumPy向量化余弦相似度计算 - └─ Top-K选择 - -3. 结果返回 - └─ SearchResult对象列表 - ├─ path: 文件路径 - ├─ score: 相似度分数 - ├─ excerpt: 代码片段 - └─ metadata: 元数据 -``` - -### RRF融合算法 - -混合模式使用Reciprocal Rank Fusion (RRF): - -```python -# 默认权重 -weights = { - "exact": 0.4, # 40% 精确FTS - "fuzzy": 0.3, # 30% 模糊FTS - "vector": 0.3, # 30% 向量搜索 -} - -# RRF公式 -score(doc) = Σ weight[source] / (k + rank[source]) -k = 60 # RRF常数 -``` - -## 未来改进 - -- [ ] 增量嵌入更新(当前需要完全重新生成) -- [ ] 混合分块策略(symbol-based + sliding window) -- [ ] FAISS加速(100x+速度提升) -- [ ] 向量压缩(减少50%存储空间) -- [ ] 查询扩展(同义词、相关术语) -- [ ] 多模态搜索(代码 + 文档 + 注释) - -## 相关资源 - -- **实现文件**: - - `codexlens/search/hybrid_search.py` - 混合搜索引擎 - - `codexlens/semantic/embedder.py` - 嵌入生成 - - `codexlens/semantic/vector_store.py` - 向量存储 - - `codexlens/semantic/chunker.py` - 代码分块 - -- **测试文件**: - - `tests/test_pure_vector_search.py` - 纯向量搜索测试 - - `tests/test_search_comparison.py` - 搜索模式对比 - -- **文档**: - - `SEARCH_COMPARISON_ANALYSIS.md` - 详细技术分析 - - `SEARCH_ANALYSIS_SUMMARY.md` - 快速总结 - -## 反馈和贡献 - -如果您发现问题或有改进建议,请提交issue或PR: -- GitHub: https://github.com/your-org/codexlens - -## 更新日志 - -### v0.5.0 (2025-12-16) -- ✨ 新增 `pure-vector` 搜索模式 -- ✨ 添加向量嵌入生成脚本 -- 🔧 修复"vector"模式总是包含exact FTS的问题 -- 📚 更新文档和使用指南 -- ✅ 添加纯向量搜索测试套件 - ---- - -**问题?** 查看 [故障排除](#故障排除) 章节或提交issue。 diff --git a/codex-lens/docs/REAL_LSP_SERVER_PLAN.md b/codex-lens/docs/REAL_LSP_SERVER_PLAN.md deleted file mode 100644 index 015eb926..00000000 --- a/codex-lens/docs/REAL_LSP_SERVER_PLAN.md +++ /dev/null @@ -1,825 +0,0 @@ -# CodexLens Real LSP Server Implementation Plan - -> **Version**: 2.0 -> **Status**: Ready for Implementation -> **Based on**: Existing 
LSP_INTEGRATION_PLAN.md + Real Language Server Integration -> **Goal**: Implement true LSP server functionality (like cclsp), not pre-indexed search - ---- - -## Executive Summary - -### Current State vs Target State - -| Aspect | Current (Pre-indexed) | Target (Real LSP) | -|--------|----------------------|-------------------| -| **Data Source** | Cached database index | Live language servers | -| **Freshness** | Stale (depends on re-index) | Real-time (LSP protocol) | -| **Accuracy** | Good for indexed content | Perfect (from language server) | -| **Latency** | <50ms (database) | ~50-200ms (LSP) | -| **Language Support** | Limited to parsed symbols | Full LSP support (all languages) | -| **Complexity** | Simple (DB queries) | High (LSP protocol + server mgmt) | - -### Why Real LSP vs Index-Based - -**Problem with current approach**: -- 符号搜索与smart_search没有本质区别 -- 依赖预索引数据,不能实时反映代码变化 -- 不支持advanced LSP功能(rename, code actions等) - -**Advantages of real LSP**: -- ✅ Real-time code intelligence -- ✅ Supported by all major IDEs (VSCode, Neovim, Sublime, etc.) -- ✅ Standard protocol (Language Server Protocol) -- ✅ Advanced features: rename, code actions, formatting -- ✅ Language-agnostic (TypeScript, Python, Go, Rust, Java, etc.) 
- ---- - -## Architecture Design - -### System Architecture - -``` -┌─────────────────────────────────────────────────────────┐ -│ Client Layer │ -│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ -│ │ VS Code │ │ Neovim │ │ Sublime │ │ -│ │ (LSP Client) │ │ (LSP Client) │ │ (LSP Client) │ │ -│ └──────┬───────┘ └──────┬───────┘ └──────┬───────┘ │ -│ │ │ │ │ -└─────────┼─────────────────┼─────────────────┼───────────┘ - │ LSP Protocol │ │ - │ (JSON-RPC/stdio)│ │ -┌─────────▼─────────────────▼─────────────────▼───────────┐ -│ CodexLens LSP Server Bridge │ -│ ┌─────────────────────────────────────────────────────┐ │ -│ │ LSP Protocol Handler (pygls) │ │ -│ │ • initialize / shutdown │ │ -│ │ • textDocument/definition │ │ -│ │ • textDocument/references │ │ -│ │ • textDocument/hover │ │ -│ │ • textDocument/completion │ │ -│ │ • textDocument/formatting │ │ -│ │ • workspace/symbol │ │ -│ └────────────────────┬────────────────────────────────┘ │ -│ │ │ -│ ┌────────────────────▼────────────────────────────────┐ │ -│ │ Language Server Multiplexer │ │ -│ │ • File type routing (ts→tsserver, py→pylsp, etc.) │ │ -│ │ • Multi-server management │ │ -│ │ • Request forwarding & response formatting │ │ -│ └────────────────────┬────────────────────────────────┘ │ -│ │ │ -│ ┌────────────────────▼────────────────────────────────┐ │ -│ │ Language Servers (Spawned) │ │ -│ │ ┌────────┐ ┌────────┐ ┌────────┐ ┌────────┐ │ │ -│ │ │tsserver│ │ pylsp │ │ gopls │ │rust- │ │ │ -│ │ │ │ │ │ │ │ │analyzer│ │ │ -│ │ └────────┘ └────────┘ └────────┘ └────────┘ │ │ -│ └─────────────────────────────────────────────────────┘ │ -│ │ -│ ┌─────────────────────────────────────────────────────┐ │ -│ │ Codex-Lens Core (Optional - MCP Layer) │ │ -│ │ • Semantic search │ │ -│ │ • Custom MCP tools (enrich_prompt, etc.) 
│ │ -│ │ • Hook system (pre-tool, post-tool) │ │ -│ └─────────────────────────────────────────────────────┘ │ -└─────────────────────────────────────────────────────────┘ -``` - -### Key Differences from Index-Based Approach - -1. **Request Flow** - - Index: Query → Database → Results - - LSP: Request → Route to LS → LS processes live code → Results - -2. **Configuration** - - Index: Depends on indexing state - - LSP: Depends on installed language servers - -3. **Latency Profile** - - Index: Consistent (~50ms) - - LSP: Variable (50-500ms depending on LS performance) - ---- - -## Implementation Phases - -### Phase 1: LSP Server Bridge (Foundation) - -**Duration**: ~3-5 days -**Complexity**: Medium -**Dependencies**: pygls library - -#### 1.1 Setup & Dependencies - -**File**: `pyproject.toml` - -```toml -[project.optional-dependencies] -lsp = [ - "pygls>=1.3.0", - "lsprotocol>=2023.0.0", -] - -[project.scripts] -codexlens-lsp = "codexlens.lsp.server:main" -``` - -**Installation**: -```bash -pip install -e ".[lsp]" -``` - -#### 1.2 LSP Server Core - -**Files to create**: -1. `src/codexlens/lsp/__init__.py` - Package init -2. `src/codexlens/lsp/server.py` - Server entry point -3. `src/codexlens/lsp/multiplexer.py` - LS routing & management -4. 
`src/codexlens/lsp/handlers.py` - LSP request handlers - -**Key responsibilities**: -- Initialize LSP server via pygls -- Handle client capabilities negotiation -- Route requests to appropriate language servers -- Format language server responses to LSP format - -#### 1.3 Acceptance Criteria - -- [ ] Server starts with `codexlens-lsp --stdio` -- [ ] Responds to `initialize` request -- [ ] Spawns language servers on demand -- [ ] Handles `shutdown` cleanly -- [ ] No crashes on malformed requests - ---- - -### Phase 2: Language Server Multiplexer - -**Duration**: ~5-7 days -**Complexity**: High -**Dependencies**: Phase 1 complete - -#### 2.1 Multi-Server Management - -**File**: `src/codexlens/lsp/multiplexer.py` - -**Responsibilities**: -- Spawn language servers based on file extension -- Maintain server process lifecycle -- Route requests by document type -- Handle server crashes & restarts - -**Supported Language Servers**: - -| Language | Server | Installation | -|----------|--------|--------------| -| TypeScript | `typescript-language-server` | `npm i -g typescript-language-server` | -| Python | `pylsp` | `pip install python-lsp-server` | -| Go | `gopls` | `go install golang.org/x/tools/gopls@latest` | -| Rust | `rust-analyzer` | `rustup component add rust-analyzer` | -| Java | `jdtls` | Download JDTLS | -| C/C++ | `clangd` | `apt install clangd` | - -#### 2.2 Configuration - -**File**: `codexlens-lsp.json` (user config) - -```json -{ - "languageServers": { - "typescript": { - "command": ["typescript-language-server", "--stdio"], - "extensions": ["ts", "tsx", "js", "jsx"], - "rootDir": "." - }, - "python": { - "command": ["pylsp"], - "extensions": ["py", "pyi"], - "rootDir": ".", - "settings": { - "pylsp": { - "plugins": { - "pycodestyle": { "enabled": true }, - "pylint": { "enabled": false } - } - } - } - }, - "go": { - "command": ["gopls"], - "extensions": ["go"], - "rootDir": "." 
- }, - "rust": { - "command": ["rust-analyzer"], - "extensions": ["rs"], - "rootDir": "." - } - }, - "debug": false, - "logLevel": "info" -} -``` - -#### 2.3 Acceptance Criteria - -- [ ] Routes requests to correct LS based on file type -- [ ] Spawns servers on first request -- [ ] Reuses existing server instances -- [ ] Handles server restarts on crash -- [ ] Respects initialization options from config - ---- - -### Phase 3: Core LSP Handlers - -**Duration**: ~5-7 days -**Complexity**: Medium -**Dependencies**: Phase 1-2 complete - -#### 3.1 Essential Handlers - -Implement LSP request handlers for core functionality: - -**Handler Mapping**: - -```python -Handlers = { - # Navigation - "textDocument/definition": handle_definition, - "textDocument/references": handle_references, - "textDocument/declaration": handle_declaration, - - # Hover & Info - "textDocument/hover": handle_hover, - "textDocument/signatureHelp": handle_signature_help, - - # Completion - "textDocument/completion": handle_completion, - "completionItem/resolve": handle_completion_resolve, - - # Symbols - "textDocument/documentSymbol": handle_document_symbols, - "workspace/symbol": handle_workspace_symbols, - - # Editing - "textDocument/formatting": handle_formatting, - "textDocument/rangeFormatting": handle_range_formatting, - "textDocument/rename": handle_rename, - - # Diagnostics - "textDocument/publishDiagnostics": handle_publish_diagnostics, - - # Misc - "textDocument/codeAction": handle_code_action, - "textDocument/codeLens": handle_code_lens, -} -``` - -#### 3.2 Request Forwarding Logic - -```python -def forward_request_to_lsp(handler_name, params): - """Forward request to appropriate language server.""" - - # Extract document info - document_uri = params.get("textDocument", {}).get("uri") - file_ext = extract_extension(document_uri) - - # Get language server - ls = multiplexer.get_server(file_ext) - if not ls: - return {"error": f"No LS for {file_ext}"} - - # Convert position (1-based → 
0-based) - normalized_params = normalize_positions(params) - - # Forward to LS - response = ls.send_request(handler_name, normalized_params) - - # Convert response format - return normalize_response(response) -``` - -#### 3.3 Acceptance Criteria - -- [ ] All handlers implemented and tested -- [ ] Proper position coordinate conversion (LSP is 0-based, user-facing is 1-based) -- [ ] Error handling for missing language servers -- [ ] Response formatting matches LSP spec -- [ ] Latency < 500ms for 95th percentile - ---- - -### Phase 4: Advanced Features - -**Duration**: ~3-5 days -**Complexity**: Medium -**Dependencies**: Phase 1-3 complete - -#### 4.1 Position Tolerance (cclsp-like feature) - -Some LSP clients (like Claude Code with fuzzy positions) may send imprecise positions. Implement retry logic: - -```python -def find_symbol_with_tolerance(ls, uri, position, max_attempts=5): - """Try multiple position offsets if exact position fails.""" - - positions_to_try = [ - position, # Original - (position.line - 1, position.char), # One line up - (position.line + 1, position.char), # One line down - (position.line, max(0, position.char - 1)), # One char left - (position.line, position.char + 1), # One char right - ] - - for pos in positions_to_try: - try: - result = ls.send_request("textDocument/definition", { - "textDocument": {"uri": uri}, - "position": pos - }) - if result: - return result - except: - continue - - return None -``` - -#### 4.2 MCP Integration (Optional) - -Extend with MCP provider for Claude Code hooks: - -```python -class MCPBridgeHandler: - """Bridge LSP results into MCP context.""" - - def build_mcp_context_from_lsp(self, symbol_name, lsp_results): - """Convert LSP responses to MCP context.""" - # Implementation - pass -``` - -#### 4.3 Acceptance Criteria - -- [ ] Position tolerance working (≥3 positions tried) -- [ ] MCP context generation functional -- [ ] Hook system integration complete -- [ ] All test coverage > 80% - ---- - -### Phase 5: 
Deployment & Documentation - -**Duration**: ~2-3 days -**Complexity**: Low -**Dependencies**: Phase 1-4 complete - -#### 5.1 Installation & Setup Guide - -Create comprehensive documentation: -- Installation instructions for each supported language -- Configuration guide -- Troubleshooting -- Performance tuning - -#### 5.2 CLI Tools - -```bash -# Start LSP server -codexlens-lsp --stdio - -# Check configured language servers -codexlens-lsp --list-servers - -# Validate configuration -codexlens-lsp --validate-config - -# Show logs -codexlens-lsp --log-level debug -``` - -#### 5.3 Acceptance Criteria - -- [ ] Documentation complete with examples -- [ ] All CLI commands working -- [ ] Integration tested with VS Code, Neovim -- [ ] Performance benchmarks documented - ---- - -## Module Structure - -``` -src/codexlens/lsp/ -├── __init__.py # Package exports -├── server.py # LSP server entry point -├── multiplexer.py # Language server manager -├── handlers.py # LSP request handlers -├── position_utils.py # Coordinate conversion utilities -├── process_manager.py # Language server process lifecycle -├── response_formatter.py # LSP response formatting -└── config.py # Configuration loading - -tests/lsp/ -├── test_multiplexer.py # LS routing tests -├── test_handlers.py # Handler tests -├── test_position_conversion.py # Coordinate tests -├── test_integration.py # Full LSP handshake -└── fixtures/ - ├── sample_python.py # Test files - └── sample_typescript.ts -``` - ---- - -## Dependency Graph - -``` -Phase 5 (Deployment) - ↑ -Phase 4 (Advanced Features) - ↑ -Phase 3 (Core Handlers) - ├─ Depends on: Phase 2 - ├─ Depends on: Phase 1 - └─ Deliverable: Full LSP functionality - -Phase 2 (Multiplexer) - ├─ Depends on: Phase 1 - └─ Deliverable: Multi-server routing - -Phase 1 (Server Bridge) - └─ Deliverable: Basic LSP server -``` - ---- - -## Technology Stack - -| Component | Technology | Rationale | -|-----------|-----------|-----------| -| LSP Implementation | `pygls` | Mature, 
well-maintained | -| Protocol | LSP 3.17+ | Latest stable version | -| Process Management | `subprocess` + `psutil` | `subprocess` is stdlib; `psutil` is one small third-party dependency | -| Configuration | JSON | Simple, widely understood | -| Logging | `logging` module | Built-in, standard | -| Testing | `pytest` + `pytest-asyncio` | Industry standard | - ---- - -## Risk Assessment - -| Risk | Probability | Impact | Mitigation | -|------|-------------|--------|------------| -| Language server crashes | Medium | High | Auto-restart with exponential backoff | -| Configuration errors | Medium | Medium | Validation on startup | -| Performance degradation | Low | High | Implement caching + benchmarks | -| Position mismatch issues | Medium | Low | Tolerance layer (try multiple positions) | -| Memory leaks (long sessions) | Low | Medium | Connection pooling + cleanup timers | - ---- - -## Success Metrics - -1. **Functionality**: All 7 core LSP handlers working -2. **Performance**: p95 latency < 500ms for typical requests -3. **Reliability**: 99.9% uptime in production -4. **Coverage**: >80% code coverage -5. **Documentation**: Complete with examples -6. **Multi-language**: Support for 5+ languages - ---- - -## Comparison: This Approach vs Alternatives - -### Option A: Real LSP Server (This Plan) ✅ RECOMMENDED -**Pros**: -- ✅ True real-time code intelligence -- ✅ Supports all LSP clients (VSCode, Neovim, Sublime, Emacs, etc.) 
-- ✅ Advanced features (rename, code actions, formatting) -- ✅ Language-agnostic -- ✅ Follows industry standard protocol - -**Cons**: -- ❌ More complex implementation -- ❌ Depends on external language servers -- ❌ Higher latency than index-based - -**Effort**: ~20-25 days - ---- - -### Option B: Enhanced Index-Based (Current Approach) -**Pros**: -- ✅ Simple implementation -- ✅ Fast (<50ms) -- ✅ No external dependencies - -**Cons**: -- ❌ Same as smart_search (user's concern) -- ❌ Stale data between re-indexes -- ❌ Limited to indexed symbols -- ❌ No advanced LSP features - -**Effort**: ~5-10 days - ---- - -### Option C: Hybrid (LSP + Index) -**Pros**: -- ✅ Real-time from LSP -- ✅ Fallback to index -- ✅ Best of both worlds - -**Cons**: -- ❌ Highest complexity -- ❌ Difficult to debug conflicts -- ❌ Higher maintenance burden - -**Effort**: ~30-35 days - ---- - -## Next Steps - -1. **Approve Plan**: Confirm this approach matches requirements -2. **Setup Dev Environment**: Install language servers -3. **Phase 1 Implementation**: Start with server bridge -4. **Iterative Testing**: Test each phase with real IDE integration -5. 
**Documentation**: Maintain docs as implementation progresses - ---- - ---- - -## Appendix A: VSCode Bridge Implementation - -### A.1 Overview - -VSCode Bridge 是另一种集成方式,通过VSCode扩展暴露其内置LSP功能给外部工具(如CCW MCP Server)。 - -**Architecture**: - -``` -┌─────────────────────────────────────────────────────────────────┐ -│ Claude Code / CCW │ -│ (MCP Client / CLI) │ -└───────────────────────────┬─────────────────────────────────────┘ - │ - │ MCP Tool Call (vscode_lsp) - │ -┌───────────────────────────▼─────────────────────────────────────┐ -│ CCW MCP Server │ -│ ┌─────────────────────────────────────────────────────────────┐ │ -│ │ vscode_lsp Tool │ │ -│ │ • HTTP client to VSCode Bridge │ │ -│ │ • Parameter validation (Zod) │ │ -│ │ • Response formatting │ │ -│ └────────────────────────┬────────────────────────────────────┘ │ -└───────────────────────────┼─────────────────────────────────────┘ - │ - │ HTTP POST (localhost:3457) - │ -┌───────────────────────────▼─────────────────────────────────────┐ -│ ccw-vscode-bridge Extension │ -│ ┌─────────────────────────────────────────────────────────────┐ │ -│ │ HTTP Server (port 3457) │ │ -│ │ Endpoints: │ │ -│ │ • POST /get_definition │ │ -│ │ • POST /get_references │ │ -│ │ • POST /get_hover │ │ -│ │ • POST /get_document_symbols │ │ -│ └────────────────────────┬────────────────────────────────────┘ │ -│ │ │ -│ ┌────────────────────────▼────────────────────────────────────┐ │ -│ │ VSCode API Calls │ │ -│ │ vscode.commands.executeCommand(): │ │ -│ │ • vscode.executeDefinitionProvider │ │ -│ │ • vscode.executeReferenceProvider │ │ -│ │ • vscode.executeHoverProvider │ │ -│ │ • vscode.executeDocumentSymbolProvider │ │ -│ └─────────────────────────────────────────────────────────────┘ │ -└─────────────────────────────────────────────────────────────────┘ - │ - │ VSCode LSP Integration - │ -┌───────────────────────────▼─────────────────────────────────────┐ -│ VSCode Language Services │ -│ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ 
-│ │TypeScript│ │ Python │ │ Go │ │ Rust │ │ -│ │ Server │ │ Server │ │ (gopls) │ │Analyzer │ │ -│ └─────────┘ └─────────┘ └─────────┘ └─────────┘ │ -└─────────────────────────────────────────────────────────────────┘ -``` - -### A.2 Component Files - -**已创建的文件**: - -1. `ccw-vscode-bridge/package.json` - VSCode扩展配置 -2. `ccw-vscode-bridge/tsconfig.json` - TypeScript配置 -3. `ccw-vscode-bridge/src/extension.ts` - 扩展主代码 -4. `ccw-vscode-bridge/.vscodeignore` - 打包排除文件 -5. `ccw-vscode-bridge/README.md` - 使用文档 - -**待创建的文件**: - -1. `ccw/src/tools/vscode-lsp.ts` - MCP工具实现 -2. `ccw/src/tools/index.ts` - 注册新工具 - -### A.3 VSCode Bridge Extension Implementation - -**File**: `ccw-vscode-bridge/src/extension.ts` - -```typescript -// 核心功能: -// 1. 启动HTTP服务器监听3457端口 -// 2. 接收POST请求,解析JSON body -// 3. 调用VSCode内置LSP命令 -// 4. 返回JSON结果 - -// HTTP Endpoints: -// POST /get_definition → vscode.executeDefinitionProvider -// POST /get_references → vscode.executeReferenceProvider -// POST /get_hover → vscode.executeHoverProvider -// POST /get_document_symbols → vscode.executeDocumentSymbolProvider -``` - -### A.4 MCP Tool Implementation - -**File**: `ccw/src/tools/vscode-lsp.ts` - -```typescript -/** - * MCP tool that communicates with VSCode Bridge extension. - * - * Actions: - * - get_definition: Find symbol definition - * - get_references: Find all references - * - get_hover: Get hover information - * - get_document_symbols: List symbols in file - * - * Required: - * - ccw-vscode-bridge extension running in VSCode - * - File must be open in VSCode for accurate results - */ - -const schema: ToolSchema = { - name: 'vscode_lsp', - description: `Access live VSCode LSP features...`, - inputSchema: { - type: 'object', - properties: { - action: { type: 'string', enum: [...] 
}, - file_path: { type: 'string' }, - line: { type: 'number' }, - character: { type: 'number' } - }, - required: ['action', 'file_path'] - } -}; -``` - -### A.5 Advantages vs Standalone LSP Server - -| Feature | VSCode Bridge | Standalone LSP Server | -|---------|--------------|----------------------| -| **Setup Complexity** | Low (VSCode ext) | Medium (multiple LS) | -| **Language Support** | Automatic (VSCode) | Manual config | -| **Maintenance** | Low | Medium | -| **IDE Independence** | VSCode only | Any LSP client | -| **Performance** | Good | Good | -| **Advanced Features** | Full VSCode support | LSP standard | - ---- - -## Appendix B: Complete Integration Architecture - -### B.1 Three Integration Paths - -``` -┌─────────────────────────────────────────────────────────────────────────────┐ -│ CodexLens Integration Paths │ -├─────────────────────────────────────────────────────────────────────────────┤ -│ │ -│ Path 1: VSCode Bridge (HTTP) Path 2: Standalone LSP Server │ -│ ──────────────────────── ───────────────────────────── │ -│ │ -│ ┌─────────────┐ ┌─────────────┐ │ -│ │ CCW MCP │ │ Any LSP │ │ -│ │ vscode_lsp │ │ Client │ │ -│ └──────┬──────┘ └──────┬──────┘ │ -│ │ HTTP │ LSP/stdio │ -│ ▼ ▼ │ -│ ┌─────────────┐ ┌─────────────┐ │ -│ │ ccw-vscode │ │ codexlens- │ │ -│ │ -bridge │ │ lsp │ │ -│ └──────┬──────┘ └──────┬──────┘ │ -│ │ VSCode API │ Child Process │ -│ ▼ ▼ │ -│ ┌─────────────┐ ┌─────────────┐ │ -│ │ VSCode │ │ pylsp │ │ -│ │ LS │ │ tsserver │ │ -│ └─────────────┘ │ gopls │ │ -│ └─────────────┘ │ -│ │ -│ Path 3: Index-Based (Current) │ -│ ───────────────────────────── │ -│ │ -│ ┌─────────────┐ │ -│ │ CCW MCP │ │ -│ │codex_lens_lsp│ │ -│ └──────┬──────┘ │ -│ │ Python subprocess │ -│ ▼ │ -│ ┌─────────────┐ │ -│ │ CodexLens │ │ -│ │ Index DB │ │ -│ └─────────────┘ │ -│ │ -└─────────────────────────────────────────────────────────────────────────────┘ -``` - -### B.2 Recommendation Matrix - -| Use Case | Recommended Path | Reason | 
-|----------|-----------------|--------| -| Claude Code + VSCode | Path 1: VSCode Bridge | Simplest, full VSCode features | -| CLI-only workflows | Path 2: Standalone LSP | No VSCode dependency | -| Quick search across indexed code | Path 3: Index-based | Fastest response | -| Multi-IDE support | Path 2: Standalone LSP | Standard protocol | -| Advanced refactoring | Path 1: VSCode Bridge | Full VSCode capabilities | - -### B.3 Hybrid Mode (Recommended) - -For maximum flexibility, implement all three paths: - -```javascript -// Smart routing in CCW -async function selectLSPPath(request) { - // 1. Try VSCode Bridge first (if available) - if (await checkVSCodeBridge()) { - return "vscode_bridge"; - } - - // 2. Fall back to Standalone LSP - if (await checkStandaloneLSP(request.fileType)) { - return "standalone_lsp"; - } - - // 3. Last resort: Index-based - return "index_based"; -} -``` - ---- - -## Appendix C: Implementation Tasks Summary - -### C.1 VSCode Bridge Tasks - -| Task ID | Description | Priority | Status | -|---------|-------------|----------|--------| -| VB-1 | Create ccw-vscode-bridge extension structure | High | ✅ Done | -| VB-2 | Implement HTTP server in extension.ts | High | ✅ Done | -| VB-3 | Create vscode_lsp MCP tool | High | 🔄 Pending | -| VB-4 | Register tool in CCW | High | 🔄 Pending | -| VB-5 | Test with VSCode | Medium | 🔄 Pending | -| VB-6 | Add connection retry logic | Low | 🔄 Pending | - -### C.2 Standalone LSP Server Tasks - -| Task ID | Description | Priority | Status | -|---------|-------------|----------|--------| -| LSP-1 | Setup pygls project structure | High | 🔄 Pending | -| LSP-2 | Implement multiplexer | High | 🔄 Pending | -| LSP-3 | Core handlers (definition, references) | High | 🔄 Pending | -| LSP-4 | Position tolerance | Medium | 🔄 Pending | -| LSP-5 | Tests and documentation | Medium | 🔄 Pending | - -### C.3 Integration Tasks - -| Task ID | Description | Priority | Status | -|---------|-------------|----------|--------| -| INT-1 | 
Smart path routing | Medium | 🔄 Pending | -| INT-2 | Unified error handling | Medium | 🔄 Pending | -| INT-3 | Performance benchmarks | Low | 🔄 Pending | - ---- - -## Questions for Clarification - -Before implementation, confirm: - -1. **Implementation Priority**: Start with VSCode Bridge (simpler) or Standalone LSP (more general)? -2. **Language Priority**: Which languages are most important? (TypeScript, Python, Go, Rust, etc.) -3. **IDE Focus**: Target VS Code first, then others? -4. **Fallback Strategy**: Should we keep index-based search as fallback if LSP fails? -5. **Caching**: How much should we cache LS responses? -6. **Configuration**: Simple JSON config or more sophisticated format? - diff --git a/codex-lens/docs/SEARCH_ANALYSIS_SUMMARY.md b/codex-lens/docs/SEARCH_ANALYSIS_SUMMARY.md deleted file mode 100644 index d0d4c676..00000000 --- a/codex-lens/docs/SEARCH_ANALYSIS_SUMMARY.md +++ /dev/null @@ -1,192 +0,0 @@ -# CodexLens 搜索分析 - 执行摘要 - -## 🎯 核心发现 - -### 问题1:向量搜索为什么返回空结果? - -**根本原因**:向量嵌入数据不存在 - -- ✗ `semantic_chunks` 表未创建 -- ✗ 从未执行向量嵌入生成流程 -- ✗ 向量索引数据库实际是 SQLite 中的一个表,不是独立文件 - -**位置**:向量数据存储在 `~/.codexlens/indexes/项目名/_index.db` 的 `semantic_chunks` 表中 - -### 问题2:向量索引数据库在哪里? - -**存储架构**: -``` -~/.codexlens/indexes/ -└── project-name/ - └── _index.db ← SQLite数据库 - ├── files ← 文件索引表 - ├── files_fts ← FTS5全文索引 - ├── files_fts_fuzzy ← 模糊搜索索引 - └── semantic_chunks ← 向量嵌入表(当前不存在!) -``` - -**不是独立数据库**:向量数据集成在 SQLite 索引文件中,而不是单独的向量数据库。 - -### 问题3:当前架构是否发挥了并行效果? 
- -**✓ 是的!架构非常优秀** - -- **双层并行**: - - 第1层:单索引内,exact/fuzzy/vector 三种搜索方法并行 - - 第2层:跨多个目录索引并行搜索 -- **性能表现**:混合模式仅增加 1.6x 开销(9ms vs 5.6ms) -- **资源利用**:ThreadPoolExecutor 充分利用 I/O 并发 - -## ⚡ 快速修复 - -### 立即解决向量搜索问题 - -**步骤1:安装依赖** -```bash -pip install codexlens[semantic] -# 或 -pip install fastembed numpy -``` - -**步骤2:生成向量嵌入** - -创建脚本 `generate_embeddings.py`: -```python -from pathlib import Path -from codexlens.semantic.embedder import Embedder -from codexlens.semantic.vector_store import VectorStore -from codexlens.semantic.chunker import Chunker, ChunkConfig -import sqlite3 - -def generate_embeddings(index_db_path: Path): - embedder = Embedder(profile="code") - vector_store = VectorStore(index_db_path) - chunker = Chunker(config=ChunkConfig(max_chunk_size=2000)) - - with sqlite3.connect(index_db_path) as conn: - conn.row_factory = sqlite3.Row - files = conn.execute("SELECT full_path, content FROM files").fetchall() - - for file_row in files: - chunks = chunker.chunk_sliding_window( - file_row["content"], - file_path=file_row["full_path"], - language="python" - ) - for chunk in chunks: - chunk.embedding = embedder.embed_single(chunk.content) - if chunks: - vector_store.add_chunks(chunks, file_row["full_path"]) -``` - -**步骤3:执行生成** -```bash -python generate_embeddings.py ~/.codexlens/indexes/codex-lens/_index.db -``` - -**步骤4:验证** -```bash -# 检查数据 -sqlite3 ~/.codexlens/indexes/codex-lens/_index.db \ - "SELECT COUNT(*) FROM semantic_chunks" - -# 测试搜索 -codexlens search "authentication credentials" --mode vector -``` - -## 🔍 关键洞察 - -### 发现:Vector模式不是纯向量搜索 - -**当前行为**: -```python -# hybrid_search.py:73 -backends = {"exact": True} # ⚠️ exact搜索总是启用! 
-if enable_vector: - backends["vector"] = True -``` - -**影响**: -- "vector模式"实际是 **vector + exact 混合模式** -- 即使向量搜索返回空,仍有exact FTS结果 -- 这就是为什么"向量搜索"在无嵌入时也有结果 - -**建议修复**:添加 `pure_vector` 参数以支持真正的纯向量搜索 - -## 📊 搜索模式对比 - -| 模式 | 延迟 | 召回率 | 适用场景 | 需要嵌入 | -|------|------|--------|----------|---------| -| **exact** | 5.6ms | 中 | 代码标识符 | ✗ | -| **fuzzy** | 7.7ms | 高 | 容错搜索 | ✗ | -| **vector** | 7.4ms | 最高 | 语义搜索 | ✓ | -| **hybrid** | 9.0ms | 最高 | 通用搜索 | ✓ | - -**推荐**: -- 代码搜索 → `--mode exact` -- 自然语言 → `--mode hybrid`(需先生成嵌入) -- 容错搜索 → `--mode fuzzy` - -## 📈 优化路线图 - -### P0 - 立即 (本周) -- [x] 生成向量嵌入 -- [ ] 验证向量搜索可用 -- [ ] 更新使用文档 - -### P1 - 短期 (2周) -- [ ] 添加 `pure_vector` 模式 -- [ ] 增量嵌入更新 -- [ ] 改进错误提示 - -### P2 - 中期 (1-2月) -- [ ] 混合分块策略 -- [ ] 查询扩展 -- [ ] 自适应权重 - -### P3 - 长期 (3-6月) -- [ ] FAISS加速 -- [ ] 向量压缩 -- [ ] 多模态搜索 - -## 📚 详细文档 - -完整分析报告:`SEARCH_COMPARISON_ANALYSIS.md` - -包含内容: -- 详细问题诊断 -- 架构深度分析 -- 完整解决方案 -- 代码示例 -- 实施检查清单 - -## 🎓 学习要点 - -1. **向量搜索需要主动生成嵌入**:不会自动创建 -2. **双层并行架构很优秀**:无需额外优化 -3. **RRF融合算法工作良好**:多源结果合理融合 -4. **Vector模式非纯向量**:包含FTS作为后备 - -## 💡 下一步行动 - -```bash -# 1. 安装依赖 -pip install codexlens[semantic] - -# 2. 创建索引(如果还没有) -codexlens init ~/projects/your-project - -# 3. 生成嵌入 -python generate_embeddings.py ~/.codexlens/indexes/your-project/_index.db - -# 4. 测试搜索 -codexlens search "your natural language query" --mode hybrid -``` - ---- - -**问题解决**: ✓ 已识别并提供解决方案 -**架构评估**: ✓ 并行架构优秀,充分发挥效能 -**优化建议**: ✓ 提供短期、中期、长期优化路线 - -**联系**: 详见 `SEARCH_COMPARISON_ANALYSIS.md` 获取完整技术细节 diff --git a/codex-lens/docs/SEARCH_COMPARISON_ANALYSIS.md b/codex-lens/docs/SEARCH_COMPARISON_ANALYSIS.md deleted file mode 100644 index 9f2e66c9..00000000 --- a/codex-lens/docs/SEARCH_COMPARISON_ANALYSIS.md +++ /dev/null @@ -1,711 +0,0 @@ -# CodexLens 搜索模式对比分析报告 - -**生成时间**: 2025-12-16 -**分析目标**: 对比向量搜索和混合搜索效果,诊断向量搜索返回空结果的原因,评估并行架构效能 - ---- - -## 执行摘要 - -通过深入的代码分析和实验测试,我们发现了向量搜索在当前实现中的几个关键问题,并提供了针对性的优化方案。 - -### 核心发现 - -1. **向量搜索返回空结果的根本原因**:缺少向量嵌入数据(semantic_chunks表为空) -2. 
**混合搜索架构设计优秀**:使用了双层并行架构,性能表现良好 -3. **向量搜索模式的语义问题**:"vector模式"实际上总是包含exact搜索,不是纯向量搜索 - ---- - -## 1. 问题诊断 - -### 1.1 向量索引数据库位置 - -**存储架构**: -- **位置**: 向量数据集成存储在SQLite索引文件中(`_index.db`) -- **表名**: `semantic_chunks` -- **字段结构**: - - `id`: 主键 - - `file_path`: 文件路径 - - `content`: 代码块内容 - - `embedding`: 向量嵌入(BLOB格式,numpy float32数组) - - `metadata`: JSON格式元数据 - - `created_at`: 创建时间 - -**默认存储路径**: -- 全局索引: `~/.codexlens/indexes/` -- 项目索引: `项目目录/.codexlens/` -- 每个目录一个 `_index.db` 文件 - -**为什么没有看到向量数据库**: -向量数据不是独立数据库,而是与FTS索引共存于同一个SQLite文件中的`semantic_chunks`表。如果该表不存在或为空,说明从未生成过向量嵌入。 - -### 1.2 向量搜索返回空结果的原因 - -**代码分析** (`hybrid_search.py:195-253`): - -```python -def _search_vector(self, index_path: Path, query: str, limit: int) -> List[SearchResult]: - try: - # 检查1: semantic_chunks表是否存在 - conn = sqlite3.connect(index_path) - cursor = conn.execute( - "SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_chunks'" - ) - has_semantic_table = cursor.fetchone() is not None - conn.close() - - if not has_semantic_table: - self.logger.debug("No semantic_chunks table found") - return [] # ❌ 返回空列表 - - # 检查2: 向量存储是否有数据 - vector_store = VectorStore(index_path) - if vector_store.count_chunks() == 0: - self.logger.debug("Vector store is empty") - return [] # ❌ 返回空列表 - - # 正常向量搜索流程... - except Exception as exc: - return [] # ❌ 异常也返回空列表 -``` - -**失败路径**: -1. `semantic_chunks`表不存在 → 返回空 -2. 表存在但无数据 → 返回空 -3. 语义搜索依赖未安装 → 返回空 -4. 任何异常 → 返回空 - -**当前状态诊断**: -通过测试验证,当前项目中: -- ✗ `semantic_chunks`表不存在 -- ✗ 未执行向量嵌入生成流程 -- ✗ 向量索引从未创建 - -**解决方案**:需要执行向量嵌入生成流程(见第3节) - -### 1.3 混合搜索 vs 向量搜索的实际行为 - -**重要发现**:当前实现中,"vector模式"并非纯向量搜索。 - -**代码证据** (`hybrid_search.py:72-77`): - -```python -def search(self, ...): - # Determine which backends to use - backends = {"exact": True} # ⚠️ exact搜索总是启用! 
- if enable_fuzzy: - backends["fuzzy"] = True - if enable_vector: - backends["vector"] = True -``` - -**影响**: -- 即使设置为"vector模式"(`enable_fuzzy=False, enable_vector=True`),exact搜索仍然运行 -- 当向量搜索返回空时,RRF融合仍会包含exact搜索的结果 -- 这导致"向量搜索"在没有嵌入数据时仍返回结果(来自exact FTS) - -**测试验证**: -``` -测试场景:有FTS索引但无向量嵌入 -查询:"authentication" - -预期行为(纯向量模式): - - 向量搜索: 0 结果(无嵌入数据) - - 最终结果: 0 - -实际行为: - - 向量搜索: 0 结果 - - Exact搜索: 3 结果 ✓ (总是运行) - - 最终结果: 3(来自exact,经过RRF) -``` - -**设计建议**: -1. **选项A(推荐)**: 添加纯向量模式标志 - ```python - backends = {} - if enable_vector and not pure_vector_mode: - backends["exact"] = True # 向量搜索的后备方案 - elif not enable_vector: - backends["exact"] = True # 非向量模式总是启用exact - ``` - -2. **选项B**: 文档明确说明当前行为 - - "vector模式"实际是"vector+exact混合模式" - - 提供警告信息当向量搜索返回空时 - ---- - -## 2. 并行架构分析 - -### 2.1 双层并行设计 - -CodexLens采用了优秀的双层并行架构: - -**第一层:搜索方法级并行** (`HybridSearchEngine`) - -```python -def _search_parallel(self, index_path, query, backends, limit): - with ThreadPoolExecutor(max_workers=len(backends)) as executor: - # 并行提交搜索任务 - if backends.get("exact"): - future = executor.submit(self._search_exact, ...) - if backends.get("fuzzy"): - future = executor.submit(self._search_fuzzy, ...) - if backends.get("vector"): - future = executor.submit(self._search_vector, ...) 
- - # 收集结果 - for future in as_completed(future_to_source): - results = future.result() -``` - -**特点**: -- 在**单个索引**内,exact/fuzzy/vector三种搜索方法并行执行 -- 使用`ThreadPoolExecutor`实现I/O密集型任务并行 -- 使用`as_completed`实现结果流式收集 -- 动态worker数量(与启用的backend数量相同) - -**性能测试结果**: -``` -搜索模式 | 平均延迟 | 相对overhead ------------|----------|------------- -Exact only | 5.6ms | 1.0x (基线) -Fuzzy only | 7.7ms | 1.4x -Vector only| 7.4ms | 1.3x -Hybrid (all)| 9.0ms | 1.6x -``` - -**分析**: -- ✓ Hybrid模式开销合理(<2x),证明并行有效 -- ✓ 单次搜索延迟仍保持在10ms以下(优秀) - -**第二层:索引级并行** (`ChainSearchEngine`) - -```python -def _search_parallel(self, index_paths, query, options): - executor = self._get_executor(options.max_workers) - - # 为每个索引提交搜索任务 - future_to_path = { - executor.submit( - self._search_single_index, - idx_path, query, ... - ): idx_path - for idx_path in index_paths - } - - # 收集所有索引的结果 - for future in as_completed(future_to_path): - results = future.result() - all_results.extend(results) -``` - -**特点**: -- 跨**多个目录索引**并行搜索 -- 共享线程池(避免线程创建开销) -- 可配置worker数量(默认8) -- 结果去重和RRF融合 - -### 2.2 并行效能评估 - -**优势**: -1. ✓ **架构清晰**:双层并行职责明确,互不干扰 -2. ✓ **资源利用**:I/O密集型任务充分利用线程池 -3. ✓ **扩展性**:易于添加新的搜索后端 -4. ✓ **容错性**:单个后端失败不影响其他后端 - -**当前利用率**: -- 单索引搜索:并行度 = min(3, 启用的backend数量) -- 多索引搜索:并行度 = min(8, 索引数量) -- **充分发挥**:只要有多个索引或多个backend - -**潜在优化点**: -1. **CPU密集型任务**:向量相似度计算已使用numpy向量化,无需额外并行 -2. **缓存优化**:`VectorStore`已实现embedding matrix缓存,性能良好 -3. **动态worker调度**:当前固定worker数,可根据任务负载动态调整 - ---- - -## 3. 
解决方案与优化建议 - -### 3.1 立即修复:生成向量嵌入 - -**步骤1:安装语义搜索依赖** - -```bash -# 方式A:完整安装 -pip install codexlens[semantic] - -# 方式B:手动安装依赖 -pip install fastembed numpy -``` - -**步骤2:创建向量索引脚本** - -保存为 `scripts/generate_embeddings.py`: - -```python -"""Generate vector embeddings for existing indexes.""" - -import logging -import sqlite3 -from pathlib import Path - -from codexlens.semantic.embedder import Embedder -from codexlens.semantic.vector_store import VectorStore -from codexlens.semantic.chunker import Chunker, ChunkConfig - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - - -def generate_embeddings_for_index(index_db_path: Path): - """Generate embeddings for all files in an index.""" - logger.info(f"Processing index: {index_db_path}") - - # Initialize components - embedder = Embedder(profile="code") # Use code-optimized model - vector_store = VectorStore(index_db_path) - chunker = Chunker(config=ChunkConfig(max_chunk_size=2000)) - - # Read files from index - with sqlite3.connect(index_db_path) as conn: - conn.row_factory = sqlite3.Row - cursor = conn.execute("SELECT full_path, content, language FROM files") - files = cursor.fetchall() - - logger.info(f"Found {len(files)} files to process") - - # Process each file - total_chunks = 0 - for file_row in files: - file_path = file_row["full_path"] - content = file_row["content"] - language = file_row["language"] or "python" - - try: - # Create chunks - chunks = chunker.chunk_sliding_window( - content, - file_path=file_path, - language=language - ) - - if not chunks: - logger.debug(f"No chunks created for {file_path}") - continue - - # Generate embeddings - for chunk in chunks: - embedding = embedder.embed_single(chunk.content) - chunk.embedding = embedding - - # Store chunks - vector_store.add_chunks(chunks, file_path) - total_chunks += len(chunks) - logger.info(f"✓ {file_path}: {len(chunks)} chunks") - - except Exception as exc: - logger.error(f"✗ {file_path}: {exc}") - - logger.info(f"Completed: 
{total_chunks} total chunks indexed") - return total_chunks - - -def main(): - import sys - - if len(sys.argv) < 2: - print("Usage: python generate_embeddings.py ") - print("Example: python generate_embeddings.py ~/.codexlens/indexes/project/_index.db") - sys.exit(1) - - index_path = Path(sys.argv[1]) - - if not index_path.exists(): - print(f"Error: Index not found at {index_path}") - sys.exit(1) - - generate_embeddings_for_index(index_path) - - -if __name__ == "__main__": - main() -``` - -**步骤3:执行生成** - -```bash -# 为特定项目生成嵌入 -python scripts/generate_embeddings.py ~/.codexlens/indexes/codex-lens/_index.db - -# 或使用find批量处理 -find ~/.codexlens/indexes -name "_index.db" -type f | while read db; do - python scripts/generate_embeddings.py "$db" -done -``` - -**步骤4:验证生成结果** - -```bash -# 检查semantic_chunks表 -sqlite3 ~/.codexlens/indexes/codex-lens/_index.db \ - "SELECT COUNT(*) as chunk_count FROM semantic_chunks" - -# 测试向量搜索 -codexlens search "authentication user credentials" \ - --path ~/projects/codex-lens \ - --mode vector -``` - -### 3.2 短期优化:改进向量搜索语义 - -**问题**:当前"vector模式"实际包含exact搜索,语义不清晰 - -**解决方案**:添加`pure_vector`参数 - -**实现** (修改 `hybrid_search.py`): - -```python -class HybridSearchEngine: - def search( - self, - index_path: Path, - query: str, - limit: int = 20, - enable_fuzzy: bool = True, - enable_vector: bool = False, - pure_vector: bool = False, # 新增参数 - ) -> List[SearchResult]: - """Execute hybrid search with parallel retrieval and RRF fusion. - - Args: - ... - pure_vector: If True, only use vector search (no FTS fallback) - """ - # Determine which backends to use - backends = {} - - if pure_vector: - # 纯向量模式:只使用向量搜索 - if enable_vector: - backends["vector"] = True - else: - # 混合模式:总是包含exact搜索作为基线 - backends["exact"] = True - if enable_fuzzy: - backends["fuzzy"] = True - if enable_vector: - backends["vector"] = True - - # ... rest of the method -``` - -**CLI更新** (修改 `commands.py`): - -```python -@app.command() -def search( - ... 
- mode: str = typer.Option("exact", "--mode", "-m", - help="Search mode: exact, fuzzy, hybrid, vector, pure-vector."), - ... -): - """... - Search Modes: - - exact: Exact FTS - - fuzzy: Fuzzy FTS - - hybrid: RRF fusion of exact + fuzzy + vector (recommended) - - vector: Vector search with exact FTS fallback - - pure-vector: Pure semantic vector search (no FTS fallback) - """ - ... - - # Map mode to options - if mode == "exact": - hybrid_mode, enable_fuzzy, enable_vector, pure_vector = False, False, False, False - elif mode == "fuzzy": - hybrid_mode, enable_fuzzy, enable_vector, pure_vector = False, True, False, False - elif mode == "vector": - hybrid_mode, enable_fuzzy, enable_vector, pure_vector = True, False, True, False - elif mode == "pure-vector": - hybrid_mode, enable_fuzzy, enable_vector, pure_vector = True, False, True, True - elif mode == "hybrid": - hybrid_mode, enable_fuzzy, enable_vector, pure_vector = True, True, True, False -``` - -### 3.3 中期优化:增强向量搜索效果 - -**优化1:改进分块策略** - -当前使用简单的滑动窗口,可优化为: - -```python -class HybridChunker(Chunker): - """Hybrid chunking strategy combining symbol-based and sliding window.""" - - def chunk_hybrid( - self, - content: str, - symbols: List[Symbol], - file_path: str, - language: str, - ) -> List[SemanticChunk]: - """ - 1. 优先按symbol分块(函数、类级别) - 2. 对过大symbol,进一步使用滑动窗口 - 3. 
对symbol间隙,使用滑动窗口补充 - """ - chunks = [] - - # Step 1: Symbol-based chunks - symbol_chunks = self.chunk_by_symbol(content, symbols, file_path, language) - - # Step 2: Split oversized symbols - for chunk in symbol_chunks: - if chunk.token_count > self.config.max_chunk_size: - # 使用滑动窗口进一步分割 - sub_chunks = self._split_large_chunk(chunk) - chunks.extend(sub_chunks) - else: - chunks.append(chunk) - - # Step 3: Fill gaps with sliding window - gap_chunks = self._chunk_gaps(content, symbols, file_path, language) - chunks.extend(gap_chunks) - - return chunks -``` - -**优化2:添加查询扩展** - -```python -class QueryExpander: - """Expand queries for better vector search recall.""" - - def expand(self, query: str) -> str: - """Expand query with synonyms and related terms.""" - # 示例:代码领域同义词 - expansions = { - "auth": ["authentication", "authorization", "login"], - "db": ["database", "storage", "repository"], - "api": ["endpoint", "route", "interface"], - } - - terms = query.lower().split() - expanded = set(terms) - - for term in terms: - if term in expansions: - expanded.update(expansions[term]) - - return " ".join(expanded) -``` - -**优化3:混合检索策略** - -```python -class AdaptiveHybridSearch: - """Adaptive search strategy based on query type.""" - - def search(self, query: str, ...): - # 分析查询类型 - query_type = self._classify_query(query) - - if query_type == "keyword": - # 代码标识符查询 → 偏重FTS - weights = {"exact": 0.5, "fuzzy": 0.3, "vector": 0.2} - elif query_type == "semantic": - # 自然语言查询 → 偏重向量 - weights = {"exact": 0.2, "fuzzy": 0.2, "vector": 0.6} - elif query_type == "hybrid": - # 混合查询 → 平衡权重 - weights = {"exact": 0.4, "fuzzy": 0.3, "vector": 0.3} - - return self.engine.search(query, weights=weights, ...) -``` - -### 3.4 长期优化:性能与质量提升 - -**优化1:增量嵌入更新** - -```python -class IncrementalEmbeddingUpdater: - """Update embeddings incrementally for changed files.""" - - def update_for_file(self, file_path: str, new_content: str): - """Only regenerate embeddings for changed file.""" - # 1. 
删除旧嵌入 - self.vector_store.delete_file_chunks(file_path) - - # 2. 生成新嵌入 - chunks = self.chunker.chunk(new_content, ...) - for chunk in chunks: - chunk.embedding = self.embedder.embed_single(chunk.content) - - # 3. 存储新嵌入 - self.vector_store.add_chunks(chunks, file_path) -``` - -**优化2:向量索引压缩** - -```python -# 使用量化技术减少存储空间(768维 → 192维) -from qdrant_client import models - -# 产品量化(PQ)压缩 -compressed_vector = pq_quantize(embedding, target_dim=192) -``` - -**优化3:向量搜索加速** - -```python -# 使用FAISS或Hnswlib替代numpy暴力搜索 -import faiss - -class FAISSVectorStore(VectorStore): - def __init__(self, db_path, dim=768): - super().__init__(db_path) - # 使用HNSW索引 - self.index = faiss.IndexHNSWFlat(dim, 32) - self._load_vectors_to_index() - - def search_similar(self, query_embedding, top_k=10): - # FAISS加速搜索(100x+) - scores, indices = self.index.search( - np.array([query_embedding]), top_k - ) - return self._fetch_by_indices(indices[0], scores[0]) -``` - ---- - -## 4. 对比总结 - -### 4.1 搜索模式对比 - -| 维度 | Exact FTS | Fuzzy FTS | Vector Search | Hybrid (推荐) | -|------|-----------|-----------|---------------|--------------| -| **匹配类型** | 精确词匹配 | 容错匹配 | 语义相似 | 多模式融合 | -| **查询类型** | 标识符、关键词 | 拼写错误容忍 | 自然语言 | 所有类型 | -| **召回率** | 中 | 高 | 最高 | 最高 | -| **精确率** | 高 | 中 | 中 | 高 | -| **延迟** | 5-7ms | 7-9ms | 7-10ms | 9-11ms | -| **依赖** | 仅SQLite | 仅SQLite | fastembed+numpy | 全部 | -| **存储开销** | 小(FTS索引) | 小(FTS索引) | 大(向量) | 大(FTS+向量) | -| **适用场景** | 代码搜索 | 容错搜索 | 概念搜索 | 通用搜索 | - -### 4.2 推荐使用策略 - -**场景1:代码标识符搜索**(函数名、类名、变量名) -```bash -codexlens search "authenticate_user" --mode exact -``` -→ 使用exact模式,最快且最精确 - -**场景2:概念性搜索**("如何验证用户身份") -```bash -codexlens search "how to verify user credentials" --mode hybrid -``` -→ 使用hybrid模式,结合语义和关键词 - -**场景3:容错搜索**(允许拼写错误) -```bash -codexlens search "autheticate" --mode fuzzy -``` -→ 使用fuzzy模式,trigram容错 - -**场景4:纯语义搜索**(需先生成嵌入) -```bash -codexlens search "password encryption with salt" --mode pure-vector -``` -→ 使用pure-vector模式,理解语义意图 - ---- - -## 5. 
实施检查清单 - -### 立即行动项 (P0) - -- [ ] 安装语义搜索依赖:`pip install codexlens[semantic]` -- [ ] 运行嵌入生成脚本(见3.1节) -- [ ] 验证semantic_chunks表已创建且有数据 -- [ ] 测试vector模式搜索是否返回结果 - -### 短期改进 (P1) - -- [ ] 添加pure_vector参数(见3.2节) -- [ ] 更新CLI支持pure-vector模式 -- [ ] 添加嵌入生成进度提示 -- [ ] 文档更新:搜索模式使用指南 - -### 中期优化 (P2) - -- [ ] 实现混合分块策略(见3.3节) -- [ ] 添加查询扩展功能 -- [ ] 实现自适应权重调整 -- [ ] 性能基准测试 - -### 长期规划 (P3) - -- [ ] 增量嵌入更新机制 -- [ ] 向量索引压缩 -- [ ] 集成FAISS加速 -- [ ] 多模态搜索(代码+文档) - ---- - -## 6. 参考资源 - -### 代码文件 - -- 混合搜索引擎: `codex-lens/src/codexlens/search/hybrid_search.py` -- 向量存储: `codex-lens/src/codexlens/semantic/vector_store.py` -- 向量嵌入: `codex-lens/src/codexlens/semantic/embedder.py` -- 代码分块: `codex-lens/src/codexlens/semantic/chunker.py` -- 链式搜索: `codex-lens/src/codexlens/search/chain_search.py` - -### 测试文件 - -- 对比测试: `codex-lens/tests/test_search_comparison.py` -- 混合搜索E2E: `codex-lens/tests/test_hybrid_search_e2e.py` -- CLI测试: `codex-lens/tests/test_cli_hybrid_search.py` - -### 相关文档 - -- RRF算法: `codex-lens/src/codexlens/search/ranking.py` -- 查询解析: `codex-lens/src/codexlens/search/query_parser.py` -- 配置管理: `codex-lens/src/codexlens/config.py` - ---- - -## 7. 结论 - -通过本次深入分析,我们明确了CodexLens搜索系统的优势和待优化点: - -**优势**: -1. ✓ 优秀的并行架构设计(双层并行) -2. ✓ RRF融合算法实现合理 -3. ✓ 向量存储实现高效(numpy向量化+缓存) -4. ✓ 模块化设计,易于扩展 - -**待优化**: -1. 向量嵌入生成流程需要手动触发 -2. "vector模式"语义不清晰(实际包含exact搜索) -3. 分块策略可以优化(混合策略) -4. 缺少增量更新机制 - -**核心建议**: -1. **立即**: 生成向量嵌入,解决返回空结果问题 -2. **短期**: 添加纯向量模式,澄清语义 -3. **中期**: 优化分块和查询策略,提升搜索质量 -4. **长期**: 性能优化和高级特性 - -通过实施这些改进,CodexLens的搜索功能将达到生产级别的质量和性能标准。 - ---- - -**报告完成时间**: 2025-12-16 -**分析工具**: 代码静态分析 + 实验测试 + 性能测评 -**下一步**: 实施P0优先级改进项 diff --git a/codex-lens/docs/SEMANTIC_GRAPH_DESIGN.md b/codex-lens/docs/SEMANTIC_GRAPH_DESIGN.md deleted file mode 100644 index 709bd1ba..00000000 --- a/codex-lens/docs/SEMANTIC_GRAPH_DESIGN.md +++ /dev/null @@ -1,1113 +0,0 @@ -# 静态分析语义图谱设计方案 - -## 1. 
背景与目标 - -### 1.1 当前问题 - -现有的 `llm_enhancer.py` 对代码的分析是**孤立的、原子化的**: -- 每个函数/类被视为独立单元 -- 无法识别函数调用关系 -- 无法追踪数据流 -- 无法理解模块依赖 - -这导致无法回答以下类型的问题: -- "修改这个函数会影响哪些模块?" -- "这个API的完整数据流路径是什么?" -- "找出所有操作User实体的写入方法" -- "哪些函数依赖这个配置参数?" - -### 1.2 设计目标 - -构建**代码语义图谱**(Code Semantic Graph),实现: - -1. **调用关系分析**:函数/方法调用图(Call Graph) -2. **数据流追踪**:变量定义、使用、传递路径 -3. **依赖关系管理**:模块/类/包之间的依赖 -4. **实体关系映射**:识别数据模型及其操作方法 -5. **LLM增强的语义理解**:结合静态分析和LLM,理解调用的"意图" - -## 2. 技术架构 - -### 2.1 整体架构 - -``` -Source Code Files - ↓ -[Static Analysis Layer] - ├─ AST Parsing (tree-sitter) - ├─ Call Graph Extraction - ├─ Data Flow Analysis - └─ Dependency Resolution - ↓ -[Graph Construction Layer] - ├─ Node Creation (Functions/Classes/Modules) - ├─ Edge Creation (Calls/Imports/DataFlow) - └─ Graph Storage (SQLite/Neo4j) - ↓ -[LLM Enhancement Layer] - ├─ Relationship Semantics - ├─ Intent Analysis - └─ Pattern Recognition - ↓ -[Query & Reasoning Layer] - ├─ Graph Traversal - ├─ Impact Analysis - └─ Semantic Search Integration -``` - -### 2.2 核心组件 - -#### 2.2.1 图节点类型(Nodes) - -```python -from enum import Enum -from dataclasses import dataclass -from typing import List, Optional, Set - -class NodeType(Enum): - """节点类型""" - MODULE = "module" # 模块/文件 - CLASS = "class" # 类 - FUNCTION = "function" # 函数 - METHOD = "method" # 方法 - VARIABLE = "variable" # 变量 - PARAMETER = "parameter" # 参数 - DATA_MODEL = "data_model" # 数据模型(识别出的实体类) - -@dataclass -class CodeNode: - """代码图节点""" - node_id: str # 唯一标识:file:line:name - node_type: NodeType - name: str - qualified_name: str # 完全限定名:module.class.method - file_path: str - start_line: int - end_line: int - - # 静态分析元数据 - signature: Optional[str] = None # 函数/方法签名 - docstring: Optional[str] = None - modifiers: Set[str] = None # public/private/static等 - - # LLM生成的语义元数据 - summary: Optional[str] = None - purpose: Optional[str] = None - tags: List[str] = None # 如:crud, validation, auth -``` - -#### 2.2.2 图边类型(Edges) - -```python -class EdgeType(Enum): - """边类型""" - CALLS = 
"calls" # A调用B - IMPORTS = "imports" # A导入B - INHERITS = "inherits" # A继承B - IMPLEMENTS = "implements" # A实现B(接口) - USES_VARIABLE = "uses_variable" # A使用变量B - DEFINES_VARIABLE = "defines_variable" # A定义变量B - PASSES_DATA = "passes_data" # A向B传递数据 - MODIFIES = "modifies" # A修改B(如数据库写入) - READS = "reads" # A读取B(如数据库查询) - -@dataclass -class CodeEdge: - """代码图边""" - edge_id: str - source_id: str # 源节点ID - target_id: str # 目标节点ID - edge_type: EdgeType - - # 边的上下文信息 - context: Optional[str] = None # 调用发生的代码片段 - line_number: Optional[int] = None # 调用所在行号 - - # LLM生成的语义 - semantic_intent: Optional[str] = None # 如"验证用户权限" - confidence: float = 1.0 # 置信度 -``` - -## 3. 详细实现步骤 - -### 3.1 静态分析引擎 - -#### 3.1.1 AST解析与符号提取 - -```python -from tree_sitter import Language, Parser -from pathlib import Path -from typing import Dict, List - -class ASTAnalyzer: - """基于tree-sitter的AST分析器""" - - def __init__(self, language: str): - self.language = language - self.parser = Parser() - # 加载语言grammar - - def extract_symbols(self, content: str, file_path: str) -> List[CodeNode]: - """提取所有符号定义""" - - tree = self.parser.parse(bytes(content, 'utf-8')) - root = tree.root_node - - symbols = [] - self._traverse_definitions(root, content, file_path, symbols) - return symbols - - def _traverse_definitions( - self, - node, - content: str, - file_path: str, - result: List[CodeNode], - parent_class: str = None - ): - """递归遍历提取定义""" - - if node.type == 'function_definition': - func_node = self._create_function_node(node, content, file_path) - result.append(func_node) - - elif node.type == 'class_definition': - class_node = self._create_class_node(node, content, file_path) - result.append(class_node) - - # 遍历类内部的方法 - for child in node.children: - if child.type == 'block': - for method in child.children: - if method.type == 'function_definition': - method_node = self._create_method_node( - method, content, file_path, class_node.name - ) - result.append(method_node) - - # 递归遍历子节点 - for child in node.children: 
- self._traverse_definitions( - child, content, file_path, result, parent_class - ) - - def _create_function_node(self, node, content: str, file_path: str) -> CodeNode: - """创建函数节点""" - - name_node = node.child_by_field_name('name') - func_name = content[name_node.start_byte:name_node.end_byte] - - # 提取参数列表 - params_node = node.child_by_field_name('parameters') - signature = content[params_node.start_byte:params_node.end_byte] - - # 提取docstring - docstring = self._extract_docstring(node, content) - - return CodeNode( - node_id=f"{file_path}:{node.start_point[0]}:{func_name}", - node_type=NodeType.FUNCTION, - name=func_name, - qualified_name=f"{Path(file_path).stem}.{func_name}", - file_path=file_path, - start_line=node.start_point[0] + 1, - end_line=node.end_point[0] + 1, - signature=f"{func_name}{signature}", - docstring=docstring, - ) - - def _extract_docstring(self, node, content: str) -> Optional[str]: - """提取docstring""" - - # 查找函数体的第一个表达式语句 - body = node.child_by_field_name('body') - if not body: - return None - - for child in body.children: - if child.type == 'expression_statement': - expr = child.children[0] - if expr.type == 'string': - # 提取字符串内容(去掉引号) - doc = content[expr.start_byte:expr.end_byte] - return doc.strip('"""').strip("'''").strip() - - return None -``` - -#### 3.1.2 调用图提取 - -```python -class CallGraphExtractor: - """调用图提取器""" - - def __init__(self, ast_analyzer: ASTAnalyzer): - self.ast_analyzer = ast_analyzer - - def extract_calls( - self, - content: str, - file_path: str, - symbols: List[CodeNode] - ) -> List[CodeEdge]: - """提取函数调用关系""" - - tree = self.ast_analyzer.parser.parse(bytes(content, 'utf-8')) - calls = [] - - # 为每个函数/方法提取其内部的调用 - for symbol in symbols: - if symbol.node_type in [NodeType.FUNCTION, NodeType.METHOD]: - symbol_calls = self._extract_calls_in_function( - tree, symbol, content, file_path - ) - calls.extend(symbol_calls) - - return calls - - def _extract_calls_in_function( - self, - tree, - caller: CodeNode, - content: 
str, - file_path: str - ) -> List[CodeEdge]: - """提取单个函数内的所有调用""" - - # 定位到函数的AST节点 - func_node = self._find_node_by_line(tree.root_node, caller.start_line) - if not func_node: - return [] - - calls = [] - self._traverse_calls(func_node, caller, content, file_path, calls) - return calls - - def _traverse_calls( - self, - node, - caller: CodeNode, - content: str, - file_path: str, - result: List[CodeEdge] - ): - """递归遍历查找call表达式""" - - if node.type == 'call': - # 提取被调用的函数名 - function_node = node.child_by_field_name('function') - callee_name = content[function_node.start_byte:function_node.end_byte] - - # 提取调用的上下文(所在行) - call_line = node.start_point[0] + 1 - line_content = content.splitlines()[node.start_point[0]] - - edge = CodeEdge( - edge_id=f"{caller.node_id}→{callee_name}:{call_line}", - source_id=caller.node_id, - target_id=callee_name, # 暂时用名称,后续需要解析 - edge_type=EdgeType.CALLS, - context=line_content.strip(), - line_number=call_line, - ) - result.append(edge) - - # 递归遍历 - for child in node.children: - self._traverse_calls(child, caller, content, file_path, result) - - def _find_node_by_line(self, node, target_line: int): - """根据行号查找AST节点""" - - if node.start_point[0] + 1 == target_line: - return node - - for child in node.children: - result = self._find_node_by_line(child, target_line) - if result: - return result - - return None -``` - -#### 3.1.3 名称解析(Name Resolution) - -```python -class NameResolver: - """将函数调用的名称解析为具体的符号定义""" - - def __init__(self, symbol_table: Dict[str, CodeNode]): - """ - symbol_table: 符号表,映射 qualified_name -> CodeNode - """ - self.symbol_table = symbol_table - - def resolve_call_target( - self, - call_edge: CodeEdge, - caller_context: CodeNode - ) -> Optional[str]: - """ - 解析调用目标的完整node_id - - 策略: - 1. 检查是否是本地函数调用(同文件) - 2. 检查是否是导入的模块函数 - 3. 
检查是否是方法调用(self.method) - """ - - callee_name = call_edge.target_id - - # 策略1: 本地调用(同文件) - local_qualified = f"{Path(caller_context.file_path).stem}.{callee_name}" - if local_qualified in self.symbol_table: - return self.symbol_table[local_qualified].node_id - - # 策略2: 方法调用(提取对象名) - if '.' in callee_name: - parts = callee_name.split('.') - if parts[0] == 'self': - # self.method_name -> 在当前类中查找 - method_name = parts[1] - # 需要找到caller所属的类 - class_name = self._find_containing_class(caller_context) - if class_name: - class_qualified = f"{Path(caller_context.file_path).stem}.{class_name}.{method_name}" - if class_qualified in self.symbol_table: - return self.symbol_table[class_qualified].node_id - - # 策略3: 导入的函数(需要扫描import语句) - # TODO: 实现跨文件的导入解析 - - return None - - def _find_containing_class(self, node: CodeNode) -> Optional[str]: - """找到函数/方法所属的类""" - # 通过qualified_name推断 - parts = node.qualified_name.split('.') - if len(parts) > 2: # module.class.method - return parts[-2] - return None -``` - -### 3.2 图存储与索引 - -#### 3.2.1 数据库Schema(SQLite版本) - -```sql --- 节点表 -CREATE TABLE code_nodes ( - node_id TEXT PRIMARY KEY, - node_type TEXT NOT NULL, -- module/class/function/method/variable - name TEXT NOT NULL, - qualified_name TEXT NOT NULL UNIQUE, - file_path TEXT NOT NULL, - start_line INTEGER NOT NULL, - end_line INTEGER NOT NULL, - - -- 静态分析元数据 - signature TEXT, - docstring TEXT, - modifiers TEXT, -- JSON数组 - - -- LLM语义元数据 - summary TEXT, - purpose TEXT, - tags TEXT, -- JSON数组 - - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP -); - --- 边表 -CREATE TABLE code_edges ( - edge_id TEXT PRIMARY KEY, - source_id TEXT NOT NULL, - target_id TEXT NOT NULL, - edge_type TEXT NOT NULL, -- calls/imports/inherits/uses_variable等 - - -- 上下文 - context TEXT, - line_number INTEGER, - - -- LLM语义 - semantic_intent TEXT, - confidence REAL DEFAULT 1.0, - - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - - FOREIGN KEY (source_id) REFERENCES 
code_nodes(node_id) ON DELETE CASCADE, - FOREIGN KEY (target_id) REFERENCES code_nodes(node_id) ON DELETE CASCADE -); - --- 索引 -CREATE INDEX idx_nodes_type ON code_nodes(node_type); -CREATE INDEX idx_nodes_file ON code_nodes(file_path); -CREATE INDEX idx_nodes_qualified ON code_nodes(qualified_name); - -CREATE INDEX idx_edges_source ON code_edges(source_id); -CREATE INDEX idx_edges_target ON code_edges(target_id); -CREATE INDEX idx_edges_type ON code_edges(edge_type); - --- 用于快速查找调用关系 -CREATE INDEX idx_edges_source_type ON code_edges(source_id, edge_type); -CREATE INDEX idx_edges_target_type ON code_edges(target_id, edge_type); -``` - -#### 3.2.2 图存储接口 - -```python -import sqlite3 -from typing import List, Optional, Set - -class CodeGraphStore: - """代码图谱存储""" - - def __init__(self, db_path: Path): - self.db_path = db_path - self.conn = sqlite3.connect(db_path) - self._create_tables() - - def add_node(self, node: CodeNode): - """添加节点""" - cursor = self.conn.cursor() - cursor.execute(""" - INSERT OR REPLACE INTO code_nodes ( - node_id, node_type, name, qualified_name, - file_path, start_line, end_line, - signature, docstring, summary, purpose - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - """, ( - node.node_id, node.node_type.value, node.name, - node.qualified_name, node.file_path, - node.start_line, node.end_line, - node.signature, node.docstring, - node.summary, node.purpose - )) - self.conn.commit() - - def add_edge(self, edge: CodeEdge): - """添加边""" - cursor = self.conn.cursor() - cursor.execute(""" - INSERT OR REPLACE INTO code_edges ( - edge_id, source_id, target_id, edge_type, - context, line_number, semantic_intent, confidence - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?) 
- """, ( - edge.edge_id, edge.source_id, edge.target_id, - edge.edge_type.value, edge.context, edge.line_number, - edge.semantic_intent, edge.confidence - )) - self.conn.commit() - - def get_node(self, node_id: str) -> Optional[CodeNode]: - """获取节点""" - cursor = self.conn.cursor() - cursor.execute("SELECT * FROM code_nodes WHERE node_id = ?", (node_id,)) - row = cursor.fetchone() - return self._row_to_node(row) if row else None - - def get_callers(self, node_id: str) -> List[CodeNode]: - """获取所有调用该节点的节点(反向查询)""" - cursor = self.conn.cursor() - cursor.execute(""" - SELECT n.* FROM code_nodes n - JOIN code_edges e ON n.node_id = e.source_id - WHERE e.target_id = ? AND e.edge_type = 'calls' - """, (node_id,)) - - return [self._row_to_node(row) for row in cursor.fetchall()] - - def get_callees(self, node_id: str) -> List[CodeNode]: - """获取该节点调用的所有节点(正向查询)""" - cursor = self.conn.cursor() - cursor.execute(""" - SELECT n.* FROM code_nodes n - JOIN code_edges e ON n.node_id = e.target_id - WHERE e.source_id = ? 
AND e.edge_type = 'calls' - """, (node_id,)) - - return [self._row_to_node(row) for row in cursor.fetchall()] - - def get_call_chain( - self, - start_node_id: str, - max_depth: int = 5 - ) -> List[List[CodeNode]]: - """获取调用链(DFS遍历)""" - - visited = set() - chains = [] - - def dfs(node_id: str, path: List[CodeNode], depth: int): - if depth > max_depth or node_id in visited: - return - - visited.add(node_id) - node = self.get_node(node_id) - if not node: - return - - current_path = path + [node] - callees = self.get_callees(node_id) - - if not callees: - # 叶子节点,记录完整路径 - chains.append(current_path) - else: - for callee in callees: - dfs(callee.node_id, current_path, depth + 1) - - visited.remove(node_id) - - dfs(start_node_id, [], 0) - return chains -``` - -### 3.3 LLM语义增强 - -#### 3.3.1 关系语义分析 - -```python -class RelationshipSemanticAnalyzer: - """为代码关系生成语义描述""" - - def __init__(self, llm_enhancer: LLMEnhancer): - self.llm_enhancer = llm_enhancer - - def analyze_call_intent( - self, - edge: CodeEdge, - caller: CodeNode, - callee: CodeNode - ) -> str: - """分析函数调用的意图""" - - # 构建提示词 - prompt = f""" -PURPOSE: Analyze the intent of a function call relationship -TASK: Describe in 1 sentence WHY function A calls function B and what purpose it serves - -CONTEXT: -Caller Function: {caller.name} -Caller Summary: {caller.summary or 'N/A'} -Caller Code Snippet: -``` -{edge.context} -``` - -Callee Function: {callee.name} -Callee Summary: {callee.summary or 'N/A'} -Callee Signature: {callee.signature or 'N/A'} - -OUTPUT: A concise semantic description of the call intent. 
-Example: "validates user credentials before granting access" -""" - - response = self.llm_enhancer._invoke_ccw_cli(prompt, tool='gemini') - if response['success']: - intent = response['stdout'].strip() - return intent - - return "unknown intent" - - def batch_analyze_calls( - self, - edges: List[CodeEdge], - nodes_map: Dict[str, CodeNode] - ) -> Dict[str, str]: - """批量分析调用意图(优化LLM调用)""" - - # 构建批量prompt - call_descriptions = [] - for edge in edges: - caller = nodes_map.get(edge.source_id) - callee = nodes_map.get(edge.target_id) - if not caller or not callee: - continue - - desc = f""" -[CALL {edge.edge_id}] -From: {caller.name} ({caller.summary or 'no summary'}) -To: {callee.name} ({callee.summary or 'no summary'}) -Context: {edge.context} -""" - call_descriptions.append(desc) - - prompt = f""" -PURPOSE: Analyze multiple function call relationships and describe their intents -TASK: For each call, provide a 1-sentence semantic description - -{chr(10).join(call_descriptions)} - -OUTPUT FORMAT (JSON): -{{ - "edge_id_1": "intent description", - "edge_id_2": "intent description", - ... 
-}} -""" - - response = self.llm_enhancer._invoke_ccw_cli(prompt, tool='gemini') - if response['success']: - import json - intents = json.loads(self.llm_enhancer._extract_json(response['stdout'])) - return intents - - return {} -``` - -#### 3.3.2 数据模型识别 - -```python -class DataModelRecognizer: - """识别代码中的数据模型(实体类)""" - - def identify_data_models( - self, - class_nodes: List[CodeNode] - ) -> List[CodeNode]: - """识别哪些类是数据模型""" - - data_models = [] - - for class_node in class_nodes: - # 启发式规则 - is_model = False - - # 规则1: 类名包含Model/Entity/Schema - if any(keyword in class_node.name for keyword in ['Model', 'Entity', 'Schema']): - is_model = True - - # 规则2: 继承自ORM基类(需要分析继承关系) - # TODO: 检查是否继承 db.Model, BaseModel等 - - # 规则3: 让LLM判断 - if not is_model: - is_model = self._ask_llm_if_data_model(class_node) - - if is_model: - class_node.tags = class_node.tags or [] - class_node.tags.append('data_model') - data_models.append(class_node) - - return data_models - - def _ask_llm_if_data_model(self, class_node: CodeNode) -> bool: - """让LLM判断是否为数据模型""" - - prompt = f""" -Is this Python class a data model (entity class representing database table or data structure)? - -Class Definition: -```python -{class_node.docstring or ''} -class {class_node.name}: - # ... (signature: {class_node.signature}) -``` - -Answer with: YES or NO -""" - - # 调用LLM... 
- # 简化实现 - return False -``` - -### 3.4 图查询与推理 - -#### 3.4.1 影响分析(Impact Analysis) - -```python -class ImpactAnalyzer: - """代码变更影响分析""" - - def __init__(self, graph_store: CodeGraphStore): - self.graph_store = graph_store - - def analyze_function_impact( - self, - function_id: str, - max_depth: int = 10 - ) -> Dict[str, any]: - """分析修改某个函数的影响范围""" - - # 找到所有直接和间接调用者 - affected_functions = set() - self._traverse_callers(function_id, affected_functions, 0, max_depth) - - # 找到所有受影响的文件 - affected_files = set() - for func_id in affected_functions: - node = self.graph_store.get_node(func_id) - if node: - affected_files.add(node.file_path) - - return { - 'modified_function': function_id, - 'affected_functions': list(affected_functions), - 'affected_files': list(affected_files), - 'impact_scope': len(affected_functions), - } - - def _traverse_callers( - self, - node_id: str, - result: Set[str], - current_depth: int, - max_depth: int - ): - """递归遍历调用者""" - - if current_depth >= max_depth or node_id in result: - return - - callers = self.graph_store.get_callers(node_id) - for caller in callers: - result.add(caller.node_id) - self._traverse_callers(caller.node_id, result, current_depth + 1, max_depth) -``` - -#### 3.4.2 数据流追踪 - -```python -class DataFlowTracer: - """数据流路径追踪""" - - def __init__(self, graph_store: CodeGraphStore): - self.graph_store = graph_store - - def trace_variable_flow( - self, - variable_name: str, - start_function_id: str - ) -> List[Dict]: - """追踪变量的数据流""" - - # 查找所有使用该变量的边 - cursor = self.graph_store.conn.cursor() - cursor.execute(""" - SELECT * FROM code_edges - WHERE edge_type IN ('uses_variable', 'defines_variable', 'passes_data') - AND (source_id = ? OR target_id LIKE ?) 
- """, (start_function_id, f"%{variable_name}%")) - - flow_path = [] - for row in cursor.fetchall(): - edge = self._row_to_edge(row) - source = self.graph_store.get_node(edge.source_id) - target = self.graph_store.get_node(edge.target_id) - - flow_path.append({ - 'from': source.name if source else 'unknown', - 'to': target.name if target else 'unknown', - 'action': edge.edge_type.value, - 'context': edge.context, - }) - - return flow_path - - def find_crud_operations( - self, - entity_name: str - ) -> Dict[str, List[CodeNode]]: - """找到对某个实体的所有CRUD操作""" - - cursor = self.graph_store.conn.cursor() - - # 查找所有修改该实体的函数 - cursor.execute(""" - SELECT DISTINCT n.* FROM code_nodes n - JOIN code_edges e ON n.node_id = e.source_id - WHERE e.edge_type = 'modifies' - AND e.target_id LIKE ? - """, (f"%{entity_name}%",)) - - writers = [self._row_to_node(row) for row in cursor.fetchall()] - - # 查找所有读取该实体的函数 - cursor.execute(""" - SELECT DISTINCT n.* FROM code_nodes n - JOIN code_edges e ON n.node_id = e.source_id - WHERE e.edge_type = 'reads' - AND e.target_id LIKE ? - """, (f"%{entity_name}%",)) - - readers = [self._row_to_node(row) for row in cursor.fetchall()] - - return { - 'create': [w for w in writers if 'create' in w.name.lower()], - 'read': readers, - 'update': [w for w in writers if 'update' in w.name.lower()], - 'delete': [w for w in writers if 'delete' in w.name.lower()], - } -``` - -### 3.5 与语义搜索集成 - -#### 3.5.1 增强的搜索结果 - -```python -class GraphEnhancedSearchEngine: - """结合图谱的增强搜索""" - - def __init__( - self, - vector_search: VectorStore, - graph_store: CodeGraphStore - ): - self.vector_search = vector_search - self.graph_store = graph_store - - def search_with_graph_context( - self, - query: str, - top_k: int = 10 - ) -> List[EnhancedSearchResult]: - """带图谱上下文的搜索""" - - # 1. 向量搜索 - vector_results = self.vector_search.search(query, top_k=top_k) - - # 2. 
为每个结果添加图谱信息 - enhanced_results = [] - for result in vector_results: - # 找到对应的图节点 - node = self.graph_store.get_node(result.path) - if not node: - continue - - # 获取调用关系 - callers = self.graph_store.get_callers(node.node_id) - callees = self.graph_store.get_callees(node.node_id) - - enhanced = EnhancedSearchResult( - **result.dict(), - callers=[c.name for c in callers[:5]], - callees=[c.name for c in callees[:5]], - call_count_in=len(callers), - call_count_out=len(callees), - ) - enhanced_results.append(enhanced) - - return enhanced_results - - def search_by_relationship( - self, - query: str, - relationship_type: str # "calls", "called_by", "uses", etc. - ) -> List[CodeNode]: - """基于关系的搜索""" - - # 先找到查询匹配的节点 - vector_results = self.vector_search.search(query, top_k=5) - if not vector_results: - return [] - - target_node_id = vector_results[0].path - - # 根据关系类型查找相关节点 - if relationship_type == "calls": - return self.graph_store.get_callees(target_node_id) - elif relationship_type == "called_by": - return self.graph_store.get_callers(target_node_id) - # 其他关系类型... - - return [] -``` - -## 4. 实施路线图 - -### Phase 1: 基础静态分析(3-4周) -- [ ] 实现ASTAnalyzer(符号提取) -- [ ] 实现CallGraphExtractor(调用图提取) -- [ ] 实现NameResolver(名称解析) -- [ ] 设计图数据库schema -- [ ] 实现CodeGraphStore(基础CRUD) -- [ ] 单元测试 - -### Phase 2: 多语言支持(2周) -- [ ] Python完整支持 -- [ ] JavaScript/TypeScript支持 -- [ ] Java支持(可选) -- [ ] 跨语言测试 - -### Phase 3: LLM语义增强(2-3周) -- [ ] 实现RelationshipSemanticAnalyzer -- [ ] 实现DataModelRecognizer -- [ ] 批量处理优化 -- [ ] 集成测试 - -### Phase 4: 高级查询(2周) -- [ ] 实现ImpactAnalyzer -- [ ] 实现DataFlowTracer -- [ ] 实现GraphEnhancedSearchEngine -- [ ] 性能优化 - -### Phase 5: 可视化与工具(2周) -- [ ] 调用图可视化(Graphviz/D3.js) -- [ ] CLI命令集成 -- [ ] Web UI(可选) - -### Phase 6: 生产化(1-2周) -- [ ] 增量更新机制 -- [ ] 大规模项目优化 -- [ ] 文档和示例 -- [ ] 发布 - -**总计预估时间**:12-15周 - -## 5. 
技术挑战与解决方案 - -### 5.1 挑战:跨文件名称解析 - -**问题**:函数调用的目标可能在不同文件/模块中,需要解析import语句。 - -**解决方案**: -```python -class ImportResolver: - """导入语句解析器""" - - def extract_imports(self, tree, content: str) -> Dict[str, str]: - """ - 提取所有import语句,构建别名映射 - - 返回: {别名 -> 实际模块路径} - """ - imports = {} - - for node in tree.root_node.children: - if node.type == 'import_statement': - # from module import func - # import module as alias - pass # 解析逻辑 - - return imports - - def resolve_imported_symbol( - self, - symbol_name: str, - imports: Dict[str, str], - project_root: Path - ) -> Optional[str]: - """解析导入符号的实际位置""" - - if symbol_name in imports: - module_path = imports[symbol_name] - # 查找该模块的文件路径 - # 在图谱中查找对应的节点 - pass - - return None -``` - -### 5.2 挑战:动态调用识别 - -**问题**:反射、getattr、动态导入等运行时行为无法通过静态分析完全捕获。 - -**解决方案**: -- 使用LLM推断可能的调用目标 -- 标记为"动态调用",降低置信度 -- 结合运行时日志补充 - -```python -def handle_dynamic_call(edge: CodeEdge) -> CodeEdge: - """处理动态调用""" - - if 'getattr' in edge.context or 'eval' in edge.context: - edge.confidence = 0.5 - edge.semantic_intent = "dynamic call (runtime resolution required)" - - return edge -``` - -### 5.3 挑战:大型代码库性能 - -**问题**:对百万行级别的代码库构建图谱可能耗时很长。 - -**解决方案**: -- **并行处理**:多进程分析不同文件 -- **增量更新**:只重新分析变更的文件 -- **延迟LLM**:初次构建只做静态分析,LLM增强按需触发 - -```python -from multiprocessing import Pool - -class ParallelGraphBuilder: - """并行图谱构建""" - - def build_graph_parallel( - self, - file_paths: List[Path], - workers: int = 8 - ): - """并行分析多个文件""" - - with Pool(workers) as pool: - results = pool.map(self._analyze_single_file, file_paths) - - # 合并结果到图谱 - for nodes, edges in results: - for node in nodes: - self.graph_store.add_node(node) - for edge in edges: - self.graph_store.add_edge(edge) -``` - -## 6. 成功指标 - -1. **覆盖率**:90%以上的函数调用关系被正确识别 -2. **准确率**:名称解析准确率>85% -3. **性能**:10万行代码的项目,图谱构建<5分钟 -4. **查询速度**:影响分析查询<100ms -5. **LLM增强价值**:关系语义描述的有用性评分>4/5 - -## 7. 
应用场景示例 - -### 场景1:代码审查助手 -```python -# 审查一个PR,分析影响范围 -analyzer = ImpactAnalyzer(graph_store) -impact = analyzer.analyze_function_impact('auth.py:45:validate_token') - -print(f"修改此函数将影响 {impact['impact_scope']} 个其他函数") -print(f"涉及文件: {', '.join(impact['affected_files'])}") -``` - -### 场景2:重构规划 -```python -# 计划重构User类,查看所有相关操作 -tracer = DataFlowTracer(graph_store) -crud = tracer.find_crud_operations('User') - -print(f"创建User的方法: {[f.name for f in crud['create']]}") -print(f"读取User的方法: {[f.name for f in crud['read']]}") -``` - -### 场景3:知识图谱问答 -```python -# "修改登录逻辑会影响哪些API端点?" -search_engine = GraphEnhancedSearchEngine(vector_store, graph_store) - -# 先找到登录函数 -login_func = search_engine.search("user login authentication")[0] - -# 追踪调用链 -analyzer = ImpactAnalyzer(graph_store) -impact = analyzer.analyze_function_impact(login_func.node_id) - -# 筛选出API端点 -api_endpoints = [ - f for f in impact['affected_functions'] - if '@app.route' in graph_store.get_node(f).modifiers -] -``` - -## 8. 参考资料 - -- [LLVM Call Graph](https://llvm.org/docs/CallGraph.html) -- [Sourcegraph - Code Intelligence](https://about.sourcegraph.com/) -- [CodeQL - Semantic Code Analysis](https://codeql.github.com/) -- [Neo4j Graph Database](https://neo4j.com/) -- Tree-sitter AST Queries diff --git a/codex-lens/docs/T6-CLI-Integration-Summary.md b/codex-lens/docs/T6-CLI-Integration-Summary.md deleted file mode 100644 index 9b3959b1..00000000 --- a/codex-lens/docs/T6-CLI-Integration-Summary.md +++ /dev/null @@ -1,248 +0,0 @@ -# T6: CLI Integration for Hybrid Search - Implementation Summary - -## Overview - -Successfully integrated hybrid search capabilities into the CodexLens CLI with user-configurable options, migration support, and enhanced status reporting. - -## Changes Made - -### 1. 
Search Command Enhancement (`commands.py`) - -**New `--mode` Parameter:** -- Replaced `--hybrid` and `--exact-only` flags with unified `--mode` parameter -- Supported modes: `exact`, `fuzzy`, `hybrid`, `vector` -- Default: `exact` (backward compatible) - -**Mode Validation:** -```python -valid_modes = ["exact", "fuzzy", "hybrid", "vector"] -if mode not in valid_modes: - # Error with helpful message -``` - -**Weights Configuration:** -- Accepts custom RRF weights via `--weights exact,fuzzy,vector` -- Example: `--weights 0.5,0.3,0.2` -- Automatic normalization if weights don't sum to 1.0 -- Validation for 3-value format - -**Mode Mapping to SearchOptions:** -```python -hybrid_mode = mode == "hybrid" -enable_fuzzy = mode in ["fuzzy", "hybrid"] - -options = SearchOptions( - hybrid_mode=hybrid_mode, - enable_fuzzy=enable_fuzzy, - hybrid_weights=hybrid_weights, -) -``` - -**Enhanced Output:** -- Shows search mode in status line -- Includes search source tags in verbose mode -- JSON output includes mode and source information - -### 2. Migrate Command (`commands.py`) - -**New Command for Dual-FTS Upgrade:** -```bash -codex-lens migrate [path] -``` - -**Features:** -- Upgrades all `_index.db` files to schema version 4 -- Shows progress bar with percentage complete -- Tracks: migrated, already up-to-date, errors -- Safe operation preserving all data -- Verbose mode shows per-database migration details - -**Progress Tracking:** -- Uses Rich progress bar with spinner -- Shows percentage and count (N/Total) -- Time elapsed indicator - -### 3. 
Status Command Enhancement (`commands.py`) - -**New Backend Status Display:** -``` -Search Backends: - Exact FTS: ✓ (unicode61) - Fuzzy FTS: ✓ (trigram) - Hybrid Search: ✓ (RRF fusion) - Vector Search: ✗ (future) -``` - -**Schema Version Detection:** -- Checks first available `_index.db` -- Reports schema version -- Detects dual FTS table presence - -**Feature Flags in JSON:** -```json -{ - "features": { - "exact_fts": true, - "fuzzy_fts": true, - "hybrid_search": true, - "vector_search": false - } -} -``` - -### 4. Output Rendering (`output.py`) - -**Verbose Mode Support:** -```python -render_search_results(results, verbose=True) -``` - -**Search Source Tags:** -- `[E]` - Exact FTS result -- `[F]` - Fuzzy FTS result -- `[V]` - Vector search result -- `[RRF]` - Fusion result - -**Enhanced Table:** -- New "Source" column in verbose mode -- Shows result origin for debugging -- Fusion scores visible - -## Usage Examples - -### 1. Search with Different Modes - -```bash -# Exact search (default) -codex-lens search "authentication" - -# Fuzzy search only -codex-lens search "authentication" --mode fuzzy - -# Hybrid search with RRF fusion -codex-lens search "authentication" --mode hybrid - -# Hybrid with custom weights -codex-lens search "authentication" --mode hybrid --weights 0.5,0.3,0.2 - -# Verbose mode shows source tags -codex-lens search "authentication" --mode hybrid -v -``` - -### 2. Migration - -```bash -# Migrate current project -codex-lens migrate - -# Migrate specific project with verbose output -codex-lens migrate /path/to/project -v - -# JSON output for automation -codex-lens migrate --json -``` - -### 3. 
Status Checking - -```bash -# Check backend availability -codex-lens status - -# JSON output with feature flags -codex-lens status --json -``` - -## Testing - -**Test Coverage:** -- ✅ Mode parameter validation (exact, fuzzy, hybrid, vector) -- ✅ Weights parsing and normalization -- ✅ Help text shows all modes -- ✅ Migrate command exists and accessible -- ✅ Status command shows backends -- ✅ Mode mapping to SearchOptions - -**Test Results:** -``` -11 passed in 2.27s -``` - -## Integration Points - -### With Phase 1 (Dual-FTS): -- Uses `search_fts_exact()` for exact mode -- Uses `search_fts_fuzzy()` for fuzzy mode -- Schema migration via `_apply_migrations()` - -### With Phase 2 (Hybrid Search): -- Calls `HybridSearchEngine` for hybrid mode -- Passes custom weights to RRF algorithm -- Displays fusion scores and source tags - -### With Existing CLI: -- Backward compatible (default mode=exact) -- Follows existing error handling patterns -- Uses Rich for progress and formatting -- Supports JSON output mode - -## Done Criteria Verification - -✅ **CLI search --mode exact uses only exact FTS table** -- Mode validation ensures correct backend selection -- `hybrid_mode=False, enable_fuzzy=False` for exact mode - -✅ **--mode fuzzy uses only fuzzy table** -- `hybrid_mode=False, enable_fuzzy=True` for fuzzy mode -- Single backend execution - -✅ **--mode hybrid fuses both** -- `hybrid_mode=True, enable_fuzzy=True` activates RRF fusion -- HybridSearchEngine coordinates parallel search - -✅ **Custom weights via --weights 0.5,0.3,0.2** -- Parses 3-value comma-separated format -- Validates and normalizes to sum=1.0 -- Passes to RRF algorithm - -✅ **Migration command completes Dual-FTS upgrade** -- Shows progress bar with percentage -- Tracks migration status per database -- Safe operation with error handling - -✅ **Search output shows [E], [F], [V] tags and fusion scores** -- Verbose mode displays Source column -- Tags extracted from `search_source` attribute -- Fusion scores shown 
in Score column - -## Files Modified - -1. `codex-lens/src/codexlens/cli/commands.py` - - Updated `search()` command with `--mode` parameter - - Added `migrate()` command - - Enhanced `status()` command - - Added DirIndexStore import - -2. `codex-lens/src/codexlens/cli/output.py` - - Updated `render_search_results()` with verbose mode - - Added source tag display logic - -3. `codex-lens/tests/test_cli_hybrid_search.py` (new) - - Comprehensive CLI integration tests - - Mode validation tests - - Weights parsing tests - - Command availability tests - -## Performance Impact - -- **Exact mode**: Same as before (no overhead) -- **Fuzzy mode**: Single FTS query (minimal overhead) -- **Hybrid mode**: Parallel execution (2x I/O, no sequential penalty) -- **Migration**: One-time operation, safe for large projects - -## Next Steps - -Users can now: -1. Run `codex-lens migrate` to upgrade existing indexes -2. Use `codex-lens search "query" --mode hybrid` for best results -3. Check `codex-lens status` to verify enabled features -4. Tune fusion weights for their use case via `--weights` diff --git a/codex-lens/docs/codex_mcp.md b/codex-lens/docs/codex_mcp.md deleted file mode 100644 index edce9f5b..00000000 --- a/codex-lens/docs/codex_mcp.md +++ /dev/null @@ -1,459 +0,0 @@ -MCP integration -mcp_servers -You can configure Codex to use MCP servers to give Codex access to external applications, resources, or services. - -Server configuration -STDIO -STDIO servers are MCP servers that you can launch directly via commands on your computer. - -# The top-level table name must be `mcp_servers` -# The sub-table name (`server-name` in this example) can be anything you would like. -[mcp_servers.server_name] -command = "npx" -# Optional -args = ["-y", "mcp-server"] -# Optional: propagate additional env vars to the MCP server. -# A default whitelist of env vars will be propagated to the MCP server. 
-# https://github.com/openai/codex/blob/main/codex-rs/rmcp-client/src/utils.rs#L82 -env = { "API_KEY" = "value" } -# or -[mcp_servers.server_name.env] -API_KEY = "value" -# Optional: Additional list of environment variables that will be whitelisted in the MCP server's environment. -env_vars = ["API_KEY2"] - -# Optional: cwd that the command will be run from -cwd = "/Users//code/my-server" -Streamable HTTP -Streamable HTTP servers enable Codex to talk to resources that are accessed via a http url (either on localhost or another domain). - -[mcp_servers.figma] -url = "https://mcp.figma.com/mcp" -# Optional environment variable containing a bearer token to use for auth -bearer_token_env_var = "ENV_VAR" -# Optional map of headers with hard-coded values. -http_headers = { "HEADER_NAME" = "HEADER_VALUE" } -# Optional map of headers whose values will be replaced with the environment variable. -env_http_headers = { "HEADER_NAME" = "ENV_VAR" } -Streamable HTTP connections always use the experimental Rust MCP client under the hood, so expect occasional rough edges. OAuth login flows are gated on the rmcp_client = true flag: - -[features] -rmcp_client = true -After enabling it, run codex mcp login when the server supports OAuth. - -Other configuration options -# Optional: override the default 10s startup timeout -startup_timeout_sec = 20 -# Optional: override the default 60s per-tool timeout -tool_timeout_sec = 30 -# Optional: disable a server without removing it -enabled = false -# Optional: only expose a subset of tools from this server -enabled_tools = ["search", "summarize"] -# Optional: hide specific tools (applied after `enabled_tools`, if set) -disabled_tools = ["search"] -When both enabled_tools and disabled_tools are specified, Codex first restricts the server to the allow-list and then removes any tools that appear in the deny-list. 
- -MCP CLI commands -# List all available commands -codex mcp --help - -# Add a server (env can be repeated; `--` separates the launcher command) -codex mcp add docs -- docs-server --port 4000 - -# List configured servers (pretty table or JSON) -codex mcp list -codex mcp list --json - -# Show one server (table or JSON) -codex mcp get docs -codex mcp get docs --json - -# Remove a server -codex mcp remove docs - -# Log in to a streamable HTTP server that supports oauth -codex mcp login SERVER_NAME - -# Log out from a streamable HTTP server that supports oauth -codex mcp logout SERVER_NAME -Examples of useful MCPs -There is an ever growing list of useful MCP servers that can be helpful while you are working with Codex. - -Some of the most common MCPs we've seen are: - -Context7 — connect to a wide range of up-to-date developer documentation -Figma Local and Remote - access to your Figma designs -Playwright - control and inspect a browser using Playwright -Chrome Developer Tools — control and inspect a Chrome browser -Sentry — access to your Sentry logs -GitHub — Control over your GitHub account beyond what git allows (like controlling PRs, issues, etc.) - - -# Example config.toml - -Use this example configuration as a starting point. For an explanation of each field and additional context, see [Configuration](./config.md). Copy the snippet below to `~/.codex/config.toml` and adjust values as needed. - -```toml -# Codex example configuration (config.toml) -# -# This file lists all keys Codex reads from config.toml, their default values, -# and concise explanations. Values here mirror the effective defaults compiled -# into the CLI. Adjust as needed. -# -# Notes -# - Root keys must appear before tables in TOML. -# - Optional keys that default to "unset" are shown commented out with notes. -# - MCP servers, profiles, and model providers are examples; remove or edit. 
- -################################################################################ -# Core Model Selection -################################################################################ - -# Primary model used by Codex. Default: "gpt-5.1-codex-max" on all platforms. -model = "gpt-5.1-codex-max" - -# Model used by the /review feature (code reviews). Default: "gpt-5.1-codex-max". -review_model = "gpt-5.1-codex-max" - -# Provider id selected from [model_providers]. Default: "openai". -model_provider = "openai" - -# Optional manual model metadata. When unset, Codex auto-detects from model. -# Uncomment to force values. -# model_context_window = 128000 # tokens; default: auto for model -# model_auto_compact_token_limit = 0 # disable/override auto; default: model family specific -# tool_output_token_limit = 10000 # tokens stored per tool output; default: 10000 for gpt-5.1-codex-max - -################################################################################ -# Reasoning & Verbosity (Responses API capable models) -################################################################################ - -# Reasoning effort: minimal | low | medium | high | xhigh (default: medium; xhigh on gpt-5.1-codex-max and gpt-5.2) -model_reasoning_effort = "medium" - -# Reasoning summary: auto | concise | detailed | none (default: auto) -model_reasoning_summary = "auto" - -# Text verbosity for GPT-5 family (Responses API): low | medium | high (default: medium) -model_verbosity = "medium" - -# Force-enable reasoning summaries for current model (default: false) -model_supports_reasoning_summaries = false - -# Force reasoning summary format: none | experimental (default: none) -model_reasoning_summary_format = "none" - -################################################################################ -# Instruction Overrides -################################################################################ - -# Additional user instructions appended after AGENTS.md. Default: unset. 
-# developer_instructions = "" - -# Optional legacy base instructions override (prefer AGENTS.md). Default: unset. -# instructions = "" - -# Inline override for the history compaction prompt. Default: unset. -# compact_prompt = "" - -# Override built-in base instructions with a file path. Default: unset. -# experimental_instructions_file = "/absolute/or/relative/path/to/instructions.txt" - -# Load the compact prompt override from a file. Default: unset. -# experimental_compact_prompt_file = "/absolute/or/relative/path/to/compact_prompt.txt" - -################################################################################ -# Approval & Sandbox -################################################################################ - -# When to ask for command approval: -# - untrusted: only known-safe read-only commands auto-run; others prompt -# - on-failure: auto-run in sandbox; prompt only on failure for escalation -# - on-request: model decides when to ask (default) -# - never: never prompt (risky) -approval_policy = "on-request" - -# Filesystem/network sandbox policy for tool calls: -# - read-only (default) -# - workspace-write -# - danger-full-access (no sandbox; extremely risky) -sandbox_mode = "read-only" - -# Extra settings used only when sandbox_mode = "workspace-write". -[sandbox_workspace_write] -# Additional writable roots beyond the workspace (cwd). Default: [] -writable_roots = [] -# Allow outbound network access inside the sandbox. Default: false -network_access = false -# Exclude $TMPDIR from writable roots. Default: false -exclude_tmpdir_env_var = false -# Exclude /tmp from writable roots. 
Default: false -exclude_slash_tmp = false - -################################################################################ -# Shell Environment Policy for spawned processes -################################################################################ - -[shell_environment_policy] -# inherit: all (default) | core | none -inherit = "all" -# Skip default excludes for names containing KEY/TOKEN (case-insensitive). Default: false -ignore_default_excludes = false -# Case-insensitive glob patterns to remove (e.g., "AWS_*", "AZURE_*"). Default: [] -exclude = [] -# Explicit key/value overrides (always win). Default: {} -set = {} -# Whitelist; if non-empty, keep only matching vars. Default: [] -include_only = [] -# Experimental: run via user shell profile. Default: false -experimental_use_profile = false - -################################################################################ -# History & File Opener -################################################################################ - -[history] -# save-all (default) | none -persistence = "save-all" -# Maximum bytes for history file; oldest entries are trimmed when exceeded. Example: 5242880 -# max_bytes = 0 - -# URI scheme for clickable citations: vscode (default) | vscode-insiders | windsurf | cursor | none -file_opener = "vscode" - -################################################################################ -# UI, Notifications, and Misc -################################################################################ - -[tui] -# Desktop notifications from the TUI: boolean or filtered list. Default: true -# Examples: false | ["agent-turn-complete", "approval-requested"] -notifications = false - -# Enables welcome/status/spinner animations. Default: true -animations = true - -# Suppress internal reasoning events from output. Default: false -hide_agent_reasoning = false - -# Show raw reasoning content when available. 
Default: false -show_raw_agent_reasoning = false - -# Disable burst-paste detection in the TUI. Default: false -disable_paste_burst = false - -# Track Windows onboarding acknowledgement (Windows only). Default: false -windows_wsl_setup_acknowledged = false - -# External notifier program (argv array). When unset: disabled. -# Example: notify = ["notify-send", "Codex"] -# notify = [ ] - -# In-product notices (mostly set automatically by Codex). -[notice] -# hide_full_access_warning = true -# hide_rate_limit_model_nudge = true - -################################################################################ -# Authentication & Login -################################################################################ - -# Where to persist CLI login credentials: file (default) | keyring | auto -cli_auth_credentials_store = "file" - -# Base URL for ChatGPT auth flow (not OpenAI API). Default: -chatgpt_base_url = "https://chatgpt.com/backend-api/" - -# Restrict ChatGPT login to a specific workspace id. Default: unset. -# forced_chatgpt_workspace_id = "" - -# Force login mechanism when Codex would normally auto-select. Default: unset. -# Allowed values: chatgpt | api -# forced_login_method = "chatgpt" - -# Preferred store for MCP OAuth credentials: auto (default) | file | keyring -mcp_oauth_credentials_store = "auto" - -################################################################################ -# Project Documentation Controls -################################################################################ - -# Max bytes from AGENTS.md to embed into first-turn instructions. Default: 32768 -project_doc_max_bytes = 32768 - -# Ordered fallbacks when AGENTS.md is missing at a directory level. 
Default: [] -project_doc_fallback_filenames = [] - -################################################################################ -# Tools (legacy toggles kept for compatibility) -################################################################################ - -[tools] -# Enable web search tool (alias: web_search_request). Default: false -web_search = false - -# Enable the view_image tool so the agent can attach local images. Default: true -view_image = true - -# (Alias accepted) You can also write: -# web_search_request = false - -################################################################################ -# Centralized Feature Flags (preferred) -################################################################################ - -[features] -# Leave this table empty to accept defaults. Set explicit booleans to opt in/out. -unified_exec = false -rmcp_client = false -apply_patch_freeform = false -view_image_tool = true -web_search_request = false -ghost_commit = false -enable_experimental_windows_sandbox = false -skills = false - -################################################################################ -# Experimental toggles (legacy; prefer [features]) -################################################################################ - -# Include apply_patch via freeform editing path (affects default tool set). Default: false -experimental_use_freeform_apply_patch = false - -# Define MCP servers under this table. Leave empty to disable. 
-[mcp_servers] - -# --- Example: STDIO transport --- -# [mcp_servers.docs] -# command = "docs-server" # required -# args = ["--port", "4000"] # optional -# env = { "API_KEY" = "value" } # optional key/value pairs copied as-is -# env_vars = ["ANOTHER_SECRET"] # optional: forward these from the parent env -# cwd = "/path/to/server" # optional working directory override -# startup_timeout_sec = 10.0 # optional; default 10.0 seconds -# # startup_timeout_ms = 10000 # optional alias for startup timeout (milliseconds) -# tool_timeout_sec = 60.0 # optional; default 60.0 seconds -# enabled_tools = ["search", "summarize"] # optional allow-list -# disabled_tools = ["slow-tool"] # optional deny-list (applied after allow-list) - -# --- Example: Streamable HTTP transport --- -# [mcp_servers.github] -# url = "https://github-mcp.example.com/mcp" # required -# bearer_token_env_var = "GITHUB_TOKEN" # optional; Authorization: Bearer -# http_headers = { "X-Example" = "value" } # optional static headers -# env_http_headers = { "X-Auth" = "AUTH_ENV" } # optional headers populated from env vars -# startup_timeout_sec = 10.0 # optional -# tool_timeout_sec = 60.0 # optional -# enabled_tools = ["list_issues"] # optional allow-list - -################################################################################ -# Model Providers (extend/override built-ins) -################################################################################ - -# Built-ins include: -# - openai (Responses API; requires login or OPENAI_API_KEY via auth flow) -# - oss (Chat Completions API; defaults to http://localhost:11434/v1) - -[model_providers] - -# --- Example: override OpenAI with explicit base URL or headers --- -# [model_providers.openai] -# name = "OpenAI" -# base_url = "https://api.openai.com/v1" # default if unset -# wire_api = "responses" # "responses" | "chat" (default varies) -# # requires_openai_auth = true # built-in OpenAI defaults to true -# # request_max_retries = 4 # default 4; max 100 -# # 
stream_max_retries = 5 # default 5; max 100 -# # stream_idle_timeout_ms = 300000 # default 300_000 (5m) -# # experimental_bearer_token = "sk-example" # optional dev-only direct bearer token -# # http_headers = { "X-Example" = "value" } -# # env_http_headers = { "OpenAI-Organization" = "OPENAI_ORGANIZATION", "OpenAI-Project" = "OPENAI_PROJECT" } - -# --- Example: Azure (Chat/Responses depending on endpoint) --- -# [model_providers.azure] -# name = "Azure" -# base_url = "https://YOUR_PROJECT_NAME.openai.azure.com/openai" -# wire_api = "responses" # or "chat" per endpoint -# query_params = { api-version = "2025-04-01-preview" } -# env_key = "AZURE_OPENAI_API_KEY" -# # env_key_instructions = "Set AZURE_OPENAI_API_KEY in your environment" - -# --- Example: Local OSS (e.g., Ollama-compatible) --- -# [model_providers.ollama] -# name = "Ollama" -# base_url = "http://localhost:11434/v1" -# wire_api = "chat" - -################################################################################ -# Profiles (named presets) -################################################################################ - -# Active profile name. When unset, no profile is applied. -# profile = "default" - -[profiles] - -# [profiles.default] -# model = "gpt-5.1-codex-max" -# model_provider = "openai" -# approval_policy = "on-request" -# sandbox_mode = "read-only" -# model_reasoning_effort = "medium" -# model_reasoning_summary = "auto" -# model_verbosity = "medium" -# chatgpt_base_url = "https://chatgpt.com/backend-api/" -# experimental_compact_prompt_file = "compact_prompt.txt" -# include_apply_patch_tool = false -# experimental_use_freeform_apply_patch = false -# tools_web_search = false -# tools_view_image = true -# features = { unified_exec = false } - -################################################################################ -# Projects (trust levels) -################################################################################ - -# Mark specific worktrees as trusted. 
Only "trusted" is recognized. -[projects] -# [projects."/absolute/path/to/project"] -# trust_level = "trusted" - -################################################################################ -# OpenTelemetry (OTEL) – disabled by default -################################################################################ - -[otel] -# Include user prompt text in logs. Default: false -log_user_prompt = false -# Environment label applied to telemetry. Default: "dev" -environment = "dev" -# Exporter: none (default) | otlp-http | otlp-grpc -exporter = "none" - -# Example OTLP/HTTP exporter configuration -# [otel.exporter."otlp-http"] -# endpoint = "https://otel.example.com/v1/logs" -# protocol = "binary" # "binary" | "json" - -# [otel.exporter."otlp-http".headers] -# "x-otlp-api-key" = "${OTLP_TOKEN}" - -# Example OTLP/gRPC exporter configuration -# [otel.exporter."otlp-grpc"] -# endpoint = "https://otel.example.com:4317", -# headers = { "x-otlp-meta" = "abc123" } - -# Example OTLP exporter with mutual TLS -# [otel.exporter."otlp-http"] -# endpoint = "https://otel.example.com/v1/logs" -# protocol = "binary" - -# [otel.exporter."otlp-http".headers] -# "x-otlp-api-key" = "${OTLP_TOKEN}" - -# [otel.exporter."otlp-http".tls] -# ca-certificate = "certs/otel-ca.pem" -# client-certificate = "/etc/codex/certs/client.pem" -# client-private-key = "/etc/codex/certs/client-key.pem" -``` \ No newline at end of file diff --git a/codex-lens/docs/test-quality-enhancements.md b/codex-lens/docs/test-quality-enhancements.md deleted file mode 100644 index 03853038..00000000 --- a/codex-lens/docs/test-quality-enhancements.md +++ /dev/null @@ -1,187 +0,0 @@ -# Test Quality Enhancements - Implementation Summary - -**Date**: 2025-12-16 -**Status**: ✅ Complete - All 4 recommendations implemented and passing - -## Overview - -Implemented all 4 test quality recommendations from Gemini's comprehensive analysis to enhance test coverage and robustness across the codex-lens test suite. 
- -## Recommendation 1: Verify True Fuzzy Matching ✅ - -**File**: `tests/test_dual_fts.py` -**Test Class**: `TestDualFTSPerformance` -**New Test**: `test_fuzzy_substring_matching` - -### Implementation -- Verifies trigram tokenizer enables partial token matching -- Tests that searching for "func" matches "function0", "function1", etc. -- Gracefully skips if trigram tokenizer unavailable -- Validates BM25 scoring for fuzzy results - -### Key Features -- Runtime detection of trigram support -- Validates substring matching capability -- Ensures proper score ordering (negative BM25) - -### Test Result -```bash -PASSED tests/test_dual_fts.py::TestDualFTSPerformance::test_fuzzy_substring_matching -``` - ---- - -## Recommendation 2: Enable Mocked Vector Search ✅ - -**File**: `tests/test_hybrid_search_e2e.py` -**Test Class**: `TestHybridSearchWithVectorMock` -**New Test**: `test_hybrid_with_vector_enabled` - -### Implementation -- Mocks vector search to return predefined results -- Tests RRF fusion with exact + fuzzy + vector sources -- Validates hybrid search handles vector integration correctly -- Uses `unittest.mock.patch` for clean mocking - -### Key Features -- Mock SearchResult objects with scores -- Tests enable_vector=True parameter -- Validates RRF fusion score calculation (positive scores) -- Gracefully handles missing vector search module - -### Test Result -```bash -PASSED tests/test_hybrid_search_e2e.py::TestHybridSearchWithVectorMock::test_hybrid_with_vector_enabled -``` - ---- - -## Recommendation 3: Complex Query Parser Stress Tests ✅ - -**File**: `tests/test_query_parser.py` -**Test Class**: `TestComplexBooleanQueries` -**New Tests**: 5 comprehensive tests - -### Implementation - -#### 1. `test_nested_boolean_and_or` -- Tests: `(login OR logout) AND user` -- Validates nested parentheses preservation -- Ensures boolean operators remain intact - -#### 2. 
`test_mixed_operators_with_expansion` -- Tests: `UserAuth AND (login OR logout)` -- Verifies CamelCase expansion doesn't break operators -- Ensures expansion + boolean logic coexist - -#### 3. `test_quoted_phrases_with_boolean` -- Tests: `"user authentication" AND login` -- Validates quoted phrase preservation -- Ensures AND operator survives - -#### 4. `test_not_operator_preservation` -- Tests: `login NOT logout` -- Confirms NOT operator handling -- Validates negation logic - -#### 5. `test_complex_nested_three_levels` -- Tests: `((UserAuth OR login) AND session) OR token` -- Stress tests deep nesting (3 levels) -- Validates multiple parentheses pairs - -### Test Results -```bash -PASSED tests/test_query_parser.py::TestComplexBooleanQueries::test_nested_boolean_and_or -PASSED tests/test_query_parser.py::TestComplexBooleanQueries::test_mixed_operators_with_expansion -PASSED tests/test_query_parser.py::TestComplexBooleanQueries::test_quoted_phrases_with_boolean -PASSED tests/test_query_parser.py::TestComplexBooleanQueries::test_not_operator_preservation -PASSED tests/test_query_parser.py::TestComplexBooleanQueries::test_complex_nested_three_levels -``` - ---- - -## Recommendation 4: Migration Reversibility Tests ✅ - -**File**: `tests/test_dual_fts.py` -**Test Class**: `TestMigrationRecovery` -**New Tests**: 2 migration robustness tests - -### Implementation - -#### 1. `test_migration_preserves_data_on_failure` -- Creates v2 database with test data -- Attempts migration (may succeed or fail) -- Validates data preservation in both scenarios -- Smart column detection (path vs full_path) - -**Key Features**: -- Checks schema version to determine column names -- Handles both migration success and failure -- Ensures no data loss - -#### 2. 
`test_migration_idempotent_after_partial_failure` -- Tests retry capability after partial migration -- Validates graceful handling of repeated initialization -- Ensures database remains in usable state - -**Key Features**: -- Double initialization without errors -- Table existence verification -- Safe retry mechanism - -### Test Results -```bash -PASSED tests/test_dual_fts.py::TestMigrationRecovery::test_migration_preserves_data_on_failure -PASSED tests/test_dual_fts.py::TestMigrationRecovery::test_migration_idempotent_after_partial_failure -``` - ---- - -## Test Suite Statistics - -### Overall Results -``` -91 passed, 2 skipped, 2 warnings in 3.31s -``` - -### New Tests Added -- **Recommendation 1**: 1 test (fuzzy substring matching) -- **Recommendation 2**: 1 test (vector mock integration) -- **Recommendation 3**: 5 tests (complex boolean queries) -- **Recommendation 4**: 2 tests (migration recovery) - -**Total New Tests**: 9 - -### Coverage Improvements -- **Fuzzy Search**: Now validates actual trigram substring matching -- **Hybrid Search**: Tests vector integration with mocks -- **Query Parser**: Handles complex nested boolean logic -- **Migration**: Validates data preservation and retry capability - ---- - -## Code Quality - -### Best Practices Applied -1. **Graceful Degradation**: Tests skip when features unavailable (trigram) -2. **Clean Mocking**: Uses `unittest.mock` for vector search -3. **Smart Assertions**: Adapts to migration outcomes dynamically -4. 
**Edge Case Handling**: Tests multiple nesting levels and operators - -### Integration -- All tests integrate seamlessly with existing pytest fixtures -- Maintains 100% pass rate across test suite -- No breaking changes to existing tests - ---- - -## Validation - -All 4 recommendations successfully implemented and verified: - -✅ **Recommendation 1**: Fuzzy substring matching with trigram validation -✅ **Recommendation 2**: Vector search mocking for hybrid fusion testing -✅ **Recommendation 3**: Complex boolean query stress tests (5 tests) -✅ **Recommendation 4**: Migration recovery and idempotency tests (2 tests) - -**Final Status**: Production-ready, all tests passing diff --git a/codex-lens/examples/association_tree_demo.py b/codex-lens/examples/association_tree_demo.py deleted file mode 100644 index 719f9383..00000000 --- a/codex-lens/examples/association_tree_demo.py +++ /dev/null @@ -1,156 +0,0 @@ -"""Demo script for association tree building. - -This script demonstrates how to use the AssociationTreeBuilder and -ResultDeduplicator to explore code relationships via LSP call hierarchy. 
-""" - -import asyncio -import sys -from pathlib import Path - -# Add parent directory to path for imports -sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - -from codexlens.lsp.standalone_manager import StandaloneLspManager -from codexlens.search.association_tree import ( - AssociationTreeBuilder, - ResultDeduplicator, -) - - -async def demo_simple_tree(): - """Build a simple call tree from a Python file.""" - print("=" * 70) - print("Association Tree Demo") - print("=" * 70) - print() - - # Use this file as the test subject - test_file = Path(__file__).resolve() - workspace_root = test_file.parent.parent - - print(f"Workspace: {workspace_root}") - print(f"Test file: {test_file.name}") - print() - - # Initialize LSP manager - async with StandaloneLspManager( - workspace_root=str(workspace_root), - timeout=10.0, - ) as lsp: - print("LSP manager initialized") - print() - - # Create tree builder - builder = AssociationTreeBuilder(lsp, timeout=5.0) - - # Build tree from a function in this file - # Using line 50 as an example (adjust based on actual file) - print(f"Building call tree from {test_file.name}:50...") - tree = await builder.build_tree( - seed_file_path=str(test_file), - seed_line=50, - seed_character=1, - max_depth=3, - expand_callers=True, - expand_callees=True, - ) - - print(f"Tree built: {tree}") - print(f" Roots: {len(tree.roots)}") - print(f" Total unique nodes: {len(tree.all_nodes)}") - print(f" Total node instances: {len(tree.node_list)}") - print(f" Edges: {len(tree.edges)}") - print() - - if tree.roots: - print("Root nodes:") - for root in tree.roots: - print(f" - {root.item.name} ({root.item.kind})") - print(f" {root.item.file_path}:{root.item.range.start_line}") - print() - - # Deduplicate and score - print("Deduplicating and scoring nodes...") - deduplicator = ResultDeduplicator( - depth_weight=0.4, - frequency_weight=0.3, - kind_weight=0.3, - ) - - unique_nodes = deduplicator.deduplicate(tree, max_results=20) - print(f"Found 
{len(unique_nodes)} unique nodes") - print() - - if unique_nodes: - print("Top 10 nodes by score:") - print("-" * 70) - for i, node in enumerate(unique_nodes[:10], 1): - print(f"{i:2}. {node.name} ({node.kind})") - print(f" Location: {Path(node.file_path).name}:{node.range.start_line}") - print( - f" Depth: {node.min_depth}, " - f"Occurrences: {node.occurrences}, " - f"Score: {node.score:.3f}" - ) - if node.paths: - print(f" Paths: {len(node.paths)}") - print() - - # Show filtering capabilities - functions = deduplicator.filter_by_kind( - unique_nodes, ["function", "method"] - ) - print(f"Functions/methods only: {len(functions)} nodes") - - if functions: - print("Top 5 functions:") - for i, node in enumerate(functions[:5], 1): - print(f" {i}. {node.name} (score: {node.score:.3f})") - - else: - print("No nodes found. Try a different seed location.") - - print() - print("Demo complete!") - - -async def demo_cycle_detection(): - """Demonstrate cycle detection in call trees.""" - print("\n" + "=" * 70) - print("Cycle Detection Demo") - print("=" * 70) - print() - - # Create a simple Python file with circular calls for testing - test_code = ''' -def func_a(): - """Function A calls B.""" - func_b() - -def func_b(): - """Function B calls A (creates a cycle).""" - func_a() -''' - - print("This demo would detect cycles in:") - print(test_code) - print("The tree builder automatically marks cycle nodes to prevent infinite expansion.") - - -def main(): - """Run the demo.""" - try: - asyncio.run(demo_simple_tree()) - demo_cycle_detection() - except KeyboardInterrupt: - print("\nDemo interrupted by user") - except Exception as e: - print(f"\nError running demo: {e}") - import traceback - - traceback.print_exc() - - -if __name__ == "__main__": - main() diff --git a/codex-lens/examples/debug_uri_format.py b/codex-lens/examples/debug_uri_format.py deleted file mode 100644 index 4c1c965f..00000000 --- a/codex-lens/examples/debug_uri_format.py +++ /dev/null @@ -1,40 +0,0 @@ -"""Debug 
URI format issues.""" - -import asyncio -from pathlib import Path -from urllib.parse import quote - -def test_uri_formats(): - """Compare different URI formats.""" - file_path = Path("D:/Claude_dms3/codex-lens/test_simple_function.py") - - print("URI Format Comparison") - print("="*80) - - # Method 1: Path.as_uri() - uri1 = file_path.resolve().as_uri() - print(f"1. Path.as_uri(): {uri1}") - - # Method 2: Manual construction - uri2 = f"file:///{str(file_path.resolve()).replace(chr(92), '/')}" - print(f"2. Manual (forward /): {uri2}") - - # Method 3: With quote - path_str = str(file_path.resolve()).replace(chr(92), '/') - uri3 = f"file:///{quote(path_str, safe='/:')}" - print(f"3. With quote: {uri3}") - - # Method 4: Lowercase drive - path_lower = str(file_path.resolve()).replace(chr(92), '/') - if len(path_lower) > 1 and path_lower[1] == ':': - path_lower = path_lower[0].lower() + path_lower[1:] - uri4 = f"file:///{path_lower}" - print(f"4. Lowercase drive: {uri4}") - - # What Pyright shows in logs - print(f"\n5. Pyright log format: file:///d%3A/Claude_dms3/codex-lens/...") - - return uri1, uri4 - -if __name__ == "__main__": - test_uri_formats() diff --git a/codex-lens/examples/search_comparison_benchmark.py b/codex-lens/examples/search_comparison_benchmark.py deleted file mode 100644 index 88029b61..00000000 --- a/codex-lens/examples/search_comparison_benchmark.py +++ /dev/null @@ -1,326 +0,0 @@ -"""Search method comparison benchmark. - -Compares different search strategies: -1. Pure FTS (exact + fuzzy matching) -2. Pure Vector (semantic search only) -3. Hybrid Fusion (FTS + Vector with RRF) -4. 
Vector + LSP Association Tree (new strategy) - -Usage: - python examples/search_comparison_benchmark.py -""" - -from __future__ import annotations - -import asyncio -import time -from pathlib import Path -from typing import List, Dict, Any - -from codexlens.config import Config -from codexlens.entities import SearchResult -from codexlens.search.hybrid_search import HybridSearchEngine -from codexlens.lsp.standalone_manager import StandaloneLspManager -from codexlens.search.association_tree import AssociationTreeBuilder, ResultDeduplicator - - -class SearchBenchmark: - """Benchmark different search strategies.""" - - def __init__(self, index_path: Path, config: Config): - """Initialize benchmark. - - Args: - index_path: Path to _index.db file - config: CodexLens config - """ - self.index_path = index_path - self.config = config - self.engine = HybridSearchEngine(config=config) - self.lsp_manager: StandaloneLspManager | None = None - self.tree_builder: AssociationTreeBuilder | None = None - self.deduplicator = ResultDeduplicator( - depth_weight=0.4, - frequency_weight=0.3, - kind_weight=0.3, - max_depth_penalty=10, - ) - - async def setup_lsp(self): - """Setup LSP manager for association tree search.""" - self.lsp_manager = StandaloneLspManager( - workspace_root=str(self.index_path.parent), - timeout=5.0, - ) - await self.lsp_manager.start() - self.tree_builder = AssociationTreeBuilder( - lsp_manager=self.lsp_manager, - timeout=5.0, - ) - - async def cleanup_lsp(self): - """Cleanup LSP manager.""" - if self.lsp_manager: - await self.lsp_manager.stop() - - def method1_pure_fts(self, query: str, limit: int = 20) -> tuple[List[SearchResult], float]: - """Method 1: Pure FTS (exact + fuzzy).""" - start = time.perf_counter() - results = self.engine.search( - index_path=self.index_path, - query=query, - limit=limit, - enable_fuzzy=True, - enable_vector=False, - pure_vector=False, - ) - elapsed = time.perf_counter() - start - return results, elapsed - - def 
method2_pure_vector(self, query: str, limit: int = 20) -> tuple[List[SearchResult], float]: - """Method 2: Pure Vector (semantic search only).""" - start = time.perf_counter() - results = self.engine.search( - index_path=self.index_path, - query=query, - limit=limit, - enable_fuzzy=False, - enable_vector=True, - pure_vector=True, - ) - elapsed = time.perf_counter() - start - return results, elapsed - - def method3_hybrid_fusion(self, query: str, limit: int = 20) -> tuple[List[SearchResult], float]: - """Method 3: Hybrid Fusion (FTS + Vector with RRF).""" - start = time.perf_counter() - results = self.engine.search( - index_path=self.index_path, - query=query, - limit=limit, - enable_fuzzy=True, - enable_vector=True, - pure_vector=False, - ) - elapsed = time.perf_counter() - start - return results, elapsed - - async def method4_vector_lsp_tree( - self, - query: str, - limit: int = 20, - max_depth: int = 3, - expand_callers: bool = True, - expand_callees: bool = True, - ) -> tuple[List[SearchResult], float, Dict[str, Any]]: - """Method 4: Vector + LSP Association Tree (new strategy). - - Steps: - 1. Vector search to find seed results (top 5-10) - 2. For each seed, build LSP association tree - 3. Deduplicate and score all discovered nodes - 4. Return top N results - - Args: - query: Search query - limit: Final result limit - max_depth: Maximum depth for LSP tree expansion - expand_callers: Whether to expand incoming calls - expand_callees: Whether to expand outgoing calls - - Returns: - Tuple of (results, elapsed_time, stats) - """ - if not self.tree_builder: - raise RuntimeError("LSP not initialized. 
Call setup_lsp() first.") - - start = time.perf_counter() - stats = { - "seed_count": 0, - "trees_built": 0, - "total_tree_nodes": 0, - "unique_nodes": 0, - "dedup_time_ms": 0, - } - - # Step 1: Get seed results from vector search (top 10) - seed_results = self.engine.search( - index_path=self.index_path, - query=query, - limit=10, - enable_fuzzy=False, - enable_vector=True, - pure_vector=True, - ) - stats["seed_count"] = len(seed_results) - - if not seed_results: - return [], time.perf_counter() - start, stats - - # Step 2: Build association trees for each seed - all_trees = [] - for seed in seed_results: - try: - tree = await self.tree_builder.build_tree( - seed_file_path=seed.path, - seed_line=seed.start_line or 1, - seed_character=1, - max_depth=max_depth, - expand_callers=expand_callers, - expand_callees=expand_callees, - ) - if tree.node_list: - all_trees.append(tree) - stats["trees_built"] += 1 - stats["total_tree_nodes"] += len(tree.node_list) - except Exception as e: - print(f"Error building tree for {seed.path}:{seed.start_line}: {e}") - continue - - if not all_trees: - # Fallback to seed results if no trees built - return seed_results[:limit], time.perf_counter() - start, stats - - # Step 3: Merge and deduplicate all trees - dedup_start = time.perf_counter() - - # Merge all node_lists into a single CallTree - from codexlens.search.association_tree.data_structures import CallTree - merged_tree = CallTree() - for tree in all_trees: - merged_tree.node_list.extend(tree.node_list) - - # Deduplicate - unique_nodes = self.deduplicator.deduplicate( - tree=merged_tree, - max_results=limit, - ) - stats["unique_nodes"] = len(unique_nodes) - stats["dedup_time_ms"] = (time.perf_counter() - dedup_start) * 1000 - - # Step 4: Convert UniqueNode to SearchResult - results = [] - for node in unique_nodes: - # Use node.score as the search score - result = SearchResult( - path=node.file_path, - score=node.score, - start_line=node.range.start_line, - 
end_line=node.range.end_line, - symbol_name=node.name, - symbol_kind=node.kind, - content="", # LSP doesn't provide content - metadata={"search_source": "lsp_tree"}, - ) - results.append(result) - - elapsed = time.perf_counter() - start - return results, elapsed, stats - - def print_results(self, method_name: str, results: List[SearchResult], elapsed: float, stats: Dict[str, Any] | None = None): - """Print benchmark results.""" - print(f"\n{'='*80}") - print(f"Method: {method_name}") - print(f"{'='*80}") - print(f"Time: {elapsed*1000:.2f}ms") - print(f"Results: {len(results)}") - - if stats: - print(f"\nStats:") - for key, value in stats.items(): - print(f" {key}: {value}") - - print(f"\nTop 5 Results:") - for i, result in enumerate(results[:5], 1): - print(f"{i}. [{result.score:.4f}] {result.path}:{result.start_line}") - if result.symbol_name: - print(f" Name: {result.symbol_name}") - if result.metadata.get("search_source"): - print(f" Source: {result.metadata.get('search_source')}") - - async def run_comparison(self, query: str, limit: int = 20): - """Run comparison for a single query.""" - print(f"\n{'#'*80}") - print(f"Query: {query}") - print(f"{'#'*80}") - - # Method 1: Pure FTS - results1, time1 = self.method1_pure_fts(query, limit) - self.print_results("Method 1: Pure FTS", results1, time1) - - # Method 2: Pure Vector - results2, time2 = self.method2_pure_vector(query, limit) - self.print_results("Method 2: Pure Vector", results2, time2) - - # Method 3: Hybrid Fusion - results3, time3 = self.method3_hybrid_fusion(query, limit) - self.print_results("Method 3: Hybrid Fusion (FTS+Vector)", results3, time3) - - # Method 4: Vector + LSP Tree (requires LSP setup) - results4 = None - time4 = 0.0 - try: - results4, time4, stats4 = await self.method4_vector_lsp_tree(query, limit, max_depth=3) - self.print_results("Method 4: Vector + LSP Association Tree", results4, time4, stats4) - except Exception as e: - print(f"\nMethod 4: Vector + LSP Association Tree") - 
print(f"Error: {e}") - - # Comparison summary - print(f"\n{'='*80}") - print(f"Summary") - print(f"{'='*80}") - print(f"Method 1 (FTS): {time1*1000:8.2f}ms {len(results1):3d} results") - print(f"Method 2 (Vector): {time2*1000:8.2f}ms {len(results2):3d} results") - print(f"Method 3 (Hybrid): {time3*1000:8.2f}ms {len(results3):3d} results") - if results4 is not None: - print(f"Method 4 (Vector+LSP): {time4*1000:8.2f}ms {len(results4):3d} results") - - -async def main(): - """Main benchmark entry point.""" - # Setup - use the actual index path from ~/.codexlens/indexes/ - import os - codexlens_home = Path(os.path.expanduser("~/.codexlens")) - index_path = codexlens_home / "indexes/D/Claude_dms3/codex-lens/src/codexlens/_index.db" - - if not index_path.exists(): - print(f"Error: Index not found at {index_path}") - print("Please run: python -m codexlens index init src") - return - - project_root = Path("D:/Claude_dms3/codex-lens/src") - - config = Config() - benchmark = SearchBenchmark(index_path, config) - - # Test queries - queries = [ - "vector search implementation", - "LSP call hierarchy", - "search result ranking", - "index building", - ] - - # Setup LSP for Method 4 - print("Setting up LSP manager...") - try: - await benchmark.setup_lsp() - print("LSP manager ready") - except Exception as e: - print(f"Warning: Could not setup LSP: {e}") - print("Method 4 will be skipped") - - try: - # Run benchmarks - for query in queries: - await benchmark.run_comparison(query, limit=20) - - finally: - # Cleanup - await benchmark.cleanup_lsp() - print("\nBenchmark complete") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/codex-lens/examples/simple_search_comparison.py b/codex-lens/examples/simple_search_comparison.py deleted file mode 100644 index 3fdbeaee..00000000 --- a/codex-lens/examples/simple_search_comparison.py +++ /dev/null @@ -1,123 +0,0 @@ -"""Simple search method comparison using CLI commands. - -Compares: -1. FTS (Full-Text Search) -2. 
Semantic (Dense + Rerank) -3. Hybrid (Future: FTS + Semantic fusion) - -Usage: - python examples/simple_search_comparison.py -""" - -import subprocess -import time -import json -import re -import os -from pathlib import Path - -def strip_ansi(text: str) -> str: - """Remove ANSI color codes from text.""" - ansi_escape = re.compile(r'\x1b\[[0-9;]*m') - return ansi_escape.sub('', text) - -def run_search(query: str, method: str, limit: int = 20) -> tuple[list, float]: - """Run search via CLI and measure time.""" - cmd = [ - "python", "-m", "codexlens", "search", - query, - "--method", method, - "--limit", str(limit), - "--json", - "-p", "." - ] - - start = time.perf_counter() - result = subprocess.run( - cmd, - cwd=str(Path("D:/Claude_dms3/codex-lens/src")), - capture_output=True, - text=True, - env={**os.environ, "NO_COLOR": "1"}, # Try to disable colors - ) - elapsed = time.perf_counter() - start - - if result.returncode != 0: - print(f"Error running {method} search:") - print(result.stderr[:200]) - return [], elapsed - - try: - # Strip ANSI codes and parse JSON - clean_output = strip_ansi(result.stdout) - data = json.loads(clean_output) - # Results are nested in "result" object - if "result" in data and "results" in data["result"]: - return data["result"]["results"], elapsed - return data.get("results", []), elapsed - except json.JSONDecodeError as e: - print(f"Failed to parse JSON output for {method}: {e}") - return [], elapsed - - -def print_comparison(query: str): - """Print comparison for a single query.""" - print(f"\n{'='*80}") - print(f"Query: {query}") - print(f"{'='*80}\n") - - # Method 1: FTS - print("Method 1: FTS (Full-Text Search)") - results_fts, time_fts = run_search(query, "fts", 20) - print(f" Time: {time_fts*1000:.2f}ms") - print(f" Results: {len(results_fts)}") - if results_fts: - print(f" Top 3:") - for i, r in enumerate(results_fts[:3], 1): - path = r.get("path", "").replace("D:\\Claude_dms3\\codex-lens\\src\\", "") - score = r.get("score", 0) - 
print(f" {i}. [{score:.4f}] {path}") - print() - - # Method 2: Semantic (Dense + Rerank) - print("Method 2: Semantic (Dense + Rerank)") - results_semantic, time_semantic = run_search(query, "dense_rerank", 20) - print(f" Time: {time_semantic*1000:.2f}ms") - print(f" Results: {len(results_semantic)}") - if results_semantic: - print(f" Top 3:") - for i, r in enumerate(results_semantic[:3], 1): - path = r.get("path", "").replace("D:\\Claude_dms3\\codex-lens\\src\\", "") - score = r.get("score", 0) - print(f" {i}. [{score:.4f}] {path}") - print() - - # Summary - print(f"Summary:") - print(f" FTS: {time_fts*1000:8.2f}ms {len(results_fts):3d} results") - print(f" Semantic: {time_semantic*1000:8.2f}ms {len(results_semantic):3d} results") - print(f" Speedup: {time_semantic/time_fts:6.2f}x (FTS faster)") - - -def main(): - """Main comparison entry point.""" - queries = [ - "vector search", - "LSP call hierarchy", - "search ranking", - "index building", - ] - - print("Search Method Comparison") - print("=" * 80) - - for query in queries: - print_comparison(query) - - print(f"\n{'='*80}") - print("Comparison complete") - print(f"{'='*80}") - - -if __name__ == "__main__": - main() diff --git a/codex-lens/examples/test_lsp_capabilities.py b/codex-lens/examples/test_lsp_capabilities.py deleted file mode 100644 index a8ea4c51..00000000 --- a/codex-lens/examples/test_lsp_capabilities.py +++ /dev/null @@ -1,79 +0,0 @@ -"""Test LSP server capabilities.""" - -import asyncio -import json -from pathlib import Path -from codexlens.lsp.standalone_manager import StandaloneLspManager - -async def test_capabilities(): - """Test what capabilities Pyright provides.""" - - workspace_root = Path("D:/Claude_dms3/codex-lens/src") - - print("Testing LSP Capabilities") - print("="*80) - - # Create LSP manager - manager = StandaloneLspManager( - workspace_root=str(workspace_root), - timeout=10.0, - ) - - try: - # Start LSP manager - print("\n1. 
Starting LSP manager...") - await manager.start() - print(" [OK] LSP manager started") - - # Get server state for Python - print("\n2. Getting Python server state...") - test_file = str(workspace_root / "codexlens/search/hybrid_search.py") - state = await manager._get_server(test_file) - - if not state: - print(" [ERROR] Could not get server state!") - return - - print(f" [OK] Server state obtained") - print(f" Initialized: {state.initialized}") - - # Print capabilities - print("\n3. Server Capabilities:") - print("-"*80) - caps = state.capabilities - - # Key capabilities to check - important_caps = [ - "callHierarchyProvider", - "definitionProvider", - "referencesProvider", - "documentSymbolProvider", - "workspaceSymbolProvider", - "hoverProvider", - "completionProvider", - "signatureHelpProvider", - ] - - for cap in important_caps: - value = caps.get(cap) - status = "[YES]" if value else "[NO]" - print(f" {status} {cap}: {value}") - - # Print all capabilities as JSON for reference - print("\n4. Full capabilities (formatted):") - print("-"*80) - print(json.dumps(caps, indent=2)) - - except Exception as e: - print(f"\n[ERROR] Error: {e}") - import traceback - traceback.print_exc() - - finally: - # Cleanup - print("\n5. 
Cleaning up...") - await manager.stop() - print(" [OK] LSP manager stopped") - -if __name__ == "__main__": - asyncio.run(test_capabilities()) diff --git a/codex-lens/examples/test_lsp_references.py b/codex-lens/examples/test_lsp_references.py deleted file mode 100644 index 2ce470af..00000000 --- a/codex-lens/examples/test_lsp_references.py +++ /dev/null @@ -1,76 +0,0 @@ -"""Test LSP references as alternative to call hierarchy.""" - -import asyncio -from pathlib import Path -from codexlens.lsp.standalone_manager import StandaloneLspManager - -async def test_references(): - """Test using references as alternative to call hierarchy.""" - - workspace_root = Path("D:/Claude_dms3/codex-lens") - test_file = workspace_root / "test_simple_function.py" - - print("Testing LSP References (Alternative)") - print("="*80) - - manager = StandaloneLspManager( - workspace_root=str(workspace_root), - timeout=30.0, - ) - - try: - print("\n1. Starting LSP manager...") - await manager.start() - print(" [OK] Started") - - # Wait for analysis - await asyncio.sleep(2) - - # Test references for hello_world function - print("\n2. Testing references for 'hello_world' (line 4)...") - refs = await manager.get_references( - file_path=str(test_file), - line=4, - character=5, - include_declaration=True, - ) - print(f" Found: {len(refs)} references") - for ref in refs[:5]: - uri = ref.get('uri', '') - range_obj = ref.get('range', {}) - start = range_obj.get('start', {}) - print(f" - {uri.split('/')[-1]}:{start.get('line', 0)+1}") - - # Test definition - print("\n3. Testing definition for 'hello_world' call (line 13)...") - defs = await manager.get_definition( - file_path=str(test_file), - line=13, - character=11, - ) - print(f" Found: {len(defs)} definitions") - for d in defs: - uri = d.get('uri', '') - range_obj = d.get('range', {}) - start = range_obj.get('start', {}) - print(f" - {uri.split('/')[-1]}:{start.get('line', 0)+1}") - - # Test document symbols - print("\n4. 
Testing document symbols...") - symbols = await manager.get_document_symbols(str(test_file)) - print(f" Found: {len(symbols)} symbols") - for sym in symbols: - print(f" - {sym.get('name')} ({sym.get('kind')})") - - except Exception as e: - print(f"\n[ERROR] {e}") - import traceback - traceback.print_exc() - - finally: - print("\n5. Cleanup...") - await manager.stop() - print(" [OK] Done") - -if __name__ == "__main__": - asyncio.run(test_references()) diff --git a/codex-lens/examples/test_lsp_tree.py b/codex-lens/examples/test_lsp_tree.py deleted file mode 100644 index 30b593ef..00000000 --- a/codex-lens/examples/test_lsp_tree.py +++ /dev/null @@ -1,92 +0,0 @@ -"""Test LSP Association Tree building directly.""" - -import asyncio -from pathlib import Path -from codexlens.lsp.standalone_manager import StandaloneLspManager -from codexlens.search.association_tree import AssociationTreeBuilder - -async def test_lsp_tree(): - """Test building LSP association tree for a known Python file.""" - - # Setup - use simple test file - workspace_root = Path("D:/Claude_dms3/codex-lens") - test_file = "test_simple_function.py" - test_line = 11 # main() function definition (1-based) - test_char = 5 # Points to 'm' in 'main' (1-based, becomes 4 in 0-based) - - print(f"Testing LSP tree for: {test_file}:{test_line}") - print("="*80) - - # Create LSP manager - manager = StandaloneLspManager( - workspace_root=str(workspace_root), - timeout=10.0, - ) - - try: - # Start LSP manager - print("\n1. Starting LSP manager...") - await manager.start() - print(" [OK] LSP manager started") - - # Test get_call_hierarchy_items directly - print(f"\n2. Testing get_call_hierarchy_items for {test_file}:{test_line}:{test_char}...") - items = await manager.get_call_hierarchy_items( - file_path=str(workspace_root / test_file), - line=test_line, - character=test_char, - ) - print(f" Result: {len(items)} items") - if items: - for i, item in enumerate(items, 1): - print(f" {i}. 
{item.get('name')} ({item.get('kind')})") - print(f" URI: {item.get('uri')}") - print(f" Range: {item.get('range')}") - else: - print(" [WARN] No call hierarchy items returned!") - print(" This means either:") - print(" - The file/line doesn't contain a symbol") - print(" - LSP server doesn't support call hierarchy") - print(" - Pyright isn't running correctly") - - # If we got items, try building a tree - if items: - print(f"\n3. Building association tree...") - builder = AssociationTreeBuilder( - lsp_manager=manager, - timeout=10.0, - ) - - tree = await builder.build_tree( - seed_file_path=str(workspace_root / test_file), - seed_line=test_line, - seed_character=test_char, - max_depth=2, - expand_callers=True, - expand_callees=True, - ) - - print(f" Tree built successfully!") - print(f" - Roots: {len(tree.roots)}") - print(f" - Total nodes: {len(tree.node_list)}") - print(f" - Depth reached: {tree.depth_reached}") - - if tree.node_list: - print(f"\n First 5 nodes:") - for i, node in enumerate(tree.node_list[:5], 1): - print(f" {i}. {node.item.name} @ {node.item.file_path}:{node.item.range.start_line}") - print(f" Depth: {node.depth}, Is cycle: {node.is_cycle}") - - except Exception as e: - print(f"\n[ERROR] Error: {e}") - import traceback - traceback.print_exc() - - finally: - # Cleanup - print("\n4. 
Cleaning up...") - await manager.stop() - print(" [OK] LSP manager stopped") - -if __name__ == "__main__": - asyncio.run(test_lsp_tree()) diff --git a/codex-lens/examples/test_raw_lsp.py b/codex-lens/examples/test_raw_lsp.py deleted file mode 100644 index 18bfdc26..00000000 --- a/codex-lens/examples/test_raw_lsp.py +++ /dev/null @@ -1,104 +0,0 @@ -"""Raw LSP test with debug logging.""" - -import asyncio -import json -import logging -from pathlib import Path -from codexlens.lsp.standalone_manager import StandaloneLspManager - -# Enable debug logging -logging.basicConfig(level=logging.DEBUG) -logger = logging.getLogger("codexlens.lsp") -logger.setLevel(logging.DEBUG) - -async def test_raw_lsp(): - """Test LSP with debug logging enabled.""" - - workspace_root = Path("D:/Claude_dms3/codex-lens") - test_file = workspace_root / "test_simple_function.py" - - print("Testing Raw LSP Call Hierarchy") - print("="*80) - - # Create LSP manager - manager = StandaloneLspManager( - workspace_root=str(workspace_root), - timeout=30.0, - ) - - try: - # Start LSP manager - print("\n1. Starting LSP manager...") - await manager.start() - print(" [OK] Started") - - # Get server state - state = await manager._get_server(str(test_file)) - if not state: - print(" [ERROR] No server state!") - return - - print(f" Server initialized: {state.initialized}") - print(f" Call hierarchy supported: {state.capabilities.get('callHierarchyProvider')}") - - # Open document - print("\n2. Opening document...") - await manager._open_document(state, str(test_file)) - print(" [OK] Document opened") - - # Wait a bit for Pyright to analyze - print("\n3. Waiting for analysis...") - await asyncio.sleep(2) - print(" [OK] Waited 2 seconds") - - # Try call hierarchy on main function (line 12) - print("\n4. 
Sending prepareCallHierarchy request...") - - # Direct request using _send_request - params = { - "textDocument": {"uri": test_file.as_uri()}, - "position": {"line": 11, "character": 4} # 0-indexed, "main" function - } - print(f" Params: {json.dumps(params, indent=2)}") - - result = await manager._send_request( - state, - "textDocument/prepareCallHierarchy", - params, - ) - - print(f"\n5. Result: {result}") - print(f" Type: {type(result)}") - - if result: - print(f" Items: {len(result)}") - for item in result: - print(f" - {item.get('name')}") - else: - print(" [WARN] No items returned") - print(" This could mean:") - print(" - Position doesn't point to a symbol") - print(" - Pyright hasn't finished analyzing") - print(" - Some other issue") - - # Try with the higher-level API - print("\n6. Testing with get_call_hierarchy_items API...") - items = await manager.get_call_hierarchy_items( - file_path=str(test_file), - line=12, - character=5, - ) - print(f" Result: {len(items)} items") - - except Exception as e: - print(f"\n[ERROR] Error: {e}") - import traceback - traceback.print_exc() - - finally: - print("\n7. 
Cleanup...") - await manager.stop() - print(" [OK] Done") - -if __name__ == "__main__": - asyncio.run(test_raw_lsp()) diff --git a/codex-lens/examples/test_raw_response.py b/codex-lens/examples/test_raw_response.py deleted file mode 100644 index d5f2165f..00000000 --- a/codex-lens/examples/test_raw_response.py +++ /dev/null @@ -1,96 +0,0 @@ -"""Test to see raw LSP response.""" - -import asyncio -import json -import logging -from pathlib import Path - -# Patch the _process_messages to log the full response -async def patched_process_messages(self, language_id: str): - """Patched version that logs full response.""" - from codexlens.lsp.standalone_manager import logger - - state = self._servers.get(language_id) - if not state: - return - - try: - while True: - message = await state.message_queue.get() - msg_id = message.get("id") - method = message.get("method", "") - - # Log FULL message for debugging - if msg_id is not None and not method: - print(f"\n>>> FULL RESPONSE (id={msg_id}):") - print(json.dumps(message, indent=2)) - - # Response handling - if msg_id is not None and not method: - if msg_id in state.pending_requests: - future = state.pending_requests.pop(msg_id) - if "error" in message: - print(f">>> ERROR in response: {message['error']}") - future.set_exception( - Exception(message["error"].get("message", "Unknown error")) - ) - else: - print(f">>> Result: {message.get('result')}") - future.set_result(message.get("result")) - else: - print(f">>> No pending request for id={msg_id}") - - elif msg_id is not None and method: - await self._handle_server_request(state, message) - - elif method: - pass # Skip notifications - - state.message_queue.task_done() - - except asyncio.CancelledError: - pass - -async def test_raw(): - from codexlens.lsp.standalone_manager import StandaloneLspManager - - workspace_root = Path("D:/Claude_dms3/codex-lens") - test_file = workspace_root / "test_simple_function.py" - - manager = 
StandaloneLspManager(workspace_root=str(workspace_root), timeout=30.0) - - # Monkey-patch the method - import types - manager._process_messages = types.MethodType(patched_process_messages, manager) - - try: - print("Starting LSP...") - await manager.start() - - state = await manager._get_server(str(test_file)) - await manager._open_document(state, str(test_file)) - await asyncio.sleep(2) - - print("\nSending prepareCallHierarchy request...") - uri = test_file.resolve().as_uri() - params = { - "textDocument": {"uri": uri}, - "position": {"line": 11, "character": 4} - } - - # Need to restart the message processor with our patched version - # Actually, the original is already running. Let's just send and see logs. - - result = await manager._send_request( - state, - "textDocument/prepareCallHierarchy", - params - ) - - print(f"\nFinal result: {result}") - - finally: - await manager.stop() - -if __name__ == "__main__": - asyncio.run(test_raw()) diff --git a/codex-lens/examples/test_simple_call_hierarchy.py b/codex-lens/examples/test_simple_call_hierarchy.py deleted file mode 100644 index 8ecdfea8..00000000 --- a/codex-lens/examples/test_simple_call_hierarchy.py +++ /dev/null @@ -1,87 +0,0 @@ -"""Test call hierarchy on a simple Python file.""" - -import asyncio -from pathlib import Path -from codexlens.lsp.standalone_manager import StandaloneLspManager - -async def test_simple_call_hierarchy(): - """Test call hierarchy on test_simple_function.py.""" - - workspace_root = Path("D:/Claude_dms3/codex-lens") - test_file = workspace_root / "test_simple_function.py" - - print("Testing Call Hierarchy on Simple Function") - print("="*80) - print(f"File: {test_file}") - - # Create LSP manager - manager = StandaloneLspManager( - workspace_root=str(workspace_root), - timeout=10.0, - ) - - try: - # Start LSP manager - print("\n1. 
Starting LSP manager...") - await manager.start() - print(" [OK] LSP manager started") - - # Test different function positions - test_cases = [ - ("hello_world", 4, 5, "def hello_world():"), - ("greet", 8, 5, "def greet(name: str):"), - ("main", 12, 5, "def main():"), - ] - - for func_name, line, char, expected in test_cases: - print(f"\n2. Testing {func_name} at line {line}:") - print(f" Expected: {expected}") - - items = await manager.get_call_hierarchy_items( - file_path=str(test_file), - line=line, - character=char, - ) - - print(f" Result: {len(items)} items") - if items: - for i, item in enumerate(items, 1): - print(f" {i}. Name: {item.get('name')}") - print(f" Kind: {item.get('kind')}") - print(f" URI: {item.get('uri')}") - range_obj = item.get('range', {}) - start = range_obj.get('start', {}) - print(f" Line: {start.get('line', 0) + 1}") - - # If we got items, try getting incoming/outgoing calls - print(f"\n Testing incoming/outgoing calls for {func_name}:") - first_item = items[0] - - incoming = await manager.get_incoming_calls(first_item) - print(f" - Incoming calls: {len(incoming)}") - for call in incoming: - caller = call.get('from', {}) - print(f" Called by: {caller.get('name')}") - - outgoing = await manager.get_outgoing_calls(first_item) - print(f" - Outgoing calls: {len(outgoing)}") - for call in outgoing: - callee = call.get('to', {}) - print(f" Calls: {callee.get('name')}") - - else: - print(f" [WARN] No call hierarchy items for {func_name}!") - - except Exception as e: - print(f"\n[ERROR] Error: {e}") - import traceback - traceback.print_exc() - - finally: - # Cleanup - print("\n3. 
Cleaning up...") - await manager.stop() - print(" [OK] LSP manager stopped") - -if __name__ == "__main__": - asyncio.run(test_simple_call_hierarchy()) diff --git a/codex-lens/examples/test_uri_consistency.py b/codex-lens/examples/test_uri_consistency.py deleted file mode 100644 index 710f810c..00000000 --- a/codex-lens/examples/test_uri_consistency.py +++ /dev/null @@ -1,98 +0,0 @@ -"""Test if URI inconsistency causes the issue.""" - -import asyncio -import json -from pathlib import Path -from codexlens.lsp.standalone_manager import StandaloneLspManager - -async def test_with_consistent_uri(): - """Test prepareCallHierarchy with different URI formats.""" - - workspace_root = Path("D:/Claude_dms3/codex-lens") - test_file = workspace_root / "test_simple_function.py" - resolved = test_file.resolve() - - print("Testing URI Consistency") - print("="*80) - - # Different URI formats to try - uri_standard = resolved.as_uri() - uri_lowercase = uri_standard.replace("file:///D:", "file:///d:") - - print(f"Standard URI: {uri_standard}") - print(f"Lowercase URI: {uri_lowercase}") - - manager = StandaloneLspManager( - workspace_root=str(workspace_root), - timeout=30.0, - ) - - try: - print("\n1. Starting LSP manager...") - await manager.start() - - state = await manager._get_server(str(test_file)) - if not state: - print(" [ERROR] No server state") - return - - print(" [OK] Server ready") - - # Open document - print("\n2. Opening document...") - await manager._open_document(state, str(test_file)) - await asyncio.sleep(2) - print(" [OK] Document opened, waited 2s") - - # Test 1: Standard URI (as_uri) - print("\n3. Test with standard URI...") - params1 = { - "textDocument": {"uri": uri_standard}, - "position": {"line": 11, "character": 4} # main function - } - print(f" Params: {json.dumps(params1)}") - result1 = await manager._send_request(state, "textDocument/prepareCallHierarchy", params1) - print(f" Result: {result1}") - - # Test 2: Lowercase drive letter - print("\n4. 
Test with lowercase drive letter URI...") - params2 = { - "textDocument": {"uri": uri_lowercase}, - "position": {"line": 11, "character": 4} - } - print(f" Params: {json.dumps(params2)}") - result2 = await manager._send_request(state, "textDocument/prepareCallHierarchy", params2) - print(f" Result: {result2}") - - # Test 3: Position at function name start - print("\n5. Test with position at 'def' keyword (char 0)...") - params3 = { - "textDocument": {"uri": uri_lowercase}, - "position": {"line": 11, "character": 0} - } - result3 = await manager._send_request(state, "textDocument/prepareCallHierarchy", params3) - print(f" Result: {result3}") - - # Test 4: Different positions on line 12 (1-indexed = line 11 0-indexed) - print("\n6. Testing different character positions on 'def main():'...") - for char in [0, 4, 5, 6, 7, 8]: - params = { - "textDocument": {"uri": uri_lowercase}, - "position": {"line": 11, "character": char} - } - result = await manager._send_request(state, "textDocument/prepareCallHierarchy", params) - status = "OK" if result else "None" - print(f" char={char}: {status} - {result[:1] if result else '[]'}") - - except Exception as e: - print(f"\n[ERROR] {e}") - import traceback - traceback.print_exc() - - finally: - print("\n7. 
Cleanup...") - await manager.stop() - print(" [OK]") - -if __name__ == "__main__": - asyncio.run(test_with_consistent_uri()) diff --git a/codex-lens/examples/test_wait_for_analysis.py b/codex-lens/examples/test_wait_for_analysis.py deleted file mode 100644 index bba6af23..00000000 --- a/codex-lens/examples/test_wait_for_analysis.py +++ /dev/null @@ -1,99 +0,0 @@ -"""Test with longer wait time for Pyright analysis.""" - -import asyncio -import json -from pathlib import Path -from codexlens.lsp.standalone_manager import StandaloneLspManager - -async def test_with_wait(): - """Test prepareCallHierarchy with longer wait for analysis.""" - - workspace_root = Path("D:/Claude_dms3/codex-lens") - test_file = workspace_root / "test_simple_function.py" - - print("Testing with Wait for Analysis") - print("="*80) - - manager = StandaloneLspManager( - workspace_root=str(workspace_root), - timeout=30.0, - ) - - try: - print("\n1. Starting LSP manager...") - await manager.start() - - state = await manager._get_server(str(test_file)) - if not state: - print(" [ERROR] No server state") - return - - print(" [OK] Server ready") - print(f" Workspace: {manager.workspace_root}") - - # Open document - print("\n2. Opening document...") - await manager._open_document(state, str(test_file)) - print(" [OK] Document opened") - - # Wait longer for analysis - print("\n3. Waiting for Pyright to analyze (5 seconds)...") - await asyncio.sleep(5) - print(" [OK] Wait complete") - - # Check diagnostics first to verify file is analyzed - print("\n4. 
Checking if document symbols work (to verify analysis)...") - symbols = await manager._send_request( - state, - "textDocument/documentSymbol", - {"textDocument": {"uri": test_file.resolve().as_uri()}} - ) - if symbols: - print(f" [OK] Found {len(symbols)} symbols:") - for s in symbols: - name = s.get('name', 'unknown') - kind = s.get('kind', 0) - range_info = s.get('range', {}).get('start', {}) - line = range_info.get('line', 0) + 1 - print(f" - {name} (kind={kind}) at line {line}") - else: - print(" [WARN] No symbols found!") - - # Now try call hierarchy on different lines - print("\n5. Testing prepareCallHierarchy on each symbol...") - if symbols: - for s in symbols: - name = s.get('name', 'unknown') - range_info = s.get('range', {}).get('start', {}) - line = range_info.get('line', 0) - char = range_info.get('character', 0) - - params = { - "textDocument": {"uri": test_file.resolve().as_uri()}, - "position": {"line": line, "character": char + 4} # offset into name - } - - result = await manager._send_request( - state, - "textDocument/prepareCallHierarchy", - params - ) - - status = f"[OK] {len(result)} items" if result else "[NONE]" - print(f" {name} (line {line+1}, char {char+4}): {status}") - if result: - for item in result: - print(f" - {item.get('name')}") - - except Exception as e: - print(f"\n[ERROR] {e}") - import traceback - traceback.print_exc() - - finally: - print("\n6. 
Cleanup...") - await manager.stop() - print(" [OK]") - -if __name__ == "__main__": - asyncio.run(test_with_wait()) diff --git a/codex-lens/lsp-servers.json b/codex-lens/lsp-servers.json deleted file mode 100644 index 4120d60d..00000000 --- a/codex-lens/lsp-servers.json +++ /dev/null @@ -1,88 +0,0 @@ -{ - "$schema": "https://json-schema.org/draft/2020-12/schema", - "version": "1.0.0", - "description": "Language Server configuration for codex-lens standalone LSP client", - "servers": [ - { - "languageId": "python", - "displayName": "Pyright", - "extensions": ["py", "pyi"], - "command": ["pyright-langserver", "--stdio"], - "enabled": true, - "initializationOptions": { - "pythonPath": "", - "pythonPlatform": "", - "pythonVersion": "3.13" - }, - "settings": { - "python.analysis": { - "typeCheckingMode": "standard", - "diagnosticMode": "workspace", - "exclude": ["**/node_modules", "**/__pycache__", "build", "dist"], - "include": ["src/**", "tests/**"], - "stubPath": "typings" - } - } - }, - { - "languageId": "typescript", - "displayName": "TypeScript Language Server", - "extensions": ["ts", "tsx"], - "command": ["typescript-language-server.cmd", "--stdio"], - "enabled": true, - "initializationOptions": {}, - "settings": {} - }, - { - "languageId": "javascript", - "displayName": "TypeScript Language Server (for JS)", - "extensions": ["js", "jsx", "mjs", "cjs"], - "command": ["typescript-language-server.cmd", "--stdio"], - "enabled": true, - "initializationOptions": {}, - "settings": {} - }, - { - "languageId": "go", - "displayName": "Gopls", - "extensions": ["go"], - "command": ["gopls", "serve"], - "enabled": true, - "initializationOptions": {}, - "settings": {} - }, - { - "languageId": "rust", - "displayName": "Rust Analyzer", - "extensions": ["rs"], - "command": ["rust-analyzer"], - "enabled": false, - "initializationOptions": {}, - "settings": {} - }, - { - "languageId": "c", - "displayName": "Clangd", - "extensions": ["c", "h"], - "command": ["clangd"], - "enabled": 
false, - "initializationOptions": {}, - "settings": {} - }, - { - "languageId": "cpp", - "displayName": "Clangd", - "extensions": ["cpp", "hpp", "cc", "cxx"], - "command": ["clangd"], - "enabled": false, - "initializationOptions": {}, - "settings": {} - } - ], - "defaults": { - "rootDir": ".", - "timeout": 30000, - "restartInterval": 5000, - "maxRestarts": 3 - } -} diff --git a/codex-lens/misleading_test.db b/codex-lens/misleading_test.db deleted file mode 100644 index 42aa2fa6..00000000 Binary files a/codex-lens/misleading_test.db and /dev/null differ diff --git a/codex-lens/pyproject.toml b/codex-lens/pyproject.toml deleted file mode 100644 index 71dd763a..00000000 --- a/codex-lens/pyproject.toml +++ /dev/null @@ -1,127 +0,0 @@ -[build-system] -requires = ["setuptools>=61.0"] -build-backend = "setuptools.build_meta" - -[project] -name = "codex-lens" -version = "0.1.0" -description = "CodexLens multi-modal code analysis platform" -readme = "README.md" -requires-python = ">=3.10" -license = "MIT" -authors = [ - { name = "CodexLens contributors" } -] -dependencies = [ - "typer~=0.9.0", - "click>=8.0.0,<9", - "rich~=13.0.0", - "pydantic~=2.0.0", - "tree-sitter~=0.20.0", - "tree-sitter-python~=0.25.0", - "tree-sitter-javascript~=0.25.0", - "tree-sitter-typescript~=0.23.0", - "pathspec~=0.11.0", - "watchdog~=3.0.0", - # ast-grep for pattern-based AST matching (PyO3 bindings) - # ast-grep-py 0.40+ supports Python 3.13 - "ast-grep-py~=0.40.0", -] - -[project.optional-dependencies] -# Semantic search using fastembed (ONNX-based, lightweight ~200MB) -semantic = [ - "numpy~=1.26.0", - "fastembed~=0.2.1", - "hnswlib~=0.8.0", -] - -# GPU acceleration for semantic search (NVIDIA CUDA) -# Install with: pip install codexlens[semantic-gpu] -semantic-gpu = [ - "numpy~=1.26.0", - "fastembed~=0.2.1", - "hnswlib~=0.8.0", - "onnxruntime-gpu~=1.15.0", # CUDA support -] - -# GPU acceleration for Windows (DirectML - supports NVIDIA/AMD/Intel) -# Install with: pip install 
codexlens[semantic-directml] -semantic-directml = [ - "numpy~=1.26.0", - "fastembed~=0.2.1", - "hnswlib~=0.8.0", - "onnxruntime-directml~=1.15.0", # DirectML support -] - -# Cross-encoder reranking (second-stage, optional) -# Install with: pip install codexlens[reranker] (default: ONNX backend) -reranker-onnx = [ - "optimum[onnxruntime]~=2.1.0", - "onnxruntime~=1.23.0", - "transformers~=4.53.0", -] - -# Remote reranking via HTTP API -reranker-api = [ - "httpx~=0.25.0", -] - -# LLM-based reranking via ccw-litellm -reranker-litellm = [ - "ccw-litellm~=0.1.0", -] - -# Legacy sentence-transformers CrossEncoder reranker -reranker-legacy = [ - "sentence-transformers~=2.2.0", -] - -# Backward-compatible alias for default reranker backend -reranker = [ - "optimum[onnxruntime]~=2.1.0", - "onnxruntime~=1.23.0", - "transformers~=4.53.0", -] - -# Encoding detection for non-UTF8 files -encoding = [ - "chardet~=5.0.0", -] - -# Clustering for staged hybrid search (HDBSCAN + sklearn) -clustering = [ - "hdbscan~=0.8.1", - "scikit-learn~=1.3.0", -] - -# Full features including tiktoken for accurate token counting -full = [ - "tiktoken~=0.5.0", -] - -# Language Server Protocol support -lsp = [ - "pygls~=1.3.0", -] - -[project.scripts] -codexlens-lsp = "codexlens.lsp.server:main" - -[project.urls] -Homepage = "https://github.com/openai/codex-lens" - -[tool.setuptools] -package-dir = { "" = "src" } - -[tool.setuptools.package-data] -"codexlens.lsp" = ["lsp-servers.json"] - -[tool.pytest.ini_options] -markers = [ - "integration: marks tests that exercise broader end-to-end or dependency-heavy flows", -] -filterwarnings = [ - "ignore:'BaseCommand' is deprecated and will be removed in Click 9.0.*:DeprecationWarning", - "ignore:The '__version__' attribute is deprecated and will be removed in Click 9.1.*:DeprecationWarning", -] diff --git a/codex-lens/requirements.in b/codex-lens/requirements.in deleted file mode 100644 index 23638436..00000000 --- a/codex-lens/requirements.in +++ /dev/null 
@@ -1,22 +0,0 @@ -# Core dependencies for codex-lens -# This file tracks direct dependencies only -# Run: pip-compile requirements.in --output-file=requirements.txt - -typer~=0.9.0 -rich~=13.0.0 -pydantic~=2.0.0 -tree-sitter~=0.20.0 -tree-sitter-python~=0.25.0 -tree-sitter-javascript~=0.25.0 -tree-sitter-typescript~=0.23.0 -pathspec~=0.11.0 -watchdog~=3.0.0 -ast-grep-py~=0.40.0 - -# Semantic search dependencies -numpy~=1.24.0 -fastembed~=0.2.0 -hnswlib~=0.8.0 - -# LSP support -pygls~=1.3.0 diff --git a/codex-lens/scripts/bootstrap_reranker_local.py b/codex-lens/scripts/bootstrap_reranker_local.py deleted file mode 100644 index 7cc1d15e..00000000 --- a/codex-lens/scripts/bootstrap_reranker_local.py +++ /dev/null @@ -1,340 +0,0 @@ -#!/usr/bin/env python3 -"""Bootstrap a local-only ONNX reranker environment for CodexLens. - -This script defaults to dry-run output so it can be used as a reproducible -bootstrap manifest. When `--apply` is passed, it installs pinned reranker -packages into the selected virtual environment and can optionally pre-download -the ONNX reranker model into a repo-local Hugging Face cache. 
- -Examples: - python scripts/bootstrap_reranker_local.py --dry-run - python scripts/bootstrap_reranker_local.py --apply --download-model - python scripts/bootstrap_reranker_local.py --venv .venv --model Xenova/ms-marco-MiniLM-L-12-v2 -""" - -from __future__ import annotations - -import argparse -import os -import shlex -import subprocess -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Iterable - - -PROJECT_ROOT = Path(__file__).resolve().parents[1] -MANIFEST_PATH = Path(__file__).with_name("requirements-reranker-local.txt") -DEFAULT_MODEL = "Xenova/ms-marco-MiniLM-L-6-v2" -DEFAULT_HF_HOME = PROJECT_ROOT / ".cache" / "huggingface" - -STEP_NOTES = { - "runtime": "Install the local ONNX runtime first so optimum/transformers do not backtrack over runtime wheels.", - "hf-stack": "Pin the Hugging Face stack used by the ONNX reranker backend.", -} - - -@dataclass(frozen=True) -class RequirementStep: - name: str - packages: tuple[str, ...] - - -def _normalize_venv_path(raw_path: str | Path) -> Path: - return (Path(raw_path) if raw_path else PROJECT_ROOT / ".venv").expanduser().resolve() - - -def _venv_python(venv_path: Path) -> Path: - if os.name == "nt": - return venv_path / "Scripts" / "python.exe" - return venv_path / "bin" / "python" - - -def _venv_huggingface_cli(venv_path: Path) -> Path: - if os.name == "nt": - preferred = venv_path / "Scripts" / "hf.exe" - return preferred if preferred.exists() else venv_path / "Scripts" / "huggingface-cli.exe" - preferred = venv_path / "bin" / "hf" - return preferred if preferred.exists() else venv_path / "bin" / "huggingface-cli" - - -def _default_shell() -> str: - return "powershell" if os.name == "nt" else "bash" - - -def _shell_quote(value: str, shell: str) -> str: - if shell == "bash": - return shlex.quote(value) - return "'" + value.replace("'", "''") + "'" - - -def _format_command(parts: Iterable[str], shell: str) -> str: - return " ".join(_shell_quote(str(part), shell) for part 
in parts) - - -def _format_set_env(name: str, value: str, shell: str) -> str: - quoted_value = _shell_quote(value, shell) - if shell == "bash": - return f"export {name}={quoted_value}" - return f"$env:{name} = {quoted_value}" - - -def _model_local_dir(hf_home: Path, model_name: str) -> Path: - slug = model_name.replace("/", "--") - return hf_home / "models" / slug - - -def _parse_manifest(manifest_path: Path) -> list[RequirementStep]: - current_name: str | None = None - current_packages: list[str] = [] - steps: list[RequirementStep] = [] - - for raw_line in manifest_path.read_text(encoding="utf-8").splitlines(): - line = raw_line.strip() - if not line: - continue - - if line.startswith("# [") and line.endswith("]"): - if current_name and current_packages: - steps.append(RequirementStep(current_name, tuple(current_packages))) - current_name = line[3:-1] - current_packages = [] - continue - - if line.startswith("#"): - continue - - if current_name is None: - raise ValueError(f"Package entry found before a section header in {manifest_path}") - current_packages.append(line) - - if current_name and current_packages: - steps.append(RequirementStep(current_name, tuple(current_packages))) - - if not steps: - raise ValueError(f"No requirement steps found in {manifest_path}") - return steps - - -def _pip_install_command(python_path: Path, packages: Iterable[str]) -> list[str]: - return [ - str(python_path), - "-m", - "pip", - "install", - "--upgrade", - "--disable-pip-version-check", - "--upgrade-strategy", - "only-if-needed", - "--only-binary=:all:", - *packages, - ] - - -def _probe_command(python_path: Path) -> list[str]: - return [ - str(python_path), - "-c", - ( - "from codexlens.semantic.reranker.factory import check_reranker_available; " - "print(check_reranker_available('onnx'))" - ), - ] - - -def _download_command(huggingface_cli: Path, model_name: str, model_dir: Path) -> list[str]: - return [ - str(huggingface_cli), - "download", - model_name, - "--local-dir", - 
str(model_dir), - ] - - -def _print_plan( - shell: str, - venv_path: Path, - python_path: Path, - huggingface_cli: Path, - manifest_path: Path, - steps: list[RequirementStep], - model_name: str, - hf_home: Path, -) -> None: - model_dir = _model_local_dir(hf_home, model_name) - - print("CodexLens local reranker bootstrap") - print(f"manifest: {manifest_path}") - print(f"target_venv: {venv_path}") - print(f"target_python: {python_path}") - print(f"backend: onnx") - print(f"model: {model_name}") - print(f"hf_home: {hf_home}") - print("mode: dry-run") - print("notes:") - print("- Uses only the selected venv Python; no global pip commands are emitted.") - print("- Targets the local ONNX reranker backend only; no API or LiteLLM providers are involved.") - print("") - print("pinned_steps:") - for step in steps: - print(f"- {step.name}: {', '.join(step.packages)}") - note = STEP_NOTES.get(step.name) - if note: - print(f" note: {note}") - print("") - print("commands:") - print( - "1. " - + _format_command( - [ - str(python_path), - "-m", - "pip", - "install", - "--upgrade", - "pip", - "setuptools", - "wheel", - ], - shell, - ) - ) - command_index = 2 - for step in steps: - print(f"{command_index}. " + _format_command(_pip_install_command(python_path, step.packages), shell)) - command_index += 1 - print(f"{command_index}. " + _format_set_env("HF_HOME", str(hf_home), shell)) - command_index += 1 - print(f"{command_index}. " + _format_command(_download_command(huggingface_cli, model_name, model_dir), shell)) - command_index += 1 - print(f"{command_index}. 
" + _format_command(_probe_command(python_path), shell)) - print("") - print("optional_runtime_env:") - print(_format_set_env("RERANKER_BACKEND", "onnx", shell)) - print(_format_set_env("RERANKER_MODEL", str(model_dir), shell)) - print(_format_set_env("HF_HOME", str(hf_home), shell)) - - -def _run_command(command: list[str], *, env: dict[str, str] | None = None) -> None: - command_env = os.environ.copy() - if env: - command_env.update(env) - command_env.setdefault("PYTHONUTF8", "1") - command_env.setdefault("PYTHONIOENCODING", "utf-8") - subprocess.run(command, check=True, env=command_env) - - -def main() -> int: - parser = argparse.ArgumentParser( - description="Bootstrap pinned local-only ONNX reranker dependencies for a CodexLens virtual environment.", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=__doc__, - ) - parser.add_argument( - "--venv", - type=Path, - default=PROJECT_ROOT / ".venv", - help="Path to the CodexLens virtual environment (default: ./.venv under codex-lens).", - ) - parser.add_argument( - "--model", - default=DEFAULT_MODEL, - help=f"Model repo to pre-download for local reranking (default: {DEFAULT_MODEL}).", - ) - parser.add_argument( - "--hf-home", - type=Path, - default=DEFAULT_HF_HOME, - help="Repo-local Hugging Face cache directory used for optional model downloads.", - ) - parser.add_argument( - "--shell", - choices=("powershell", "bash"), - default=_default_shell(), - help="Shell syntax to use when rendering dry-run commands.", - ) - parser.add_argument( - "--apply", - action="store_true", - help="Execute the pinned install steps against the selected virtual environment.", - ) - parser.add_argument( - "--download-model", - action="store_true", - help="When used with --apply, pre-download the model into the configured HF_HOME directory.", - ) - parser.add_argument( - "--probe", - action="store_true", - help="When used with --apply, run a small reranker availability probe at the end.", - ) - parser.add_argument( - 
"--dry-run", - action="store_true", - help="Print the deterministic bootstrap plan. This is also the default when --apply is omitted.", - ) - - args = parser.parse_args() - - steps = _parse_manifest(MANIFEST_PATH) - venv_path = _normalize_venv_path(args.venv) - python_path = _venv_python(venv_path) - huggingface_cli = _venv_huggingface_cli(venv_path) - hf_home = args.hf_home.expanduser().resolve() - - if not args.apply: - _print_plan( - shell=args.shell, - venv_path=venv_path, - python_path=python_path, - huggingface_cli=huggingface_cli, - manifest_path=MANIFEST_PATH, - steps=steps, - model_name=args.model, - hf_home=hf_home, - ) - return 0 - - if not python_path.exists(): - print(f"Target venv Python not found: {python_path}", file=sys.stderr) - return 1 - - _run_command( - [ - str(python_path), - "-m", - "pip", - "install", - "--upgrade", - "pip", - "setuptools", - "wheel", - ] - ) - for step in steps: - _run_command(_pip_install_command(python_path, step.packages)) - - if args.download_model: - if not huggingface_cli.exists(): - print(f"Expected venv-local Hugging Face CLI not found: {huggingface_cli}", file=sys.stderr) - return 1 - download_env = os.environ.copy() - download_env["HF_HOME"] = str(hf_home) - hf_home.mkdir(parents=True, exist_ok=True) - _run_command(_download_command(huggingface_cli, args.model, _model_local_dir(hf_home, args.model)), env=download_env) - - if args.probe: - local_model_dir = _model_local_dir(hf_home, args.model) - probe_env = os.environ.copy() - probe_env["HF_HOME"] = str(hf_home) - probe_env.setdefault("RERANKER_BACKEND", "onnx") - probe_env.setdefault("RERANKER_MODEL", str(local_model_dir if local_model_dir.exists() else args.model)) - _run_command(_probe_command(python_path), env=probe_env) - - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/codex-lens/scripts/generate_embeddings.py b/codex-lens/scripts/generate_embeddings.py deleted file mode 100644 index c2b6a0a3..00000000 --- 
a/codex-lens/scripts/generate_embeddings.py +++ /dev/null @@ -1,278 +0,0 @@ -#!/usr/bin/env python3 -"""Generate vector embeddings for existing CodexLens indexes. - -This script is a CLI wrapper around the memory-efficient streaming implementation -in codexlens.cli.embedding_manager. It uses batch processing to keep memory usage -under 2GB regardless of project size. - -Requirements: - pip install codexlens[semantic] - # or - pip install fastembed numpy hnswlib - -Usage: - # Generate embeddings for a single index - python generate_embeddings.py /path/to/_index.db - - # Use specific embedding model - python generate_embeddings.py /path/to/_index.db --model code - - # Generate centralized embeddings for all indexes in a directory - python generate_embeddings.py --centralized ~/.codexlens/indexes - - # Force regeneration - python generate_embeddings.py /path/to/_index.db --force -""" - -import argparse -import logging -import sys -import warnings -from pathlib import Path - -# Configure logging -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(levelname)s - %(message)s', - datefmt='%H:%M:%S' -) -logger = logging.getLogger(__name__) - -# Import the memory-efficient implementation -try: - from codexlens.cli.embedding_manager import ( - generate_embeddings, - generate_dense_embeddings_centralized, - ) - from codexlens.semantic import SEMANTIC_AVAILABLE -except ImportError as exc: - logger.error(f"Failed to import codexlens: {exc}") - logger.error("Make sure codexlens is installed: pip install codexlens") - SEMANTIC_AVAILABLE = False - - -def check_dependencies(): - """Check if semantic search dependencies are available.""" - if not SEMANTIC_AVAILABLE: - logger.error("Semantic search dependencies not available") - logger.error("Install with: pip install codexlens[semantic]") - logger.error("Or: pip install fastembed numpy hnswlib") - return False - return True - - -def progress_callback(message: str): - """Callback function for progress updates.""" - 
logger.info(message) - - -def generate_embeddings_for_index( - index_db_path: Path, - model_profile: str = "code", - force: bool = False, - chunk_size: int = 2000, - **kwargs # Ignore unused parameters (workers, batch_size) for backward compatibility -) -> dict: - """Generate embeddings for an index using memory-efficient streaming. - - This function wraps the streaming implementation from embedding_manager - to maintain CLI compatibility while using the memory-optimized approach. - - Args: - index_db_path: Path to _index.db file - model_profile: Model profile to use (fast, code, multilingual, balanced) - force: If True, regenerate even if embeddings exist - chunk_size: Maximum chunk size in characters - **kwargs: Additional parameters (ignored for compatibility) - - Returns: - Dictionary with generation statistics - """ - logger.info(f"Processing index: {index_db_path}") - - # Call the memory-efficient streaming implementation - result = generate_embeddings( - index_path=index_db_path, - model_profile=model_profile, - force=force, - chunk_size=chunk_size, - progress_callback=progress_callback, - ) - - if not result["success"]: - if "error" in result: - logger.error(result["error"]) - return result - - # Extract result data and log summary - data = result["result"] - logger.info("=" * 60) - logger.info(f"Completed in {data['elapsed_time']:.1f}s") - logger.info(f"Total chunks created: {data['chunks_created']}") - logger.info(f"Files processed: {data['files_processed']}") - if data['files_failed'] > 0: - logger.warning(f"Failed files: {data['files_failed']}") - if data.get('failed_files'): - for file_path, error in data['failed_files']: - logger.warning(f" {file_path}: {error}") - - return { - "success": True, - "chunks_created": data["chunks_created"], - "files_processed": data["files_processed"], - "files_failed": data["files_failed"], - "elapsed_time": data["elapsed_time"], - } - - -def main(): - parser = argparse.ArgumentParser( - description="Generate vector 
embeddings for CodexLens indexes (memory-efficient streaming)", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=__doc__ - ) - - parser.add_argument( - "index_path", - type=Path, - help="Path to _index.db file or directory for centralized mode" - ) - - parser.add_argument( - "--centralized", - "-c", - action="store_true", - help="Use centralized vector storage (single HNSW index at project root)" - ) - - parser.add_argument( - "--scan", - action="store_true", - help="(Deprecated) Use --centralized instead" - ) - - parser.add_argument( - "--model", - type=str, - default="code", - choices=["fast", "code", "multilingual", "balanced"], - help="Embedding model profile (default: code)" - ) - - parser.add_argument( - "--chunk-size", - type=int, - default=2000, - help="Maximum chunk size in characters (default: 2000)" - ) - - parser.add_argument( - "--workers", - type=int, - default=0, - help="(Deprecated) Kept for backward compatibility, ignored" - ) - - parser.add_argument( - "--batch-size", - type=int, - default=256, - help="(Deprecated) Kept for backward compatibility, ignored" - ) - - parser.add_argument( - "--force", - action="store_true", - help="Regenerate embeddings even if they exist" - ) - - parser.add_argument( - "--verbose", - "-v", - action="store_true", - help="Enable verbose logging" - ) - - args = parser.parse_args() - - # Configure logging level - if args.verbose: - logging.getLogger().setLevel(logging.DEBUG) - - # Check dependencies - if not check_dependencies(): - sys.exit(1) - - # Resolve path - index_path = args.index_path.expanduser().resolve() - - if not index_path.exists(): - logger.error(f"Path not found: {index_path}") - sys.exit(1) - - # Handle deprecated --scan flag - use_centralized = args.centralized - if args.scan: - warnings.warn( - "--scan is deprecated, use --centralized instead", - DeprecationWarning - ) - logger.warning("--scan is deprecated. 
Use --centralized instead.") - use_centralized = True - - # Determine if using centralized mode or single file - if use_centralized or index_path.is_dir(): - # Centralized mode - single HNSW index at project root - if index_path.is_file(): - logger.error("--centralized requires a directory path") - sys.exit(1) - - logger.info(f"Generating centralized embeddings for: {index_path}") - result = generate_dense_embeddings_centralized( - index_root=index_path, - model_profile=args.model, - force=args.force, - chunk_size=args.chunk_size, - progress_callback=progress_callback, - ) - - if not result["success"]: - logger.error(f"Failed: {result.get('error', 'Unknown error')}") - sys.exit(1) - - # Log summary - data = result["result"] - logger.info(f"\n{'='*60}") - logger.info("CENTRALIZED EMBEDDING COMPLETE") - logger.info(f"{'='*60}") - logger.info(f"Total chunks created: {data['chunks_created']}") - logger.info(f"Total files processed: {data['files_processed']}") - if data.get('files_failed', 0) > 0: - logger.warning(f"Total files failed: {data['files_failed']}") - logger.info(f"Central index: {data.get('central_index_path', 'N/A')}") - logger.info(f"Time: {data.get('elapsed_time', 0):.1f}s") - - else: - # Single index mode - if not index_path.name.endswith("_index.db"): - logger.error("File must be named '_index.db'") - sys.exit(1) - - result = generate_embeddings_for_index( - index_path, - model_profile=args.model, - force=args.force, - chunk_size=args.chunk_size, - ) - - if not result["success"]: - logger.error(f"Failed: {result.get('error', 'Unknown error')}") - sys.exit(1) - - logger.info("\nv Embeddings generation complete!") - logger.info("\nYou can now use vector search:") - logger.info(" codexlens search 'your query' --mode pure-vector") - - -if __name__ == "__main__": - main() diff --git a/codex-lens/scripts/requirements-reranker-local.txt b/codex-lens/scripts/requirements-reranker-local.txt deleted file mode 100644 index 789e742b..00000000 --- 
a/codex-lens/scripts/requirements-reranker-local.txt +++ /dev/null @@ -1,13 +0,0 @@ -# Ordered local ONNX reranker pins for CodexLens. -# Validated against the repo-local Python 3.13 virtualenv on Windows. -# bootstrap_reranker_local.py installs each section in file order to keep -# pip resolver work bounded and repeatable. - -# [runtime] -numpy==2.4.0 -onnxruntime==1.23.2 - -# [hf-stack] -huggingface-hub==0.36.2 -transformers==4.53.3 -optimum[onnxruntime]==2.1.0 diff --git a/codex-lens/src/.gitignore b/codex-lens/src/.gitignore deleted file mode 100644 index b4a7d405..00000000 --- a/codex-lens/src/.gitignore +++ /dev/null @@ -1 +0,0 @@ -.ace-tool/ diff --git a/codex-lens/src/codex_lens.egg-info/PKG-INFO b/codex-lens/src/codex_lens.egg-info/PKG-INFO deleted file mode 100644 index cb0fdbd8..00000000 --- a/codex-lens/src/codex_lens.egg-info/PKG-INFO +++ /dev/null @@ -1,119 +0,0 @@ -Metadata-Version: 2.4 -Name: codex-lens -Version: 0.1.0 -Summary: CodexLens multi-modal code analysis platform -Author: CodexLens contributors -License-Expression: MIT -Project-URL: Homepage, https://github.com/openai/codex-lens -Requires-Python: >=3.10 -Description-Content-Type: text/markdown -License-File: LICENSE -Requires-Dist: typer~=0.9.0 -Requires-Dist: click<9,>=8.0.0 -Requires-Dist: rich~=13.0.0 -Requires-Dist: pydantic~=2.0.0 -Requires-Dist: tree-sitter~=0.20.0 -Requires-Dist: tree-sitter-python~=0.25.0 -Requires-Dist: tree-sitter-javascript~=0.25.0 -Requires-Dist: tree-sitter-typescript~=0.23.0 -Requires-Dist: pathspec~=0.11.0 -Requires-Dist: watchdog~=3.0.0 -Requires-Dist: ast-grep-py~=0.40.0 -Provides-Extra: semantic -Requires-Dist: numpy~=1.26.0; extra == "semantic" -Requires-Dist: fastembed~=0.2.1; extra == "semantic" -Requires-Dist: hnswlib~=0.8.0; extra == "semantic" -Provides-Extra: semantic-gpu -Requires-Dist: numpy~=1.26.0; extra == "semantic-gpu" -Requires-Dist: fastembed~=0.2.1; extra == "semantic-gpu" -Requires-Dist: hnswlib~=0.8.0; extra == "semantic-gpu" 
-Requires-Dist: onnxruntime-gpu~=1.15.0; extra == "semantic-gpu" -Provides-Extra: semantic-directml -Requires-Dist: numpy~=1.26.0; extra == "semantic-directml" -Requires-Dist: fastembed~=0.2.1; extra == "semantic-directml" -Requires-Dist: hnswlib~=0.8.0; extra == "semantic-directml" -Requires-Dist: onnxruntime-directml~=1.15.0; extra == "semantic-directml" -Provides-Extra: reranker-onnx -Requires-Dist: optimum~=1.16.0; extra == "reranker-onnx" -Requires-Dist: onnxruntime~=1.15.0; extra == "reranker-onnx" -Requires-Dist: transformers~=4.36.0; extra == "reranker-onnx" -Provides-Extra: reranker-api -Requires-Dist: httpx~=0.25.0; extra == "reranker-api" -Provides-Extra: reranker-litellm -Requires-Dist: ccw-litellm~=0.1.0; extra == "reranker-litellm" -Provides-Extra: reranker-legacy -Requires-Dist: sentence-transformers~=2.2.0; extra == "reranker-legacy" -Provides-Extra: reranker -Requires-Dist: optimum~=1.16.0; extra == "reranker" -Requires-Dist: onnxruntime~=1.15.0; extra == "reranker" -Requires-Dist: transformers~=4.36.0; extra == "reranker" -Provides-Extra: encoding -Requires-Dist: chardet~=5.0.0; extra == "encoding" -Provides-Extra: clustering -Requires-Dist: hdbscan~=0.8.1; extra == "clustering" -Requires-Dist: scikit-learn~=1.3.0; extra == "clustering" -Provides-Extra: full -Requires-Dist: tiktoken~=0.5.0; extra == "full" -Provides-Extra: lsp -Requires-Dist: pygls~=1.3.0; extra == "lsp" -Dynamic: license-file - -# CodexLens - -CodexLens is a multi-modal code analysis platform designed to provide comprehensive code understanding and analysis capabilities. 
- -## Features - -- **Multi-language Support**: Analyze code in Python, JavaScript, TypeScript and more using Tree-sitter parsers -- **Semantic Search**: Find relevant code snippets using semantic understanding with fastembed and HNSWLIB -- **Code Parsing**: Advanced code structure parsing with tree-sitter -- **Flexible Architecture**: Modular design for easy extension and customization - -## Installation - -### Basic Installation - -```bash -pip install codex-lens -``` - -### With Semantic Search - -```bash -pip install codex-lens[semantic] -``` - -### With GPU Acceleration (NVIDIA CUDA) - -```bash -pip install codex-lens[semantic-gpu] -``` - -### With DirectML (Windows - NVIDIA/AMD/Intel) - -```bash -pip install codex-lens[semantic-directml] -``` - -### With All Optional Features - -```bash -pip install codex-lens[full] -``` - -## Requirements - -- Python >= 3.10 -- See `pyproject.toml` for detailed dependency list - -## Development - -This project uses setuptools for building and packaging. 
- -## License - -MIT License - -## Authors - -CodexLens Contributors diff --git a/codex-lens/src/codex_lens.egg-info/SOURCES.txt b/codex-lens/src/codex_lens.egg-info/SOURCES.txt deleted file mode 100644 index b94c2ed5..00000000 --- a/codex-lens/src/codex_lens.egg-info/SOURCES.txt +++ /dev/null @@ -1,208 +0,0 @@ -LICENSE -README.md -pyproject.toml -src/codex_lens.egg-info/PKG-INFO -src/codex_lens.egg-info/SOURCES.txt -src/codex_lens.egg-info/dependency_links.txt -src/codex_lens.egg-info/entry_points.txt -src/codex_lens.egg-info/requires.txt -src/codex_lens.egg-info/top_level.txt -src/codexlens/__init__.py -src/codexlens/__main__.py -src/codexlens/config.py -src/codexlens/entities.py -src/codexlens/env_config.py -src/codexlens/errors.py -src/codexlens/api/__init__.py -src/codexlens/api/definition.py -src/codexlens/api/file_context.py -src/codexlens/api/hover.py -src/codexlens/api/lsp_lifecycle.py -src/codexlens/api/models.py -src/codexlens/api/references.py -src/codexlens/api/semantic.py -src/codexlens/api/symbols.py -src/codexlens/api/utils.py -src/codexlens/cli/__init__.py -src/codexlens/cli/commands.py -src/codexlens/cli/embedding_manager.py -src/codexlens/cli/model_manager.py -src/codexlens/cli/output.py -src/codexlens/hybrid_search/__init__.py -src/codexlens/hybrid_search/data_structures.py -src/codexlens/indexing/__init__.py -src/codexlens/indexing/embedding.py -src/codexlens/indexing/symbol_extractor.py -src/codexlens/lsp/__init__.py -src/codexlens/lsp/handlers.py -src/codexlens/lsp/keepalive_bridge.py -src/codexlens/lsp/lsp-servers.json -src/codexlens/lsp/lsp_bridge.py -src/codexlens/lsp/lsp_graph_builder.py -src/codexlens/lsp/providers.py -src/codexlens/lsp/server.py -src/codexlens/lsp/standalone_manager.py -src/codexlens/mcp/__init__.py -src/codexlens/mcp/hooks.py -src/codexlens/mcp/provider.py -src/codexlens/mcp/schema.py -src/codexlens/parsers/__init__.py -src/codexlens/parsers/astgrep_binding.py -src/codexlens/parsers/astgrep_js_ts_processor.py 
-src/codexlens/parsers/astgrep_processor.py -src/codexlens/parsers/encoding.py -src/codexlens/parsers/factory.py -src/codexlens/parsers/tokenizer.py -src/codexlens/parsers/treesitter_parser.py -src/codexlens/parsers/patterns/__init__.py -src/codexlens/parsers/patterns/javascript/__init__.py -src/codexlens/parsers/patterns/python/__init__.py -src/codexlens/parsers/patterns/typescript/__init__.py -src/codexlens/search/__init__.py -src/codexlens/search/binary_searcher.py -src/codexlens/search/chain_search.py -src/codexlens/search/enrichment.py -src/codexlens/search/global_graph_expander.py -src/codexlens/search/graph_expander.py -src/codexlens/search/hybrid_search.py -src/codexlens/search/query_parser.py -src/codexlens/search/ranking.py -src/codexlens/search/association_tree/__init__.py -src/codexlens/search/association_tree/builder.py -src/codexlens/search/association_tree/data_structures.py -src/codexlens/search/association_tree/deduplicator.py -src/codexlens/search/clustering/__init__.py -src/codexlens/search/clustering/base.py -src/codexlens/search/clustering/dbscan_strategy.py -src/codexlens/search/clustering/factory.py -src/codexlens/search/clustering/frequency_strategy.py -src/codexlens/search/clustering/hdbscan_strategy.py -src/codexlens/search/clustering/noop_strategy.py -src/codexlens/semantic/__init__.py -src/codexlens/semantic/ann_index.py -src/codexlens/semantic/base.py -src/codexlens/semantic/chunker.py -src/codexlens/semantic/code_extractor.py -src/codexlens/semantic/embedder.py -src/codexlens/semantic/factory.py -src/codexlens/semantic/gpu_support.py -src/codexlens/semantic/litellm_embedder.py -src/codexlens/semantic/rotational_embedder.py -src/codexlens/semantic/vector_store.py -src/codexlens/semantic/reranker/__init__.py -src/codexlens/semantic/reranker/api_reranker.py -src/codexlens/semantic/reranker/base.py -src/codexlens/semantic/reranker/factory.py -src/codexlens/semantic/reranker/fastembed_reranker.py -src/codexlens/semantic/reranker/legacy.py 
-src/codexlens/semantic/reranker/litellm_reranker.py -src/codexlens/semantic/reranker/onnx_reranker.py -src/codexlens/storage/__init__.py -src/codexlens/storage/deepwiki_models.py -src/codexlens/storage/deepwiki_store.py -src/codexlens/storage/dir_index.py -src/codexlens/storage/file_cache.py -src/codexlens/storage/global_index.py -src/codexlens/storage/index_tree.py -src/codexlens/storage/merkle_tree.py -src/codexlens/storage/migration_manager.py -src/codexlens/storage/path_mapper.py -src/codexlens/storage/registry.py -src/codexlens/storage/sqlite_store.py -src/codexlens/storage/sqlite_utils.py -src/codexlens/storage/vector_meta_store.py -src/codexlens/storage/migrations/__init__.py -src/codexlens/storage/migrations/migration_001_normalize_keywords.py -src/codexlens/storage/migrations/migration_002_add_token_metadata.py -src/codexlens/storage/migrations/migration_004_dual_fts.py -src/codexlens/storage/migrations/migration_005_cleanup_unused_fields.py -src/codexlens/storage/migrations/migration_006_enhance_relationships.py -src/codexlens/storage/migrations/migration_007_add_graph_neighbors.py -src/codexlens/storage/migrations/migration_008_add_merkle_hashes.py -src/codexlens/storage/migrations/migration_010_add_multi_vector_chunks.py -src/codexlens/tools/__init__.py -src/codexlens/tools/deepwiki_generator.py -src/codexlens/watcher/__init__.py -src/codexlens/watcher/events.py -src/codexlens/watcher/file_watcher.py -src/codexlens/watcher/incremental_indexer.py -src/codexlens/watcher/manager.py -tests/test_ann_index.py -tests/test_api_reranker.py -tests/test_association_tree.py -tests/test_astgrep_binding.py -tests/test_binary_searcher.py -tests/test_cascade_strategies.py -tests/test_chain_search.py -tests/test_cli_help.py -tests/test_cli_hybrid_search.py -tests/test_cli_output.py -tests/test_clustering_strategies.py -tests/test_code_extractor.py -tests/test_config.py -tests/test_config_cascade.py -tests/test_config_staged_env_overrides.py 
-tests/test_deepwiki_store.py -tests/test_deepwiki_types.py -tests/test_dual_fts.py -tests/test_embedder.py -tests/test_embedding_backend_availability.py -tests/test_encoding.py -tests/test_enrichment.py -tests/test_entities.py -tests/test_errors.py -tests/test_file_cache.py -tests/test_global_graph_expander.py -tests/test_global_index.py -tests/test_global_relationships.py -tests/test_global_symbol_index.py -tests/test_graph_expansion.py -tests/test_hybrid_chunker.py -tests/test_hybrid_search_e2e.py -tests/test_hybrid_search_reranker_backend.py -tests/test_hybrid_search_unit.py -tests/test_incremental_indexer.py -tests/test_incremental_indexing.py -tests/test_litellm_reranker.py -tests/test_lsp_graph_builder_depth.py -tests/test_merkle_detection.py -tests/test_migrations.py -tests/test_parser_integration.py -tests/test_parsers.py -tests/test_path_mapper_windows_drive.py -tests/test_performance_optimizations.py -tests/test_pure_vector_search.py -tests/test_query_parser.py -tests/test_ranking.py -tests/test_recursive_splitting.py -tests/test_registry.py -tests/test_reranker_backends.py -tests/test_reranker_factory.py -tests/test_result_grouping.py -tests/test_rrf_fusion.py -tests/test_schema_cleanup_migration.py -tests/test_search_comparison.py -tests/test_search_comprehensive.py -tests/test_search_full_coverage.py -tests/test_search_performance.py -tests/test_semantic.py -tests/test_semantic_search.py -tests/test_sqlite_store.py -tests/test_stage1_binary_search_uses_chunk_lines.py -tests/test_staged_cascade.py -tests/test_staged_cascade_lsp_depth.py -tests/test_staged_cascade_realtime_lsp.py -tests/test_staged_stage1_fallback_seed.py -tests/test_staged_stage3_fast_strategies.py -tests/test_standalone_lsp_manager_open_document_cache.py -tests/test_static_graph_integration.py -tests/test_storage.py -tests/test_storage_concurrency.py -tests/test_symbol_extractor.py -tests/test_token_chunking.py -tests/test_token_storage.py -tests/test_tokenizer.py 
-tests/test_tokenizer_performance.py -tests/test_treesitter_parser.py -tests/test_vector_search_full.py -tests/test_vector_store.py \ No newline at end of file diff --git a/codex-lens/src/codex_lens.egg-info/dependency_links.txt b/codex-lens/src/codex_lens.egg-info/dependency_links.txt deleted file mode 100644 index 8b137891..00000000 --- a/codex-lens/src/codex_lens.egg-info/dependency_links.txt +++ /dev/null @@ -1 +0,0 @@ - diff --git a/codex-lens/src/codex_lens.egg-info/entry_points.txt b/codex-lens/src/codex_lens.egg-info/entry_points.txt deleted file mode 100644 index efeefd53..00000000 --- a/codex-lens/src/codex_lens.egg-info/entry_points.txt +++ /dev/null @@ -1,2 +0,0 @@ -[console_scripts] -codexlens-lsp = codexlens.lsp.server:main diff --git a/codex-lens/src/codex_lens.egg-info/requires.txt b/codex-lens/src/codex_lens.egg-info/requires.txt deleted file mode 100644 index 3dfaf117..00000000 --- a/codex-lens/src/codex_lens.egg-info/requires.txt +++ /dev/null @@ -1,60 +0,0 @@ -typer~=0.9.0 -click<9,>=8.0.0 -rich~=13.0.0 -pydantic~=2.0.0 -tree-sitter~=0.20.0 -tree-sitter-python~=0.25.0 -tree-sitter-javascript~=0.25.0 -tree-sitter-typescript~=0.23.0 -pathspec~=0.11.0 -watchdog~=3.0.0 -ast-grep-py~=0.40.0 - -[clustering] -hdbscan~=0.8.1 -scikit-learn~=1.3.0 - -[encoding] -chardet~=5.0.0 - -[full] -tiktoken~=0.5.0 - -[lsp] -pygls~=1.3.0 - -[reranker] -optimum~=1.16.0 -onnxruntime~=1.15.0 -transformers~=4.36.0 - -[reranker-api] -httpx~=0.25.0 - -[reranker-legacy] -sentence-transformers~=2.2.0 - -[reranker-litellm] -ccw-litellm~=0.1.0 - -[reranker-onnx] -optimum~=1.16.0 -onnxruntime~=1.15.0 -transformers~=4.36.0 - -[semantic] -numpy~=1.26.0 -fastembed~=0.2.1 -hnswlib~=0.8.0 - -[semantic-directml] -numpy~=1.26.0 -fastembed~=0.2.1 -hnswlib~=0.8.0 -onnxruntime-directml~=1.15.0 - -[semantic-gpu] -numpy~=1.26.0 -fastembed~=0.2.1 -hnswlib~=0.8.0 -onnxruntime-gpu~=1.15.0 diff --git a/codex-lens/src/codex_lens.egg-info/top_level.txt 
b/codex-lens/src/codex_lens.egg-info/top_level.txt deleted file mode 100644 index e81f348f..00000000 --- a/codex-lens/src/codex_lens.egg-info/top_level.txt +++ /dev/null @@ -1 +0,0 @@ -codexlens diff --git a/codex-lens/src/codexlens/__init__.py b/codex-lens/src/codexlens/__init__.py deleted file mode 100644 index 56f2e508..00000000 --- a/codex-lens/src/codexlens/__init__.py +++ /dev/null @@ -1,28 +0,0 @@ -"""CodexLens package.""" - -from __future__ import annotations - -from . import config, entities, errors -from .config import Config -from .entities import IndexedFile, SearchResult, SemanticChunk, Symbol -from .errors import CodexLensError, ConfigError, ParseError, SearchError, StorageError - -__version__ = "0.1.0" - -__all__ = [ - "__version__", - "config", - "entities", - "errors", - "Config", - "IndexedFile", - "SearchResult", - "SemanticChunk", - "Symbol", - "CodexLensError", - "ConfigError", - "ParseError", - "StorageError", - "SearchError", -] - diff --git a/codex-lens/src/codexlens/__main__.py b/codex-lens/src/codexlens/__main__.py deleted file mode 100644 index 35190f97..00000000 --- a/codex-lens/src/codexlens/__main__.py +++ /dev/null @@ -1,14 +0,0 @@ -"""Module entrypoint for `python -m codexlens`.""" - -from __future__ import annotations - -from codexlens.cli import app - - -def main() -> None: - app() - - -if __name__ == "__main__": - main() - diff --git a/codex-lens/src/codexlens/api/__init__.py b/codex-lens/src/codexlens/api/__init__.py deleted file mode 100644 index fd961a56..00000000 --- a/codex-lens/src/codexlens/api/__init__.py +++ /dev/null @@ -1,93 +0,0 @@ -"""Codexlens Public API Layer. - -This module exports all public API functions and dataclasses for the -codexlens LSP-like functionality. 
- -Dataclasses (from models.py): - - CallInfo: Call relationship information - - MethodContext: Method context with call relationships - - FileContextResult: File context result with method summaries - - DefinitionResult: Definition lookup result - - ReferenceResult: Reference lookup result - - GroupedReferences: References grouped by definition - - SymbolInfo: Symbol information for workspace search - - HoverInfo: Hover information for a symbol - - SemanticResult: Semantic search result - -Utility functions (from utils.py): - - resolve_project: Resolve and validate project root path - - normalize_relationship_type: Normalize relationship type to canonical form - - rank_by_proximity: Rank results by file path proximity - -Example: - >>> from codexlens.api import ( - ... DefinitionResult, - ... resolve_project, - ... normalize_relationship_type - ... ) - >>> project = resolve_project("/path/to/project") - >>> rel_type = normalize_relationship_type("calls") - >>> print(rel_type) - 'call' -""" - -from __future__ import annotations - -# Dataclasses -from .models import ( - CallInfo, - MethodContext, - FileContextResult, - DefinitionResult, - ReferenceResult, - GroupedReferences, - SymbolInfo, - HoverInfo, - SemanticResult, -) - -# Utility functions -from .utils import ( - resolve_project, - normalize_relationship_type, - rank_by_proximity, - rank_by_score, -) - -# API functions -from .definition import find_definition -from .symbols import workspace_symbols -from .hover import get_hover -from .file_context import file_context -from .references import find_references -from .semantic import semantic_search -from .lsp_lifecycle import lsp_start, lsp_stop, lsp_restart - -__all__ = [ - # Dataclasses - "CallInfo", - "MethodContext", - "FileContextResult", - "DefinitionResult", - "ReferenceResult", - "GroupedReferences", - "SymbolInfo", - "HoverInfo", - "SemanticResult", - # Utility functions - "resolve_project", - "normalize_relationship_type", - "rank_by_proximity", - 
"rank_by_score", - # API functions - "find_definition", - "workspace_symbols", - "get_hover", - "file_context", - "find_references", - "semantic_search", - # LSP lifecycle - "lsp_start", - "lsp_stop", - "lsp_restart", -] diff --git a/codex-lens/src/codexlens/api/definition.py b/codex-lens/src/codexlens/api/definition.py deleted file mode 100644 index ecfe874b..00000000 --- a/codex-lens/src/codexlens/api/definition.py +++ /dev/null @@ -1,126 +0,0 @@ -"""find_definition API implementation. - -This module provides the find_definition() function for looking up -symbol definitions with a 3-stage fallback strategy. -""" - -from __future__ import annotations - -import logging -from pathlib import Path -from typing import List, Optional - -from ..entities import Symbol -from ..storage.global_index import GlobalSymbolIndex -from ..storage.registry import RegistryStore -from ..errors import IndexNotFoundError -from .models import DefinitionResult -from .utils import resolve_project, rank_by_proximity - -logger = logging.getLogger(__name__) - - -def find_definition( - project_root: str, - symbol_name: str, - symbol_kind: Optional[str] = None, - file_context: Optional[str] = None, - limit: int = 10 -) -> List[DefinitionResult]: - """Find definition locations for a symbol. - - Uses a 3-stage fallback strategy: - 1. Exact match with kind filter - 2. Exact match without kind filter - 3. Prefix match - - Args: - project_root: Project root directory (for index location) - symbol_name: Name of the symbol to find - symbol_kind: Optional symbol kind filter (class, function, etc.) 
- file_context: Optional file path for proximity ranking - limit: Maximum number of results to return - - Returns: - List of DefinitionResult sorted by proximity if file_context provided - - Raises: - IndexNotFoundError: If project is not indexed - """ - project_path = resolve_project(project_root) - - # Get project info from registry - registry = RegistryStore() - project_info = registry.get_project(project_path) - if project_info is None: - raise IndexNotFoundError(f"Project not indexed: {project_path}") - - # Open global symbol index - index_db = project_info.index_root / "_global_symbols.db" - if not index_db.exists(): - raise IndexNotFoundError(f"Global symbol index not found: {index_db}") - - global_index = GlobalSymbolIndex(str(index_db), project_info.id) - - # Stage 1: Exact match with kind filter - results = _search_with_kind(global_index, symbol_name, symbol_kind, limit) - if results: - logger.debug(f"Stage 1 (exact+kind): Found {len(results)} results for {symbol_name}") - return _rank_and_convert(results, file_context) - - # Stage 2: Exact match without kind (if kind was specified) - if symbol_kind: - results = _search_with_kind(global_index, symbol_name, None, limit) - if results: - logger.debug(f"Stage 2 (exact): Found {len(results)} results for {symbol_name}") - return _rank_and_convert(results, file_context) - - # Stage 3: Prefix match - results = global_index.search( - name=symbol_name, - kind=None, - limit=limit, - prefix_mode=True - ) - if results: - logger.debug(f"Stage 3 (prefix): Found {len(results)} results for {symbol_name}") - return _rank_and_convert(results, file_context) - - logger.debug(f"No definitions found for {symbol_name}") - return [] - - -def _search_with_kind( - global_index: GlobalSymbolIndex, - symbol_name: str, - symbol_kind: Optional[str], - limit: int -) -> List[Symbol]: - """Search for symbols with optional kind filter.""" - return global_index.search( - name=symbol_name, - kind=symbol_kind, - limit=limit, - 
prefix_mode=False - ) - - -def _rank_and_convert( - symbols: List[Symbol], - file_context: Optional[str] -) -> List[DefinitionResult]: - """Convert symbols to DefinitionResult and rank by proximity.""" - results = [ - DefinitionResult( - name=sym.name, - kind=sym.kind, - file_path=sym.file or "", - line=sym.range[0] if sym.range else 1, - end_line=sym.range[1] if sym.range else 1, - signature=None, # Could extract from file if needed - container=None, # Could extract from parent symbol - score=1.0 - ) - for sym in symbols - ] - return rank_by_proximity(results, file_context) diff --git a/codex-lens/src/codexlens/api/file_context.py b/codex-lens/src/codexlens/api/file_context.py deleted file mode 100644 index fafa209f..00000000 --- a/codex-lens/src/codexlens/api/file_context.py +++ /dev/null @@ -1,272 +0,0 @@ -"""file_context API implementation. - -This module provides the file_context() function for retrieving -method call graphs from a source file. -""" - -from __future__ import annotations - -import logging -import os -from pathlib import Path -from typing import List, Optional, Tuple - -from ..entities import Symbol -from ..storage.global_index import GlobalSymbolIndex -from ..storage.dir_index import DirIndexStore -from ..storage.registry import RegistryStore -from ..errors import IndexNotFoundError -from .models import ( - FileContextResult, - MethodContext, - CallInfo, -) -from .utils import resolve_project, normalize_relationship_type - -logger = logging.getLogger(__name__) - - -def file_context( - project_root: str, - file_path: str, - include_calls: bool = True, - include_callers: bool = True, - max_depth: int = 1, - format: str = "brief" -) -> FileContextResult: - """Get method call context for a code file. - - Retrieves all methods/functions in the file along with their - outgoing calls and incoming callers. 
- - Args: - project_root: Project root directory (for index location) - file_path: Path to the code file to analyze - include_calls: Whether to include outgoing calls - include_callers: Whether to include incoming callers - max_depth: Call chain depth (V1 only supports 1) - format: Output format (brief | detailed | tree) - - Returns: - FileContextResult with method contexts and summary - - Raises: - IndexNotFoundError: If project is not indexed - FileNotFoundError: If file does not exist - ValueError: If max_depth > 1 (V1 limitation) - """ - # V1 limitation: only depth=1 supported - if max_depth > 1: - raise ValueError( - f"max_depth > 1 not supported in V1. " - f"Requested: {max_depth}, supported: 1" - ) - - project_path = resolve_project(project_root) - file_path_resolved = Path(file_path).resolve() - - # Validate file exists - if not file_path_resolved.exists(): - raise FileNotFoundError(f"File not found: {file_path_resolved}") - - # Get project info from registry - registry = RegistryStore() - project_info = registry.get_project(project_path) - if project_info is None: - raise IndexNotFoundError(f"Project not indexed: {project_path}") - - # Open global symbol index - index_db = project_info.index_root / "_global_symbols.db" - if not index_db.exists(): - raise IndexNotFoundError(f"Global symbol index not found: {index_db}") - - global_index = GlobalSymbolIndex(str(index_db), project_info.id) - - # Get all symbols in the file - symbols = global_index.get_file_symbols(str(file_path_resolved)) - - # Filter to functions, methods, and classes - method_symbols = [ - s for s in symbols - if s.kind in ("function", "method", "class") - ] - - logger.debug(f"Found {len(method_symbols)} methods in {file_path}") - - # Try to find dir_index for relationship queries - dir_index = _find_dir_index(project_info, file_path_resolved) - - # Build method contexts - methods: List[MethodContext] = [] - outgoing_resolved = True - incoming_resolved = True - targets_resolved = True - - 
for symbol in method_symbols: - calls: List[CallInfo] = [] - callers: List[CallInfo] = [] - - if include_calls and dir_index: - try: - outgoing = dir_index.get_outgoing_calls( - str(file_path_resolved), - symbol.name - ) - for target_name, rel_type, line, target_file in outgoing: - calls.append(CallInfo( - symbol_name=target_name, - file_path=target_file, - line=line, - relationship=normalize_relationship_type(rel_type) - )) - if target_file is None: - targets_resolved = False - except Exception as e: - logger.debug(f"Failed to get outgoing calls: {e}") - outgoing_resolved = False - - if include_callers and dir_index: - try: - incoming = dir_index.get_incoming_calls(symbol.name) - for source_name, rel_type, line, source_file in incoming: - callers.append(CallInfo( - symbol_name=source_name, - file_path=source_file, - line=line, - relationship=normalize_relationship_type(rel_type) - )) - except Exception as e: - logger.debug(f"Failed to get incoming calls: {e}") - incoming_resolved = False - - methods.append(MethodContext( - name=symbol.name, - kind=symbol.kind, - line_range=symbol.range if symbol.range else (1, 1), - signature=None, # Could extract from source - calls=calls, - callers=callers - )) - - # Detect language from file extension - language = _detect_language(file_path_resolved) - - # Generate summary - summary = _generate_summary(file_path_resolved, methods, format) - - return FileContextResult( - file_path=str(file_path_resolved), - language=language, - methods=methods, - summary=summary, - discovery_status={ - "outgoing_resolved": outgoing_resolved, - "incoming_resolved": incoming_resolved, - "targets_resolved": targets_resolved - } - ) - - -def _find_dir_index(project_info, file_path: Path) -> Optional[DirIndexStore]: - """Find the dir_index that contains the file. 
- - Args: - project_info: Project information from registry - file_path: Path to the file - - Returns: - DirIndexStore if found, None otherwise - """ - try: - # Look for _index.db in file's directory or parent directories - current = file_path.parent - while current != current.parent: - index_db = current / "_index.db" - if index_db.exists(): - return DirIndexStore(str(index_db)) - - # Also check in project's index_root - relative = current.relative_to(project_info.source_root) - index_in_cache = project_info.index_root / relative / "_index.db" - if index_in_cache.exists(): - return DirIndexStore(str(index_in_cache)) - - current = current.parent - except Exception as e: - logger.debug(f"Failed to find dir_index: {e}") - - return None - - -def _detect_language(file_path: Path) -> str: - """Detect programming language from file extension. - - Args: - file_path: Path to the file - - Returns: - Language name - """ - ext_map = { - ".py": "python", - ".js": "javascript", - ".ts": "typescript", - ".jsx": "javascript", - ".tsx": "typescript", - ".go": "go", - ".rs": "rust", - ".java": "java", - ".swift": "swift", - ".c": "c", - ".cpp": "cpp", - ".h": "c", - ".hpp": "cpp", - } - return ext_map.get(file_path.suffix.lower(), "unknown") - - -def _generate_summary( - file_path: Path, - methods: List[MethodContext], - format: str -) -> str: - """Generate human-readable summary of file context. 
- - Args: - file_path: Path to the file - methods: List of method contexts - format: Output format (brief | detailed | tree) - - Returns: - Markdown-formatted summary - """ - lines = [f"## {file_path.name} ({len(methods)} methods)\n"] - - for method in methods: - start, end = method.line_range - lines.append(f"### {method.name} (line {start}-{end})") - - if method.calls: - calls_str = ", ".join( - f"{c.symbol_name} ({c.file_path or 'unresolved'}:{c.line})" - if format == "detailed" - else c.symbol_name - for c in method.calls - ) - lines.append(f"- Calls: {calls_str}") - - if method.callers: - callers_str = ", ".join( - f"{c.symbol_name} ({c.file_path}:{c.line})" - if format == "detailed" - else c.symbol_name - for c in method.callers - ) - lines.append(f"- Called by: {callers_str}") - - if not method.calls and not method.callers: - lines.append("- (no call relationships)") - - lines.append("") - - return "\n".join(lines) diff --git a/codex-lens/src/codexlens/api/hover.py b/codex-lens/src/codexlens/api/hover.py deleted file mode 100644 index 7860c98f..00000000 --- a/codex-lens/src/codexlens/api/hover.py +++ /dev/null @@ -1,148 +0,0 @@ -"""get_hover API implementation. - -This module provides the get_hover() function for retrieving -detailed hover information for symbols. -""" - -from __future__ import annotations - -import logging -from pathlib import Path -from typing import Optional - -from ..entities import Symbol -from ..storage.global_index import GlobalSymbolIndex -from ..storage.registry import RegistryStore -from ..errors import IndexNotFoundError -from .models import HoverInfo -from .utils import resolve_project - -logger = logging.getLogger(__name__) - - -def get_hover( - project_root: str, - symbol_name: str, - file_path: Optional[str] = None -) -> Optional[HoverInfo]: - """Get detailed hover information for a symbol. 
- - Args: - project_root: Project root directory (for index location) - symbol_name: Name of the symbol to look up - file_path: Optional file path to disambiguate when symbol - appears in multiple files - - Returns: - HoverInfo if symbol found, None otherwise - - Raises: - IndexNotFoundError: If project is not indexed - """ - project_path = resolve_project(project_root) - - # Get project info from registry - registry = RegistryStore() - project_info = registry.get_project(project_path) - if project_info is None: - raise IndexNotFoundError(f"Project not indexed: {project_path}") - - # Open global symbol index - index_db = project_info.index_root / "_global_symbols.db" - if not index_db.exists(): - raise IndexNotFoundError(f"Global symbol index not found: {index_db}") - - global_index = GlobalSymbolIndex(str(index_db), project_info.id) - - # Search for the symbol - results = global_index.search( - name=symbol_name, - kind=None, - limit=50, - prefix_mode=False - ) - - if not results: - logger.debug(f"No hover info found for {symbol_name}") - return None - - # If file_path provided, filter to that file - if file_path: - file_path_resolved = str(Path(file_path).resolve()) - matching = [s for s in results if s.file == file_path_resolved] - if matching: - results = matching - - # Take the first result - symbol = results[0] - - # Build hover info - return HoverInfo( - name=symbol.name, - kind=symbol.kind, - signature=_extract_signature(symbol), - documentation=_extract_documentation(symbol), - file_path=symbol.file or "", - line_range=symbol.range if symbol.range else (1, 1), - type_info=_extract_type_info(symbol) - ) - - -def _extract_signature(symbol: Symbol) -> str: - """Extract signature from symbol. - - For now, generates a basic signature based on kind and name. - In a full implementation, this would parse the actual source code. 
- - Args: - symbol: The symbol to extract signature from - - Returns: - Signature string - """ - if symbol.kind == "function": - return f"def {symbol.name}(...)" - elif symbol.kind == "method": - return f"def {symbol.name}(self, ...)" - elif symbol.kind == "class": - return f"class {symbol.name}" - elif symbol.kind == "variable": - return symbol.name - elif symbol.kind == "constant": - return f"{symbol.name} = ..." - else: - return f"{symbol.kind} {symbol.name}" - - -def _extract_documentation(symbol: Symbol) -> Optional[str]: - """Extract documentation from symbol. - - In a full implementation, this would parse docstrings from source. - For now, returns None. - - Args: - symbol: The symbol to extract documentation from - - Returns: - Documentation string if available, None otherwise - """ - # Would need to read source file and parse docstring - # For V1, return None - return None - - -def _extract_type_info(symbol: Symbol) -> Optional[str]: - """Extract type information from symbol. - - In a full implementation, this would parse type annotations. - For now, returns None. - - Args: - symbol: The symbol to extract type info from - - Returns: - Type info string if available, None otherwise - """ - # Would need to parse type annotations from source - # For V1, return None - return None diff --git a/codex-lens/src/codexlens/api/lsp_lifecycle.py b/codex-lens/src/codexlens/api/lsp_lifecycle.py deleted file mode 100644 index ebda4691..00000000 --- a/codex-lens/src/codexlens/api/lsp_lifecycle.py +++ /dev/null @@ -1,124 +0,0 @@ -"""LSP server lifecycle management API. - -Provides synchronous wrappers around StandaloneLspManager's async -start/stop methods for use via the executeCodexLensPythonAPI bridge. -""" - -from __future__ import annotations - -import asyncio -import shutil -from typing import Any, Dict - - -def lsp_start(workspace_root: str) -> Dict[str, Any]: - """Start the standalone LSP manager and report configured servers. 
- - Loads configuration and checks which language server commands are - available on the system. Does NOT start individual language servers - (they start on demand when a file of that type is opened). - - Args: - workspace_root: Absolute path to the workspace root directory. - - Returns: - Dict with keys: servers (list of server info dicts), - workspace_root (str). - """ - from codexlens.lsp.standalone_manager import StandaloneLspManager - - async def _run() -> Dict[str, Any]: - manager = StandaloneLspManager(workspace_root=workspace_root) - await manager.start() - - servers = [] - for language_id, cfg in sorted(manager._configs.items()): - cmd0 = cfg.command[0] if cfg.command else None - servers.append({ - "language_id": language_id, - "display_name": cfg.display_name, - "extensions": list(cfg.extensions), - "command": list(cfg.command), - "command_available": bool(shutil.which(cmd0)) if cmd0 else False, - }) - - # Stop the manager - individual servers are started on demand - await manager.stop() - - return { - "servers": servers, - "server_count": len(servers), - "workspace_root": workspace_root, - } - - return asyncio.run(_run()) - - -def lsp_stop(workspace_root: str) -> Dict[str, Any]: - """Stop all running language servers for the given workspace. - - Creates a temporary manager instance, starts it (loads config), - then immediately stops it -- which terminates any running server - processes that match this workspace root. - - Args: - workspace_root: Absolute path to the workspace root directory. - - Returns: - Dict confirming shutdown. - """ - from codexlens.lsp.standalone_manager import StandaloneLspManager - - async def _run() -> Dict[str, Any]: - manager = StandaloneLspManager(workspace_root=workspace_root) - await manager.start() - await manager.stop() - return {"stopped": True} - - return asyncio.run(_run()) - - -def lsp_restart(workspace_root: str) -> Dict[str, Any]: - """Restart the standalone LSP manager (stop then start). 
- - Equivalent to calling lsp_stop followed by lsp_start, but avoids - the overhead of two separate Python process invocations. - - Args: - workspace_root: Absolute path to the workspace root directory. - - Returns: - Dict with keys: servers, server_count, workspace_root. - """ - from codexlens.lsp.standalone_manager import StandaloneLspManager - - async def _run() -> Dict[str, Any]: - # Stop phase - stop_manager = StandaloneLspManager(workspace_root=workspace_root) - await stop_manager.start() - await stop_manager.stop() - - # Start phase - start_manager = StandaloneLspManager(workspace_root=workspace_root) - await start_manager.start() - - servers = [] - for language_id, cfg in sorted(start_manager._configs.items()): - cmd0 = cfg.command[0] if cfg.command else None - servers.append({ - "language_id": language_id, - "display_name": cfg.display_name, - "extensions": list(cfg.extensions), - "command": list(cfg.command), - "command_available": bool(shutil.which(cmd0)) if cmd0 else False, - }) - - await start_manager.stop() - - return { - "servers": servers, - "server_count": len(servers), - "workspace_root": workspace_root, - } - - return asyncio.run(_run()) diff --git a/codex-lens/src/codexlens/api/models.py b/codex-lens/src/codexlens/api/models.py deleted file mode 100644 index 6c53f690..00000000 --- a/codex-lens/src/codexlens/api/models.py +++ /dev/null @@ -1,281 +0,0 @@ -"""API dataclass definitions for codexlens LSP API. - -This module defines all result dataclasses used by the public API layer, -following the patterns established in mcp/schema.py. -""" - -from __future__ import annotations - -from dataclasses import dataclass, field, asdict -from typing import List, Optional, Dict, Tuple - - -# ============================================================================= -# Section 4.2: file_context dataclasses -# ============================================================================= - -@dataclass -class CallInfo: - """Call relationship information. 
- - Attributes: - symbol_name: Name of the called/calling symbol - file_path: Target file path (may be None if unresolved) - line: Line number of the call - relationship: Type of relationship (call | import | inheritance) - """ - symbol_name: str - file_path: Optional[str] - line: int - relationship: str # call | import | inheritance - - def to_dict(self) -> dict: - """Convert to dictionary, filtering None values.""" - return {k: v for k, v in asdict(self).items() if v is not None} - - -@dataclass -class MethodContext: - """Method context with call relationships. - - Attributes: - name: Method/function name - kind: Symbol kind (function | method | class) - line_range: Start and end line numbers - signature: Function signature (if available) - calls: List of outgoing calls - callers: List of incoming calls - """ - name: str - kind: str # function | method | class - line_range: Tuple[int, int] - signature: Optional[str] - calls: List[CallInfo] = field(default_factory=list) - callers: List[CallInfo] = field(default_factory=list) - - def to_dict(self) -> dict: - """Convert to dictionary, filtering None values.""" - result = { - "name": self.name, - "kind": self.kind, - "line_range": list(self.line_range), - "calls": [c.to_dict() for c in self.calls], - "callers": [c.to_dict() for c in self.callers], - } - if self.signature is not None: - result["signature"] = self.signature - return result - - -@dataclass -class FileContextResult: - """File context result with method summaries. 
- - Attributes: - file_path: Path to the analyzed file - language: Programming language - methods: List of method contexts - summary: Human-readable summary - discovery_status: Status flags for call resolution - """ - file_path: str - language: str - methods: List[MethodContext] - summary: str - discovery_status: Dict[str, bool] = field(default_factory=lambda: { - "outgoing_resolved": False, - "incoming_resolved": True, - "targets_resolved": False - }) - - def to_dict(self) -> dict: - """Convert to dictionary for JSON serialization.""" - return { - "file_path": self.file_path, - "language": self.language, - "methods": [m.to_dict() for m in self.methods], - "summary": self.summary, - "discovery_status": self.discovery_status, - } - - -# ============================================================================= -# Section 4.3: find_definition dataclasses -# ============================================================================= - -@dataclass -class DefinitionResult: - """Definition lookup result. - - Attributes: - name: Symbol name - kind: Symbol kind (class, function, method, etc.) - file_path: File where symbol is defined - line: Start line number - end_line: End line number - signature: Symbol signature (if available) - container: Containing class/module (if any) - score: Match score for ranking - """ - name: str - kind: str - file_path: str - line: int - end_line: int - signature: Optional[str] = None - container: Optional[str] = None - score: float = 1.0 - - def to_dict(self) -> dict: - """Convert to dictionary, filtering None values.""" - return {k: v for k, v in asdict(self).items() if v is not None} - - -# ============================================================================= -# Section 4.4: find_references dataclasses -# ============================================================================= - -@dataclass -class ReferenceResult: - """Reference lookup result. 
- - Attributes: - file_path: File containing the reference - line: Line number - column: Column number - context_line: The line of code containing the reference - relationship: Type of reference (call | import | type_annotation | inheritance) - """ - file_path: str - line: int - column: int - context_line: str - relationship: str # call | import | type_annotation | inheritance - - def to_dict(self) -> dict: - """Convert to dictionary.""" - return asdict(self) - - -@dataclass -class GroupedReferences: - """References grouped by definition. - - Used when a symbol has multiple definitions (e.g., overloads). - - Attributes: - definition: The definition this group refers to - references: List of references to this definition - """ - definition: DefinitionResult - references: List[ReferenceResult] = field(default_factory=list) - - def to_dict(self) -> dict: - """Convert to dictionary.""" - return { - "definition": self.definition.to_dict(), - "references": [r.to_dict() for r in self.references], - } - - -# ============================================================================= -# Section 4.5: workspace_symbols dataclasses -# ============================================================================= - -@dataclass -class SymbolInfo: - """Symbol information for workspace search. 
- - Attributes: - name: Symbol name - kind: Symbol kind - file_path: File where symbol is defined - line: Line number - container: Containing class/module (if any) - score: Match score for ranking - """ - name: str - kind: str - file_path: str - line: int - container: Optional[str] = None - score: float = 1.0 - - def to_dict(self) -> dict: - """Convert to dictionary, filtering None values.""" - return {k: v for k, v in asdict(self).items() if v is not None} - - -# ============================================================================= -# Section 4.6: get_hover dataclasses -# ============================================================================= - -@dataclass -class HoverInfo: - """Hover information for a symbol. - - Attributes: - name: Symbol name - kind: Symbol kind - signature: Symbol signature - documentation: Documentation string (if available) - file_path: File where symbol is defined - line_range: Start and end line numbers - type_info: Type information (if available) - """ - name: str - kind: str - signature: str - documentation: Optional[str] - file_path: str - line_range: Tuple[int, int] - type_info: Optional[str] = None - - def to_dict(self) -> dict: - """Convert to dictionary, filtering None values.""" - result = { - "name": self.name, - "kind": self.kind, - "signature": self.signature, - "file_path": self.file_path, - "line_range": list(self.line_range), - } - if self.documentation is not None: - result["documentation"] = self.documentation - if self.type_info is not None: - result["type_info"] = self.type_info - return result - - -# ============================================================================= -# Section 4.7: semantic_search dataclasses -# ============================================================================= - -@dataclass -class SemanticResult: - """Semantic search result. 
- - Attributes: - symbol_name: Name of the matched symbol - kind: Symbol kind - file_path: File where symbol is defined - line: Line number - vector_score: Vector similarity score (None if not available) - structural_score: Structural match score (None if not available) - fusion_score: Combined fusion score - snippet: Code snippet - match_reason: Explanation of why this matched (optional) - """ - symbol_name: str - kind: str - file_path: str - line: int - vector_score: Optional[float] - structural_score: Optional[float] - fusion_score: float - snippet: str - match_reason: Optional[str] = None - - def to_dict(self) -> dict: - """Convert to dictionary, filtering None values.""" - return {k: v for k, v in asdict(self).items() if v is not None} diff --git a/codex-lens/src/codexlens/api/references.py b/codex-lens/src/codexlens/api/references.py deleted file mode 100644 index 2e3f5f1e..00000000 --- a/codex-lens/src/codexlens/api/references.py +++ /dev/null @@ -1,345 +0,0 @@ -"""Find references API for codexlens. - -This module implements the find_references() function that wraps -ChainSearchEngine.search_references() with grouped result structure -for multi-definition symbols. -""" - -from __future__ import annotations - -import logging -from pathlib import Path -from typing import List, Optional, Dict - -from .models import ( - DefinitionResult, - ReferenceResult, - GroupedReferences, -) -from .utils import ( - resolve_project, - normalize_relationship_type, -) - - -logger = logging.getLogger(__name__) - - -def _read_line_from_file(file_path: str, line: int) -> str: - """Read a specific line from a file. - - Args: - file_path: Path to the file - line: Line number (1-based) - - Returns: - The line content, stripped of trailing whitespace. - Returns empty string if file cannot be read or line doesn't exist. 
- """ - try: - path = Path(file_path) - if not path.exists(): - return "" - - with path.open("r", encoding="utf-8", errors="replace") as f: - for i, content in enumerate(f, 1): - if i == line: - return content.rstrip() - return "" - except Exception as exc: - logger.debug("Failed to read line %d from %s: %s", line, file_path, exc) - return "" - - -def _transform_to_reference_result( - raw_ref: "RawReferenceResult", -) -> ReferenceResult: - """Transform raw ChainSearchEngine reference to API ReferenceResult. - - Args: - raw_ref: Raw reference result from ChainSearchEngine - - Returns: - API ReferenceResult with context_line and normalized relationship - """ - # Read the actual line from the file - context_line = _read_line_from_file(raw_ref.file_path, raw_ref.line) - - # Normalize relationship type - relationship = normalize_relationship_type(raw_ref.relationship_type) - - return ReferenceResult( - file_path=raw_ref.file_path, - line=raw_ref.line, - column=raw_ref.column, - context_line=context_line, - relationship=relationship, - ) - - -def find_references( - project_root: str, - symbol_name: str, - symbol_kind: Optional[str] = None, - include_definition: bool = True, - group_by_definition: bool = True, - limit: int = 100, -) -> List[GroupedReferences]: - """Find all reference locations for a symbol. - - Multi-definition case returns grouped results to resolve ambiguity. - - This function wraps ChainSearchEngine.search_references() and groups - the results by definition location. Each GroupedReferences contains - a definition and all references that point to it. - - Args: - project_root: Project root directory path - symbol_name: Name of the symbol to find references for - symbol_kind: Optional symbol kind filter (e.g., 'function', 'class') - include_definition: Whether to include the definition location - in the result (default True) - group_by_definition: Whether to group references by definition. - If False, returns a single group with all references. 
- (default True) - limit: Maximum number of references to return (default 100) - - Returns: - List of GroupedReferences. Each group contains: - - definition: The DefinitionResult for this symbol definition - - references: List of ReferenceResult pointing to this definition - - Raises: - ValueError: If project_root does not exist or is not a directory - - Examples: - >>> refs = find_references("/path/to/project", "authenticate") - >>> for group in refs: - ... print(f"Definition: {group.definition.file_path}:{group.definition.line}") - ... for ref in group.references: - ... print(f" Reference: {ref.file_path}:{ref.line} ({ref.relationship})") - - Note: - Reference relationship types are normalized: - - 'calls' -> 'call' - - 'imports' -> 'import' - - 'inherits' -> 'inheritance' - """ - # Validate and resolve project root - project_path = resolve_project(project_root) - - # Import here to avoid circular imports - from codexlens.config import Config - from codexlens.storage.registry import RegistryStore - from codexlens.storage.path_mapper import PathMapper - from codexlens.storage.global_index import GlobalSymbolIndex - from codexlens.search.chain_search import ChainSearchEngine - from codexlens.search.chain_search import ReferenceResult as RawReferenceResult - from codexlens.entities import Symbol - - # Initialize infrastructure - config = Config() - registry = RegistryStore() - mapper = PathMapper(config.index_dir) - - # Create chain search engine - engine = ChainSearchEngine(registry, mapper, config=config) - - try: - # Step 1: Find definitions for the symbol - definitions: List[DefinitionResult] = [] - - if include_definition or group_by_definition: - # Search for symbol definitions - symbols = engine.search_symbols( - name=symbol_name, - source_path=project_path, - kind=symbol_kind, - ) - - # Convert Symbol to DefinitionResult - for sym in symbols: - # Only include exact name matches for definitions - if sym.name != symbol_name: - continue - - # Optionally filter 
by kind - if symbol_kind and sym.kind != symbol_kind: - continue - - definitions.append(DefinitionResult( - name=sym.name, - kind=sym.kind, - file_path=sym.file or "", - line=sym.range[0] if sym.range else 1, - end_line=sym.range[1] if sym.range else 1, - signature=None, # Not available from Symbol - container=None, # Not available from Symbol - score=1.0, - )) - - # Step 2: Get all references using ChainSearchEngine - raw_references = engine.search_references( - symbol_name=symbol_name, - source_path=project_path, - depth=-1, - limit=limit, - ) - - # Step 3: Transform raw references to API ReferenceResult - api_references: List[ReferenceResult] = [] - for raw_ref in raw_references: - api_ref = _transform_to_reference_result(raw_ref) - api_references.append(api_ref) - - # Step 4: Group references by definition - if group_by_definition and definitions: - return _group_references_by_definition( - definitions=definitions, - references=api_references, - include_definition=include_definition, - ) - else: - # Return single group with placeholder definition or first definition - if definitions: - definition = definitions[0] - else: - # Create placeholder definition when no definition found - definition = DefinitionResult( - name=symbol_name, - kind=symbol_kind or "unknown", - file_path="", - line=0, - end_line=0, - signature=None, - container=None, - score=0.0, - ) - - return [GroupedReferences( - definition=definition, - references=api_references, - )] - - finally: - engine.close() - - -def _group_references_by_definition( - definitions: List[DefinitionResult], - references: List[ReferenceResult], - include_definition: bool = True, -) -> List[GroupedReferences]: - """Group references by their likely definition. - - Uses file proximity heuristic to assign references to definitions. - References in the same file or directory as a definition are - assigned to that definition. 
- - Args: - definitions: List of definition locations - references: List of reference locations - include_definition: Whether to include definition in results - - Returns: - List of GroupedReferences with references assigned to definitions - """ - import os - - if not definitions: - return [] - - if len(definitions) == 1: - # Single definition - all references belong to it - return [GroupedReferences( - definition=definitions[0], - references=references, - )] - - # Multiple definitions - group by proximity - groups: Dict[int, List[ReferenceResult]] = { - i: [] for i in range(len(definitions)) - } - - for ref in references: - # Find the closest definition by file proximity - best_def_idx = 0 - best_score = -1 - - for i, defn in enumerate(definitions): - score = _proximity_score(ref.file_path, defn.file_path) - if score > best_score: - best_score = score - best_def_idx = i - - groups[best_def_idx].append(ref) - - # Build result groups - result: List[GroupedReferences] = [] - for i, defn in enumerate(definitions): - # Skip definitions with no references if not including definition itself - if not include_definition and not groups[i]: - continue - - result.append(GroupedReferences( - definition=defn, - references=groups[i], - )) - - return result - - -def _proximity_score(ref_path: str, def_path: str) -> int: - """Calculate proximity score between two file paths. 
- - Args: - ref_path: Reference file path - def_path: Definition file path - - Returns: - Proximity score (higher = closer): - - Same file: 1000 - - Same directory: 100 - - Otherwise: common path prefix length - """ - import os - - if not ref_path or not def_path: - return 0 - - # Normalize paths - ref_path = os.path.normpath(ref_path) - def_path = os.path.normpath(def_path) - - # Same file - if ref_path == def_path: - return 1000 - - ref_dir = os.path.dirname(ref_path) - def_dir = os.path.dirname(def_path) - - # Same directory - if ref_dir == def_dir: - return 100 - - # Common path prefix - try: - common = os.path.commonpath([ref_path, def_path]) - return len(common) - except ValueError: - # No common path (different drives on Windows) - return 0 - - -# Type alias for the raw reference from ChainSearchEngine -class RawReferenceResult: - """Type stub for ChainSearchEngine.ReferenceResult. - - This is only used for type hints and is replaced at runtime - by the actual import. - """ - file_path: str - line: int - column: int - context: str - relationship_type: str diff --git a/codex-lens/src/codexlens/api/semantic.py b/codex-lens/src/codexlens/api/semantic.py deleted file mode 100644 index c442364f..00000000 --- a/codex-lens/src/codexlens/api/semantic.py +++ /dev/null @@ -1,482 +0,0 @@ -"""Semantic search API with RRF fusion. - -This module provides the semantic_search() function for combining -vector, structural, and keyword search with configurable fusion strategies. 
-""" - -from __future__ import annotations - -import logging -from pathlib import Path -from typing import List, Optional - -from .models import SemanticResult -from .utils import resolve_project - -logger = logging.getLogger(__name__) - - -def semantic_search( - project_root: str, - query: str, - mode: str = "fusion", - vector_weight: float = 0.5, - structural_weight: float = 0.3, - keyword_weight: float = 0.2, - fusion_strategy: str = "rrf", - staged_stage2_mode: Optional[str] = None, - kind_filter: Optional[List[str]] = None, - limit: int = 20, - include_match_reason: bool = False, -) -> List[SemanticResult]: - """Semantic search - combining vector and structural search. - - This function provides a high-level API for semantic code search, - combining vector similarity, structural (symbol + relationships), - and keyword-based search methods with configurable fusion. - - Args: - project_root: Project root directory - query: Natural language query - mode: Search mode - - vector: Vector search only - - structural: Structural search only (symbol + relationships) - - fusion: Fusion search (default) - vector_weight: Vector search weight [0, 1] (default 0.5) - structural_weight: Structural search weight [0, 1] (default 0.3) - keyword_weight: Keyword search weight [0, 1] (default 0.2) - fusion_strategy: Fusion strategy (maps to chain_search.py) - - rrf: Reciprocal Rank Fusion (recommended, default) - - staged: Staged cascade -> staged_cascade_search - - binary: Binary rerank cascade -> binary_cascade_search - - hybrid: Binary rerank cascade (backward compat) -> binary_rerank_cascade_search - - dense_rerank: Dense rerank cascade -> dense_rerank_cascade_search - staged_stage2_mode: Optional override for staged Stage-2 expansion mode - - precomputed: GraphExpander over per-dir graph_neighbors (default) - - realtime: Live LSP expansion (requires LSP availability) - - static_global_graph: GlobalGraphExpander over global_relationships - kind_filter: Symbol type filter (e.g., 
["function", "class"]) - limit: Max return count (default 20) - include_match_reason: Generate match reason (heuristic, not LLM) - - Returns: - Results sorted by fusion_score - - Degradation: - - No vector index: vector_score=None, uses FTS + structural search - - No relationship data: structural_score=None, vector search only - - Examples: - >>> results = semantic_search( - ... "/path/to/project", - ... "authentication handler", - ... mode="fusion", - ... fusion_strategy="rrf" - ... ) - >>> for r in results: - ... print(f"{r.symbol_name}: {r.fusion_score:.3f}") - """ - # Validate and resolve project path - project_path = resolve_project(project_root) - - # Normalize weights to sum to 1.0 - total_weight = vector_weight + structural_weight + keyword_weight - if total_weight > 0: - vector_weight = vector_weight / total_weight - structural_weight = structural_weight / total_weight - keyword_weight = keyword_weight / total_weight - else: - # Default to equal weights if all zero - vector_weight = structural_weight = keyword_weight = 1.0 / 3.0 - - # Initialize search infrastructure - try: - from codexlens.config import Config - from codexlens.storage.registry import RegistryStore - from codexlens.storage.path_mapper import PathMapper - from codexlens.search.chain_search import ChainSearchEngine, SearchOptions - except ImportError as exc: - logger.error("Failed to import search dependencies: %s", exc) - return [] - - # Load config - config = Config.load() - - # Optional per-call override for staged cascade Stage-2 mode. 
- if staged_stage2_mode: - stage2 = str(staged_stage2_mode).strip().lower() - if stage2 in {"live"}: - stage2 = "realtime" - valid_stage2 = {"precomputed", "realtime", "static_global_graph"} - if stage2 in valid_stage2: - config.staged_stage2_mode = stage2 - else: - logger.debug("Ignoring invalid staged_stage2_mode: %r", staged_stage2_mode) - - # Get or create registry and mapper - # Build search options based on mode - search_options = _build_search_options( - mode=mode, - vector_weight=vector_weight, - structural_weight=structural_weight, - keyword_weight=keyword_weight, - limit=limit, - ) - - # Execute search based on fusion_strategy - try: - with RegistryStore() as registry: - mapper = PathMapper() - with ChainSearchEngine(registry, mapper, config=config) as engine: - chain_result = _execute_search( - engine=engine, - query=query, - source_path=project_path, - fusion_strategy=fusion_strategy, - options=search_options, - limit=limit, - ) - except Exception as exc: - logger.error("Search execution failed: %s", exc) - return [] - - # Transform results to SemanticResult - semantic_results = _transform_results( - results=chain_result.results, - mode=mode, - vector_weight=vector_weight, - structural_weight=structural_weight, - keyword_weight=keyword_weight, - kind_filter=kind_filter, - include_match_reason=include_match_reason, - query=query, - ) - - return semantic_results[:limit] - - -def _build_search_options( - mode: str, - vector_weight: float, - structural_weight: float, - keyword_weight: float, - limit: int, -) -> "SearchOptions": - """Build SearchOptions based on mode and weights. 
- - Args: - mode: Search mode (vector, structural, fusion) - vector_weight: Vector search weight - structural_weight: Structural search weight - keyword_weight: Keyword search weight - limit: Result limit - - Returns: - Configured SearchOptions - """ - from codexlens.search.chain_search import SearchOptions - - # Default options - options = SearchOptions( - total_limit=limit * 2, # Fetch extra for filtering - limit_per_dir=limit, - include_symbols=True, # Always include symbols for structural - ) - - if mode == "vector": - # Pure vector mode - options.hybrid_mode = True - options.enable_vector = True - options.pure_vector = True - options.enable_fuzzy = False - elif mode == "structural": - # Structural only - use FTS + symbols - options.hybrid_mode = True - options.enable_vector = False - options.enable_fuzzy = True - options.include_symbols = True - else: - # Fusion mode (default) - options.hybrid_mode = True - options.enable_vector = vector_weight > 0 - options.enable_fuzzy = keyword_weight > 0 - options.include_symbols = structural_weight > 0 - - # Set custom weights for RRF - if options.enable_vector and keyword_weight > 0: - options.hybrid_weights = { - "vector": vector_weight, - "exact": keyword_weight * 0.7, - "fuzzy": keyword_weight * 0.3, - } - - return options - - -def _execute_search( - engine: "ChainSearchEngine", - query: str, - source_path: Path, - fusion_strategy: str, - options: "SearchOptions", - limit: int, -) -> "ChainSearchResult": - """Execute search using appropriate strategy. 
- - Maps fusion_strategy to ChainSearchEngine methods: - - rrf: Standard hybrid search with RRF fusion - - staged: staged_cascade_search - - binary: binary_cascade_search - - hybrid: binary_rerank_cascade_search (backward compat) - - dense_rerank: dense_rerank_cascade_search - - Args: - engine: ChainSearchEngine instance - query: Search query - source_path: Project root path - fusion_strategy: Strategy name - options: Search options - limit: Result limit - - Returns: - ChainSearchResult from the search - """ - from codexlens.search.chain_search import ChainSearchResult - - if fusion_strategy == "staged": - # Use staged cascade search (4-stage pipeline) - return engine.staged_cascade_search( - query=query, - source_path=source_path, - k=limit, - coarse_k=limit * 5, - options=options, - ) - elif fusion_strategy == "binary": - # Use binary cascade search (binary coarse + dense fine) - return engine.binary_cascade_search( - query=query, - source_path=source_path, - k=limit, - coarse_k=limit * 5, - options=options, - ) - elif fusion_strategy == "hybrid": - # Backward compat: hybrid now maps to binary_rerank_cascade_search - return engine.binary_rerank_cascade_search( - query=query, - source_path=source_path, - k=limit, - coarse_k=limit * 5, - options=options, - ) - else: - # Default: rrf - Standard search with RRF fusion - return engine.search( - query=query, - source_path=source_path, - options=options, - ) - - -def _transform_results( - results: List, - mode: str, - vector_weight: float, - structural_weight: float, - keyword_weight: float, - kind_filter: Optional[List[str]], - include_match_reason: bool, - query: str, -) -> List[SemanticResult]: - """Transform ChainSearchEngine results to SemanticResult. 
- - Args: - results: List of SearchResult objects - mode: Search mode - vector_weight: Vector weight used - structural_weight: Structural weight used - keyword_weight: Keyword weight used - kind_filter: Optional symbol kind filter - include_match_reason: Whether to generate match reasons - query: Original query (for match reason generation) - - Returns: - List of SemanticResult objects - """ - semantic_results = [] - - for result in results: - # Extract symbol info - symbol_name = getattr(result, "symbol_name", None) - symbol_kind = getattr(result, "symbol_kind", None) - start_line = getattr(result, "start_line", None) - - # Use symbol object if available - if hasattr(result, "symbol") and result.symbol: - symbol_name = symbol_name or result.symbol.name - symbol_kind = symbol_kind or result.symbol.kind - if hasattr(result.symbol, "range") and result.symbol.range: - start_line = start_line or result.symbol.range[0] - - # Filter by kind if specified - if kind_filter and symbol_kind: - if symbol_kind.lower() not in [k.lower() for k in kind_filter]: - continue - - # Determine scores based on mode and metadata - metadata = getattr(result, "metadata", {}) or {} - fusion_score = result.score - - # Try to extract source scores from metadata - source_scores = metadata.get("source_scores", {}) - vector_score: Optional[float] = None - structural_score: Optional[float] = None - - if mode == "vector": - # In pure vector mode, the main score is the vector score - vector_score = result.score - structural_score = None - elif mode == "structural": - # In structural mode, no vector score - vector_score = None - structural_score = result.score - else: - # Fusion mode - try to extract individual scores - if "vector" in source_scores: - vector_score = source_scores["vector"] - elif metadata.get("fusion_method") == "simple_weighted": - # From weighted fusion - vector_score = source_scores.get("vector") - - # Structural score approximation (from exact/fuzzy FTS) - fts_scores = [] - if 
"exact" in source_scores: - fts_scores.append(source_scores["exact"]) - if "fuzzy" in source_scores: - fts_scores.append(source_scores["fuzzy"]) - - if fts_scores: - structural_score = max(fts_scores) - - # Build snippet - snippet = getattr(result, "excerpt", "") or getattr(result, "content", "") - if len(snippet) > 500: - snippet = snippet[:500] + "..." - - # Generate match reason if requested - match_reason = None - if include_match_reason: - match_reason = _generate_match_reason( - query=query, - symbol_name=symbol_name, - symbol_kind=symbol_kind, - snippet=snippet, - vector_score=vector_score, - structural_score=structural_score, - ) - - semantic_result = SemanticResult( - symbol_name=symbol_name or Path(result.path).stem, - kind=symbol_kind or "unknown", - file_path=result.path, - line=start_line or 1, - vector_score=vector_score, - structural_score=structural_score, - fusion_score=fusion_score, - snippet=snippet, - match_reason=match_reason, - ) - - semantic_results.append(semantic_result) - - # Sort by fusion_score descending - semantic_results.sort(key=lambda r: r.fusion_score, reverse=True) - - return semantic_results - - -def _generate_match_reason( - query: str, - symbol_name: Optional[str], - symbol_kind: Optional[str], - snippet: str, - vector_score: Optional[float], - structural_score: Optional[float], -) -> str: - """Generate human-readable match reason heuristically. - - This is a simple heuristic-based approach, not LLM-powered. 
- - Args: - query: Original search query - symbol_name: Symbol name if available - symbol_kind: Symbol kind if available - snippet: Code snippet - vector_score: Vector similarity score - structural_score: Structural match score - - Returns: - Human-readable explanation string - """ - reasons = [] - - # Check for direct name match - query_lower = query.lower() - query_words = set(query_lower.split()) - - if symbol_name: - name_lower = symbol_name.lower() - # Direct substring match - if query_lower in name_lower or name_lower in query_lower: - reasons.append(f"Symbol name '{symbol_name}' matches query") - # Word overlap - name_words = set(_split_camel_case(symbol_name).lower().split()) - overlap = query_words & name_words - if overlap and not reasons: - reasons.append(f"Symbol name contains: {', '.join(overlap)}") - - # Check snippet for keyword matches - snippet_lower = snippet.lower() - matching_words = [w for w in query_words if w in snippet_lower and len(w) > 2] - if matching_words and len(reasons) < 2: - reasons.append(f"Code contains keywords: {', '.join(matching_words[:3])}") - - # Add score-based reasoning - if vector_score is not None and vector_score > 0.7: - reasons.append("High semantic similarity") - elif vector_score is not None and vector_score > 0.5: - reasons.append("Moderate semantic similarity") - - if structural_score is not None and structural_score > 0.8: - reasons.append("Strong structural match") - - # Symbol kind context - if symbol_kind and len(reasons) < 3: - reasons.append(f"Matched {symbol_kind}") - - if not reasons: - reasons.append("Partial relevance based on content analysis") - - return "; ".join(reasons[:3]) - - -def _split_camel_case(name: str) -> str: - """Split camelCase and PascalCase to words. 
- - Args: - name: Symbol name in camelCase or PascalCase - - Returns: - Space-separated words - """ - import re - - # Insert space before uppercase letters - result = re.sub(r"([a-z])([A-Z])", r"\1 \2", name) - # Insert space before uppercase followed by lowercase - result = re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1 \2", result) - # Replace underscores with spaces - result = result.replace("_", " ") - - return result diff --git a/codex-lens/src/codexlens/api/symbols.py b/codex-lens/src/codexlens/api/symbols.py deleted file mode 100644 index 8faf248f..00000000 --- a/codex-lens/src/codexlens/api/symbols.py +++ /dev/null @@ -1,146 +0,0 @@ -"""workspace_symbols API implementation. - -This module provides the workspace_symbols() function for searching -symbols across the entire workspace with prefix matching. -""" - -from __future__ import annotations - -import fnmatch -import logging -from pathlib import Path -from typing import List, Optional - -from ..entities import Symbol -from ..storage.global_index import GlobalSymbolIndex -from ..storage.registry import RegistryStore -from ..errors import IndexNotFoundError -from .models import SymbolInfo -from .utils import resolve_project - -logger = logging.getLogger(__name__) - - -def workspace_symbols( - project_root: str, - query: str, - kind_filter: Optional[List[str]] = None, - file_pattern: Optional[str] = None, - limit: int = 50 -) -> List[SymbolInfo]: - """Search for symbols across the entire workspace. - - Uses prefix matching for efficient searching. 
- - Args: - project_root: Project root directory (for index location) - query: Search query (prefix match) - kind_filter: Optional list of symbol kinds to include - (e.g., ["class", "function"]) - file_pattern: Optional glob pattern to filter by file path - (e.g., "*.py", "src/**/*.ts") - limit: Maximum number of results to return - - Returns: - List of SymbolInfo sorted by score - - Raises: - IndexNotFoundError: If project is not indexed - """ - project_path = resolve_project(project_root) - - # Get project info from registry - registry = RegistryStore() - project_info = registry.get_project(project_path) - if project_info is None: - raise IndexNotFoundError(f"Project not indexed: {project_path}") - - # Open global symbol index - index_db = project_info.index_root / "_global_symbols.db" - if not index_db.exists(): - raise IndexNotFoundError(f"Global symbol index not found: {index_db}") - - global_index = GlobalSymbolIndex(str(index_db), project_info.id) - - # Search with prefix matching - # If kind_filter has multiple kinds, we need to search for each - all_results: List[Symbol] = [] - - if kind_filter and len(kind_filter) > 0: - # Search for each kind separately - for kind in kind_filter: - results = global_index.search( - name=query, - kind=kind, - limit=limit, - prefix_mode=True - ) - all_results.extend(results) - else: - # Search without kind filter - all_results = global_index.search( - name=query, - kind=None, - limit=limit, - prefix_mode=True - ) - - logger.debug(f"Found {len(all_results)} symbols matching '{query}'") - - # Apply file pattern filter if specified - if file_pattern: - all_results = [ - sym for sym in all_results - if sym.file and fnmatch.fnmatch(sym.file, file_pattern) - ] - logger.debug(f"After file filter '{file_pattern}': {len(all_results)} symbols") - - # Convert to SymbolInfo and sort by relevance - symbols = [ - SymbolInfo( - name=sym.name, - kind=sym.kind, - file_path=sym.file or "", - line=sym.range[0] if sym.range else 1, - 
container=None, # Could extract from parent - score=_calculate_score(sym.name, query) - ) - for sym in all_results - ] - - # Sort by score (exact matches first) - symbols.sort(key=lambda s: s.score, reverse=True) - - return symbols[:limit] - - -def _calculate_score(symbol_name: str, query: str) -> float: - """Calculate relevance score for a symbol match. - - Scoring: - - Exact match: 1.0 - - Prefix match: 0.8 + 0.2 * (query_len / symbol_len) - - Case-insensitive match: 0.6 - - Args: - symbol_name: The matched symbol name - query: The search query - - Returns: - Score between 0.0 and 1.0 - """ - if symbol_name == query: - return 1.0 - - if symbol_name.lower() == query.lower(): - return 0.9 - - if symbol_name.startswith(query): - ratio = len(query) / len(symbol_name) - return 0.8 + 0.2 * ratio - - if symbol_name.lower().startswith(query.lower()): - ratio = len(query) / len(symbol_name) - return 0.6 + 0.2 * ratio - - return 0.5 diff --git a/codex-lens/src/codexlens/api/utils.py b/codex-lens/src/codexlens/api/utils.py deleted file mode 100644 index 3621533a..00000000 --- a/codex-lens/src/codexlens/api/utils.py +++ /dev/null @@ -1,153 +0,0 @@ -"""Utility functions for the codexlens API. - -This module provides helper functions for: -- Project resolution -- Relationship type normalization -- Result ranking by proximity -""" - -from __future__ import annotations - -import os -from pathlib import Path -from typing import List, Optional, TypeVar, Callable - -from .models import DefinitionResult - - -# Type variable for generic ranking -T = TypeVar('T') - - -def resolve_project(project_root: str) -> Path: - """Resolve and validate project root path. 
- - Args: - project_root: Path to project root (relative or absolute) - - Returns: - Resolved absolute Path - - Raises: - ValueError: If path does not exist or is not a directory - """ - path = Path(project_root).resolve() - if not path.exists(): - raise ValueError(f"Project root does not exist: {path}") - if not path.is_dir(): - raise ValueError(f"Project root is not a directory: {path}") - return path - - -# Relationship type normalization mapping -_RELATIONSHIP_NORMALIZATION = { - # Plural to singular - "calls": "call", - "imports": "import", - "inherits": "inheritance", - "uses": "use", - # Already normalized (passthrough) - "call": "call", - "import": "import", - "inheritance": "inheritance", - "use": "use", - "type_annotation": "type_annotation", -} - - -def normalize_relationship_type(relationship: str) -> str: - """Normalize relationship type to canonical form. - - Converts plural forms and variations to standard singular forms: - - 'calls' -> 'call' - - 'imports' -> 'import' - - 'inherits' -> 'inheritance' - - 'uses' -> 'use' - - Args: - relationship: Raw relationship type string - - Returns: - Normalized relationship type - - Examples: - >>> normalize_relationship_type('calls') - 'call' - >>> normalize_relationship_type('inherits') - 'inheritance' - >>> normalize_relationship_type('call') - 'call' - """ - return _RELATIONSHIP_NORMALIZATION.get(relationship.lower(), relationship) - - -def rank_by_proximity( - results: List[DefinitionResult], - file_context: Optional[str] = None -) -> List[DefinitionResult]: - """Rank results by file path proximity to context. - - V1 Implementation: Uses path-based proximity scoring. - - Scoring algorithm: - 1. Same directory: highest score (100) - 2. Otherwise: length of common path prefix - - Args: - results: List of definition results to rank - file_context: Reference file path for proximity calculation. - If None, returns results unchanged. 
- - Returns: - Results sorted by proximity score (highest first) - - Examples: - >>> results = [ - ... DefinitionResult(name="foo", kind="function", - ... file_path="/a/b/c.py", line=1, end_line=10), - ... DefinitionResult(name="foo", kind="function", - ... file_path="/a/x/y.py", line=1, end_line=10), - ... ] - >>> ranked = rank_by_proximity(results, "/a/b/test.py") - >>> ranked[0].file_path - '/a/b/c.py' - """ - if not file_context or not results: - return results - - def proximity_score(result: DefinitionResult) -> int: - """Calculate proximity score for a result.""" - result_dir = os.path.dirname(result.file_path) - context_dir = os.path.dirname(file_context) - - # Same directory gets highest score - if result_dir == context_dir: - return 100 - - # Otherwise, score by common path prefix length - try: - common = os.path.commonpath([result.file_path, file_context]) - return len(common) - except ValueError: - # No common path (different drives on Windows) - return 0 - - return sorted(results, key=proximity_score, reverse=True) - - -def rank_by_score( - results: List[T], - score_fn: Callable[[T], float], - reverse: bool = True -) -> List[T]: - """Generic ranking function by custom score. 
- - Args: - results: List of items to rank - score_fn: Function to extract score from item - reverse: If True, highest scores first (default) - - Returns: - Sorted list - """ - return sorted(results, key=score_fn, reverse=reverse) diff --git a/codex-lens/src/codexlens/cli/__init__.py b/codex-lens/src/codexlens/cli/__init__.py deleted file mode 100644 index 18523b4c..00000000 --- a/codex-lens/src/codexlens/cli/__init__.py +++ /dev/null @@ -1,27 +0,0 @@ -"""CLI package for CodexLens.""" - -from __future__ import annotations - -import sys -import os - -# Force UTF-8 encoding for Windows console -# This ensures Chinese characters display correctly instead of GBK garbled text -if sys.platform == "win32": - # Set environment variable for Python I/O encoding - os.environ.setdefault("PYTHONIOENCODING", "utf-8") - - # Reconfigure stdout/stderr to use UTF-8 if possible - try: - if hasattr(sys.stdout, "reconfigure"): - sys.stdout.reconfigure(encoding="utf-8", errors="replace") - if hasattr(sys.stderr, "reconfigure"): - sys.stderr.reconfigure(encoding="utf-8", errors="replace") - except Exception: - # Fallback: some environments don't support reconfigure - pass - -from .commands import app - -__all__ = ["app"] - diff --git a/codex-lens/src/codexlens/cli/commands.py b/codex-lens/src/codexlens/cli/commands.py deleted file mode 100644 index 2f49e706..00000000 --- a/codex-lens/src/codexlens/cli/commands.py +++ /dev/null @@ -1,4942 +0,0 @@ -"""Typer commands for CodexLens.""" - -from __future__ import annotations - -import inspect -import json -import logging -import os -import re -import shutil -import subprocess -from pathlib import Path -from typing import Annotated, Any, Dict, Iterable, List, Optional - -import typer -from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn -from rich.table import Table - -from codexlens.config import Config -from codexlens.entities import IndexedFile, SearchResult, Symbol -from codexlens.errors import 
CodexLensError, ConfigError, ParseError, StorageError, SearchError -from codexlens.parsers.factory import ParserFactory -from codexlens.storage.path_mapper import PathMapper -from codexlens.storage.registry import RegistryStore, ProjectInfo -from codexlens.storage.index_tree import IndexTreeBuilder -from codexlens.storage.dir_index import DirIndexStore -from codexlens.search.chain_search import ChainSearchEngine, SearchOptions -from codexlens.search.ranking import ( - QueryIntent, - apply_path_penalties, - detect_query_intent, - query_prefers_lexical_search, - query_targets_generated_files, -) -from codexlens.watcher import WatcherManager, WatcherConfig - -from .output import ( - console, - print_json, - render_file_inspect, - render_search_results, - render_status, - render_symbols, -) - -app = typer.Typer(help="CodexLens CLI — local code indexing and search.") -# Index subcommand group for reorganized commands -def _patch_typer_click_help_compat() -> None: - """Patch Typer help rendering for Click versions that pass ctx to make_metavar().""" - import click.core - from typer.core import TyperArgument - - try: - params = inspect.signature(TyperArgument.make_metavar).parameters - except (TypeError, ValueError): - return - - if len(params) != 1: - return - - def _compat_make_metavar(self, ctx=None): # type: ignore[override] - if self.metavar is not None: - return self.metavar - - var = (self.name or "").upper() - if not self.required: - var = f"[{var}]" - - try: - type_var = self.type.get_metavar(param=self, ctx=ctx) - except TypeError: - try: - type_var = self.type.get_metavar(self, ctx) - except TypeError: - type_var = self.type.get_metavar(self) - - if type_var: - var += f":{type_var}" - if self.nargs != 1: - var += "..." 
- return var - - TyperArgument.make_metavar = _compat_make_metavar - - param_params = inspect.signature(click.core.Parameter.make_metavar).parameters - if len(param_params) == 2: - original_param_make_metavar = click.core.Parameter.make_metavar - - def _compat_param_make_metavar(self, ctx=None): # type: ignore[override] - return original_param_make_metavar(self, ctx) - - click.core.Parameter.make_metavar = _compat_param_make_metavar - - -_patch_typer_click_help_compat() - - -# Index subcommand group for reorganized commands -index_app = typer.Typer(help="Index management commands (init, embeddings, binary, status, migrate, all)") -app.add_typer(index_app, name="index") - - -def _deprecated_command_warning(old_name: str, new_name: str) -> None: - """Display deprecation warning for renamed commands. - - Args: - old_name: The old command name being deprecated - new_name: The new command name to use instead - """ - console.print( - f"[yellow]Warning:[/yellow] '{old_name}' is deprecated. " - f"Use '{new_name}' instead." - ) - - -def _configure_logging(verbose: bool, json_mode: bool = False) -> None: - """Configure logging level. - - In JSON mode, suppress INFO logs to keep stderr clean for error parsing. - Only WARNING and above are shown to avoid mixing logs with JSON output. - """ - if json_mode and not verbose: - # In JSON mode, suppress INFO logs to keep stderr clean - level = logging.WARNING - else: - level = logging.DEBUG if verbose else logging.INFO - logging.basicConfig(level=level, format="%(levelname)s %(message)s") - - -def _parse_languages(raw: Optional[List[str]]) -> Optional[List[str]]: - if not raw: - return None - langs: List[str] = [] - for item in raw: - for part in item.split(","): - part = part.strip() - if part: - langs.append(part) - return langs or None - - -def _fail_mutually_exclusive(option_a: str, option_b: str, json_mode: bool) -> None: - msg = f"Options {option_a} and {option_b} are mutually exclusive." 
- if json_mode: - print_json(success=False, error=msg) - else: - console.print(f"[red]Error:[/red] {msg}") - raise typer.Exit(code=1) - - -def _extract_embedding_error(embed_result: Dict[str, Any]) -> str: - """Best-effort error extraction for embedding generation results.""" - raw_error = embed_result.get("error") - if isinstance(raw_error, str) and raw_error.strip(): - return raw_error.strip() - - result = embed_result.get("result") - if isinstance(result, dict): - details = result.get("details") - if isinstance(details, list): - collected: List[str] = [] - for item in details: - if not isinstance(item, dict): - continue - item_error = item.get("error") - if isinstance(item_error, str) and item_error.strip(): - collected.append(item_error.strip()) - - if collected: - # De-dupe while preserving order, then keep output short. - seen: set[str] = set() - unique: List[str] = [] - for err in collected: - if err not in seen: - seen.add(err) - unique.append(err) - return "; ".join(unique[:3]) - - return "Embedding generation failed (no error details provided)" - - -def _auto_select_search_method(query: str) -> str: - """Choose a default search method from query intent.""" - if query_targets_generated_files(query) or query_prefers_lexical_search(query): - return "fts" - - intent = detect_query_intent(query) - if intent == QueryIntent.KEYWORD: - return "fts" - if intent == QueryIntent.SEMANTIC: - return "dense_rerank" - return "hybrid" - - -_CLI_NON_CODE_EXTENSIONS = { - "md", "txt", "json", "yaml", "yml", "xml", "csv", "log", - "ini", "cfg", "conf", "toml", "env", "properties", - "html", "htm", "svg", "png", "jpg", "jpeg", "gif", "ico", "webp", - "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx", - "lock", "sum", "mod", -} -_FALLBACK_ARTIFACT_DIRS = { - "dist", - "build", - "out", - "coverage", - "htmlcov", - ".cache", - ".workflow", - ".next", - ".nuxt", - ".parcel-cache", - ".turbo", - "tmp", - "temp", - "generated", -} -_FALLBACK_SOURCE_DIRS = { - "src", - "lib", - 
"core", - "app", - "server", - "client", - "services", -} - - -def _normalize_extension_filters(exclude_extensions: Optional[Iterable[str]]) -> set[str]: - """Normalize extension filters to lowercase values without leading dots.""" - normalized: set[str] = set() - for ext in exclude_extensions or []: - cleaned = (ext or "").strip().lower().lstrip(".") - if cleaned: - normalized.add(cleaned) - return normalized - - -def _score_filesystem_fallback_match( - query: str, - path_text: str, - line_text: str, - *, - base_score: float, -) -> float: - """Score filesystem fallback hits with light source-aware heuristics.""" - score = max(0.0, float(base_score)) - if score <= 0: - return 0.0 - - query_intent = detect_query_intent(query) - if query_intent != QueryIntent.KEYWORD: - return score - - path_parts = { - part.casefold() - for part in str(path_text).replace("\\", "/").split("/") - if part and part != "." - } - if _FALLBACK_SOURCE_DIRS.intersection(path_parts): - score *= 1.15 - - symbol = (query or "").strip() - if " " in symbol or not symbol: - return score - - escaped_symbol = re.escape(symbol) - definition_patterns = ( - rf"^\s*(?:export\s+)?(?:async\s+)?def\s+{escaped_symbol}\b", - rf"^\s*(?:export\s+)?(?:async\s+)?function\s+{escaped_symbol}\b", - rf"^\s*(?:export\s+)?class\s+{escaped_symbol}\b", - rf"^\s*(?:export\s+)?interface\s+{escaped_symbol}\b", - rf"^\s*(?:export\s+)?type\s+{escaped_symbol}\b", - rf"^\s*(?:export\s+)?(?:const|let|var)\s+{escaped_symbol}\b", - ) - if any(re.search(pattern, line_text) for pattern in definition_patterns): - score *= 1.8 - - return score - - -def _filesystem_fallback_search( - query: str, - search_path: Path, - *, - limit: int, - config: Config, - code_only: bool = False, - exclude_extensions: Optional[Iterable[str]] = None, -) -> Optional[dict[str, Any]]: - """Fallback to ripgrep when indexed keyword search returns no results.""" - rg_path = shutil.which("rg") - if not rg_path or not query.strip(): - return None - - import 
time - - allow_generated = query_targets_generated_files(query) - ignored_dirs = {name for name in IndexTreeBuilder.IGNORE_DIRS if name} - ignored_dirs.add(".workflow") - if allow_generated: - ignored_dirs.difference_update(_FALLBACK_ARTIFACT_DIRS) - - excluded_exts = _normalize_extension_filters(exclude_extensions) - if code_only: - excluded_exts.update(_CLI_NON_CODE_EXTENSIONS) - - args = [ - rg_path, - "--json", - "--line-number", - "--fixed-strings", - "--smart-case", - "--max-count", - "1", - ] - if allow_generated: - args.append("--hidden") - - for dirname in sorted(ignored_dirs): - args.extend(["--glob", f"!**/{dirname}/**"]) - - args.extend([query, str(search_path)]) - - start_time = time.perf_counter() - proc = subprocess.run( - args, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - encoding="utf-8", - errors="replace", - check=False, - ) - - if proc.returncode not in (0, 1): - return None - - matches: List[SearchResult] = [] - seen_paths: set[str] = set() - for raw_line in proc.stdout.splitlines(): - if len(matches) >= limit: - break - try: - event = json.loads(raw_line) - except json.JSONDecodeError: - continue - if event.get("type") != "match": - continue - - data = event.get("data") or {} - path_text = ((data.get("path") or {}).get("text") or "").strip() - if not path_text or path_text in seen_paths: - continue - - path_obj = Path(path_text) - extension = path_obj.suffix.lower().lstrip(".") - if extension and extension in excluded_exts: - continue - if code_only and config.language_for_path(path_obj) is None: - continue - - line_text = ((data.get("lines") or {}).get("text") or "").rstrip("\r\n") - line_number = data.get("line_number") - seen_paths.add(path_text) - base_score = float(limit - len(matches)) - matches.append( - SearchResult( - path=path_text, - score=_score_filesystem_fallback_match( - query, - path_text, - line_text, - base_score=base_score, - ), - excerpt=line_text.strip() or line_text or path_text, - content=None, - 
metadata={ - "filesystem_fallback": True, - "backend": "ripgrep-fallback", - "stale_index_suspected": True, - }, - start_line=line_number, - end_line=line_number, - ) - ) - - if not matches: - return None - - matches = apply_path_penalties( - matches, - query, - test_file_penalty=config.test_file_penalty, - generated_file_penalty=config.generated_file_penalty, - ) - return { - "results": matches, - "time_ms": (time.perf_counter() - start_time) * 1000.0, - "fallback": { - "backend": "ripgrep-fallback", - "stale_index_suspected": True, - "reason": "Indexed FTS search returned no results; filesystem fallback used.", - }, - } - - -def _remove_tree_best_effort(target: Path) -> dict[str, Any]: - """Remove a directory tree without aborting on locked files.""" - target = target.resolve() - if not target.exists(): - return { - "removed": True, - "partial": False, - "locked_paths": [], - "errors": [], - "remaining_path": None, - } - - locked_paths: List[str] = [] - errors: List[str] = [] - entries = sorted(target.rglob("*"), key=lambda path: len(path.parts), reverse=True) - - for entry in entries: - try: - if entry.is_dir() and not entry.is_symlink(): - entry.rmdir() - else: - entry.unlink() - except FileNotFoundError: - continue - except PermissionError: - locked_paths.append(str(entry)) - except OSError as exc: - if entry.is_dir(): - continue - errors.append(f"{entry}: {exc}") - - try: - target.rmdir() - except FileNotFoundError: - pass - except PermissionError: - locked_paths.append(str(target)) - except OSError: - pass - - return { - "removed": not target.exists(), - "partial": target.exists(), - "locked_paths": sorted(set(locked_paths)), - "errors": errors, - "remaining_path": str(target) if target.exists() else None, - } - - -def _get_index_root() -> Path: - """Get the index root directory from config or default. - - Priority order: - 1. CODEXLENS_INDEX_DIR environment variable - 2. index_dir from ~/.codexlens/config.json - 3. 
Default: ~/.codexlens/indexes - """ - env_override = os.getenv("CODEXLENS_INDEX_DIR") - if env_override: - return Path(env_override).expanduser().resolve() - - # Read from config.json - config_file = Path.home() / ".codexlens" / "config.json" - if config_file.exists(): - try: - cfg = json.loads(config_file.read_text(encoding="utf-8")) - if "index_dir" in cfg: - return Path(cfg["index_dir"]).expanduser().resolve() - except (json.JSONDecodeError, OSError): - pass # Fall through to default - - return Path.home() / ".codexlens" / "indexes" - - -def _get_registry_path() -> Path: - """Get the registry database path.""" - env_override = os.getenv("CODEXLENS_DATA_DIR") - if env_override: - return Path(env_override).expanduser().resolve() / "registry.db" - return Path.home() / ".codexlens" / "registry.db" - - -@index_app.command("init") -def index_init( - path: Path = typer.Argument(Path("."), exists=True, file_okay=False, dir_okay=True, help="Project root to index."), - language: Optional[List[str]] = typer.Option( - None, - "--language", - "-l", - help="Limit indexing to specific languages (repeat or comma-separated).", - ), - workers: Optional[int] = typer.Option(None, "--workers", "-w", min=1, help="Parallel worker processes (default: auto-detect based on CPU count)."), - force: bool = typer.Option(False, "--force", "-f", help="Force full reindex (skip incremental mode)."), - no_embeddings: bool = typer.Option(False, "--no-embeddings", help="Skip automatic embedding generation (if semantic deps installed)."), - backend: Optional[str] = typer.Option(None, "--backend", "-b", help="Embedding backend: fastembed (local) or litellm (remote API). Defaults to settings.json config."), - model: Optional[str] = typer.Option(None, "--model", "-m", help="Embedding model: profile name for fastembed or model name for litellm. 
Defaults to settings.json config."), - use_astgrep: bool = typer.Option( - False, - "--use-astgrep", - help="Prefer ast-grep parsers when available (experimental). Overrides settings.json config.", - ), - no_use_astgrep: bool = typer.Option( - False, - "--no-use-astgrep", - help="Disable ast-grep parsers. Overrides settings.json config.", - ), - static_graph: bool = typer.Option( - False, - "--static-graph", - help="Persist global relationships during indexing for static graph expansion. Overrides settings.json config.", - ), - no_static_graph: bool = typer.Option( - False, - "--no-static-graph", - help="Disable persisting global relationships. Overrides settings.json config.", - ), - static_graph_types: Optional[str] = typer.Option( - None, - "--static-graph-types", - help="Comma-separated relationship types to persist: imports,inherits,calls. Overrides settings.json config.", - ), - max_workers: int = typer.Option(1, "--max-workers", min=1, help="Max concurrent API calls for embedding generation. Recommended: 4-8 for litellm backend."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."), -) -> None: - """Initialize or rebuild the index for a directory. - - Indexes are stored in ~/.codexlens/indexes/ with mirrored directory structure. - Set CODEXLENS_INDEX_DIR to customize the index location. - - By default, uses incremental indexing (skip unchanged files). - Use --force to rebuild all files regardless of modification time. - - If semantic search dependencies are installed, automatically generates embeddings - after indexing completes. Use --no-embeddings to skip this step. 
- - Backend Options (--backend): - - fastembed: Local ONNX-based embeddings (default, no API calls) - - litellm: Remote API embeddings via ccw-litellm (requires API keys) - - Model Options (--model): - - For fastembed backend: Use profile names (fast, code, multilingual, balanced) - - For litellm backend: Use model names (e.g., text-embedding-3-small, text-embedding-ada-002) - """ - _configure_logging(verbose, json_mode) - config = Config() - - # Fallback to settings.json config if CLI params not provided - config.load_settings() # Ensure settings are loaded - - # Apply CLI overrides for parsing/indexing behavior - if use_astgrep and no_use_astgrep: - _fail_mutually_exclusive("--use-astgrep", "--no-use-astgrep", json_mode) - if use_astgrep: - config.use_astgrep = True - elif no_use_astgrep: - config.use_astgrep = False - - if static_graph and no_static_graph: - _fail_mutually_exclusive("--static-graph", "--no-static-graph", json_mode) - if static_graph: - config.static_graph_enabled = True - elif no_static_graph: - config.static_graph_enabled = False - if static_graph_types is not None: - allowed = {"imports", "inherits", "calls"} - parsed = [ - t.strip().lower() - for t in static_graph_types.split(",") - if t.strip() - ] - invalid = [t for t in parsed if t not in allowed] - if invalid: - msg = ( - "Invalid --static-graph-types. Must be a comma-separated list of: " - f"{', '.join(sorted(allowed))}. 
Got: {invalid}" - ) - if json_mode: - print_json(success=False, error=msg) - else: - console.print(f"[red]Error:[/red] {msg}") - raise typer.Exit(code=1) - if parsed: - config.static_graph_relationship_types = parsed - - actual_backend = backend or config.embedding_backend - actual_model = model or config.embedding_model - - languages = _parse_languages(language) - base_path = path.expanduser().resolve() - - registry: RegistryStore | None = None - try: - registry = RegistryStore() - registry.initialize() - mapper = PathMapper() - - builder = IndexTreeBuilder(registry, mapper, config, incremental=not force) - - if force: - console.print(f"[bold]Building index for:[/bold] {base_path} [yellow](FULL reindex)[/yellow]") - else: - console.print(f"[bold]Building index for:[/bold] {base_path} [dim](incremental)[/dim]") - - build_result = builder.build( - source_root=base_path, - languages=languages, - workers=workers, - force_full=force, - ) - - result = { - "path": str(base_path), - "files_indexed": build_result.total_files, - "dirs_indexed": build_result.total_dirs, - "index_root": str(build_result.index_root), - "project_id": build_result.project_id, - "languages": languages or sorted(config.supported_languages.keys()), - "errors": len(build_result.errors), - } - - if not json_mode: - console.print(f"[green]OK[/green] Indexed [bold]{build_result.total_files}[/bold] files in [bold]{build_result.total_dirs}[/bold] directories") - console.print(f" Index root: {build_result.index_root}") - if build_result.errors: - console.print(f" [yellow]Warnings:[/yellow] {len(build_result.errors)} errors") - - # Auto-generate embeddings if the requested backend is available - if not no_embeddings: - try: - from codexlens.semantic import is_embedding_backend_available - from codexlens.cli.embedding_manager import generate_embeddings_recursive, get_embeddings_status - - # Validate embedding backend - valid_backends = ["fastembed", "litellm"] - if actual_backend not in valid_backends: - 
error_msg = f"Invalid embedding backend: {actual_backend}. Must be one of: {', '.join(valid_backends)}" - if json_mode: - print_json(success=False, error=error_msg) - else: - console.print(f"[red]Error:[/red] {error_msg}") - raise typer.Exit(code=1) - - backend_available, backend_error = is_embedding_backend_available(actual_backend) - - if backend_available: - # Use the index root directory (not the _index.db file) - index_root = Path(build_result.index_root) - - if not json_mode: - console.print("\n[bold]Generating embeddings...[/bold]") - console.print(f"Backend: [cyan]{actual_backend}[/cyan]") - console.print(f"Model: [cyan]{actual_model}[/cyan]") - else: - # Output progress message for JSON mode (parsed by Node.js) - print("Generating embeddings...", flush=True) - - # Progress callback - outputs progress for both json and non-json modes - # Node.js parseProgressLine() expects formats like: - # - "Batch X: N files, M chunks" - # - "Processing N files" - # - "Finalizing index" - def progress_update(msg: str): - if json_mode: - # Output without prefix so Node.js can parse it - # Strip leading spaces that embedding_manager adds - print(msg.strip(), flush=True) - elif verbose: - console.print(f" {msg}") - - embed_result = generate_embeddings_recursive( - index_root, - embedding_backend=actual_backend, - model_profile=actual_model, - force=False, # Don't force regenerate during init - chunk_size=2000, - progress_callback=progress_update, # Always use callback - max_workers=max_workers, - ) - - if embed_result["success"]: - embed_data = embed_result["result"] - - # Output completion message for Node.js to parse - if json_mode: - print(f"Embeddings complete: {embed_data['total_chunks_created']} chunks", flush=True) - - # Get comprehensive coverage statistics - status_result = get_embeddings_status(index_root) - if status_result["success"]: - coverage = status_result["result"] - result["embeddings"] = { - "generated": True, - "total_indexes": coverage["total_indexes"], 
- "total_files": coverage["total_files"], - "files_with_embeddings": coverage["files_with_embeddings"], - "coverage_percent": coverage["coverage_percent"], - "total_chunks": coverage["total_chunks"], - } - else: - result["embeddings"] = { - "generated": True, - "total_chunks": embed_data["total_chunks_created"], - "files_processed": embed_data["total_files_processed"], - } - - if not json_mode: - console.print(f"[green]✓[/green] Generated embeddings for [bold]{embed_data['total_files_processed']}[/bold] files") - console.print(f" Total chunks: [bold]{embed_data['total_chunks_created']}[/bold]") - console.print(f" Indexes processed: [bold]{embed_data['indexes_successful']}/{embed_data['indexes_processed']}[/bold]") - else: - if not json_mode: - error_msg = _extract_embedding_error(embed_result) - console.print(f"[yellow]Warning:[/yellow] Embedding generation failed: {error_msg}") - result["embeddings"] = { - "generated": False, - "error": _extract_embedding_error(embed_result), - } - else: - if not json_mode and verbose: - console.print(f"[dim]Embedding backend '{actual_backend}' not available. 
Skipping embeddings.[/dim]") - result["embeddings"] = { - "generated": False, - "error": backend_error or "Embedding backend not available", - } - except Exception as e: - if not json_mode and verbose: - console.print(f"[yellow]Warning:[/yellow] Could not generate embeddings: {e}") - result["embeddings"] = { - "generated": False, - "error": str(e), - } - else: - result["embeddings"] = { - "generated": False, - "error": "Skipped (--no-embeddings)", - } - - # Output final JSON result with embeddings status - if json_mode: - print_json(success=True, result=result) - - except StorageError as exc: - if json_mode: - print_json(success=False, error=f"Storage error: {exc}") - else: - console.print(f"[red]Init failed (storage):[/red] {exc}") - raise typer.Exit(code=1) - except ConfigError as exc: - if json_mode: - print_json(success=False, error=f"Configuration error: {exc}") - else: - console.print(f"[red]Init failed (config):[/red] {exc}") - raise typer.Exit(code=1) - except ParseError as exc: - if json_mode: - print_json(success=False, error=f"Parse error: {exc}") - else: - console.print(f"[red]Init failed (parse):[/red] {exc}") - raise typer.Exit(code=1) - except PermissionError as exc: - if json_mode: - print_json(success=False, error=f"Permission denied: {exc}") - else: - console.print(f"[red]Init failed (permission denied):[/red] {exc}") - raise typer.Exit(code=1) - except CodexLensError as exc: - if json_mode: - print_json(success=False, error=str(exc)) - else: - console.print(f"[red]Init failed:[/red] {exc}") - raise typer.Exit(code=1) - finally: - if registry is not None: - registry.close() - - -@app.command() -def watch( - path: Path = typer.Argument(Path("."), exists=True, file_okay=False, dir_okay=True, help="Project root to watch."), - language: Optional[List[str]] = typer.Option( - None, - "--language", - "-l", - help="Limit watching to specific languages (repeat or comma-separated).", - ), - debounce: int = typer.Option(1000, "--debounce", "-d", min=100, 
max=10000, help="Debounce interval in milliseconds."), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose logging."), -) -> None: - """Watch directory for changes and update index incrementally. - - Monitors filesystem events and automatically updates the index - when files are created, modified, or deleted. - - The directory must already be indexed (run 'codexlens init' first). - - Press Ctrl+C to stop watching. - - Examples: - codexlens watch . - codexlens watch /path/to/project --debounce 500 --verbose - codexlens watch . --language python,typescript - """ - _configure_logging(verbose) - - from codexlens.watcher.events import IndexResult - - base_path = path.expanduser().resolve() - - # Check if path is indexed - mapper = PathMapper() - index_db = mapper.source_to_index_db(base_path) - if not index_db.exists(): - console.print(f"[red]Error:[/red] Directory not indexed: {base_path}") - console.print("Run 'codexlens init' first to create the index.") - raise typer.Exit(code=1) - - # Parse languages - languages = _parse_languages(language) - - # Create watcher config - watcher_config = WatcherConfig( - debounce_ms=debounce, - languages=languages, - ) - - # Callback for indexed files - def on_indexed(result: IndexResult) -> None: - if result.files_indexed > 0: - console.print(f" [green]Indexed:[/green] {result.files_indexed} files ({result.symbols_added} symbols)") - if result.files_removed > 0: - console.print(f" [yellow]Removed:[/yellow] {result.files_removed} files") - if result.errors: - for error in result.errors[:3]: # Show first 3 errors - console.print(f" [red]Error:[/red] {error}") - - console.print(f"[bold]Watching:[/bold] {base_path}") - console.print(f" Debounce: {debounce}ms") - if languages: - console.print(f" Languages: {', '.join(languages)}") - console.print(" Press Ctrl+C to stop.\n") - - manager: WatcherManager | None = None - try: - watch_config = Config.load() - manager = WatcherManager( - root_path=base_path, - 
config=watch_config, - watcher_config=watcher_config, - on_indexed=on_indexed, - ) - manager.start() - manager.wait() - except KeyboardInterrupt: - pass - except Exception as exc: - console.print(f"[red]Error:[/red] {exc}") - raise typer.Exit(code=1) - finally: - if manager is not None: - manager.stop() - console.print("\n[dim]Watcher stopped.[/dim]") - - -@app.command() -def search( - query: str = typer.Argument(..., help="Search query."), - path: Path = typer.Option(Path("."), "--path", "-p", help="Directory to search from."), - limit: int = typer.Option(20, "--limit", "-n", min=1, max=500, help="Max results."), - offset: int = typer.Option(0, "--offset", min=0, help="Pagination offset - skip first N results."), - depth: int = typer.Option(-1, "--depth", "-d", help="Search depth (-1 = unlimited, 0 = current only)."), - files_only: bool = typer.Option(False, "--files-only", "-f", help="Return only file paths without content snippets."), - method: str = typer.Option("auto", "--method", "-m", help="Search method: 'auto' (intent-aware, default), 'dense_rerank' (semantic), 'fts' (exact keyword)."), - use_fuzzy: bool = typer.Option(False, "--use-fuzzy", help="Enable fuzzy matching in FTS method."), - code_only: bool = typer.Option(False, "--code-only", help="Only return code files (excludes md, txt, json, yaml, xml, etc.)."), - exclude_extensions: Optional[str] = typer.Option(None, "--exclude-extensions", help="Comma-separated list of file extensions to exclude (e.g., 'md,txt,json')."), - # Hidden advanced options for backward compatibility - weights: Optional[str] = typer.Option( - None, - "--weights", "-w", - hidden=True, - help="[Advanced] RRF weights as key=value pairs." - ), - cascade_strategy: Optional[str] = typer.Option( - None, - "--cascade-strategy", - hidden=True, - help="[Advanced] Cascade strategy for --method cascade." 
- ), - staged_stage2_mode: Optional[str] = typer.Option( - None, - "--staged-stage2-mode", - hidden=True, - help="[Advanced] Stage 2 expansion mode for cascade strategy 'staged': precomputed | realtime | static_global_graph.", - ), - # Hidden deprecated parameter for backward compatibility - mode: Optional[str] = typer.Option(None, "--mode", hidden=True, help="[DEPRECATED] Use --method instead."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."), -) -> None: - """Search indexed file contents. - - Uses chain search across directory indexes. - Use --depth to limit search recursion (0 = current dir only). - - Search Methods: - - auto (default): Intent-aware routing. KEYWORD -> fts, MIXED -> hybrid, - SEMANTIC -> dense_rerank. - - dense_rerank: Semantic search using Dense embedding coarse retrieval + - Cross-encoder reranking. Best for natural language queries and code understanding. - - fts: Full-text search using FTS5 (unicode61 tokenizer). Best for exact code - identifiers like function/class names. Use --use-fuzzy for typo tolerance. - - Method Selection Guide: - - Code identifiers (function/class names): auto or fts - - Natural language queries: auto or dense_rerank - - Typo-tolerant search: fts --use-fuzzy - - Requirements: - The dense_rerank method requires pre-generated embeddings. - Use 'codexlens embeddings-generate' to create embeddings first. 
- - Examples: - # Default intent-aware search - codexlens search "authentication logic" - - # Exact code identifier search - codexlens search "authenticate_user" --method fts - - # Typo-tolerant fuzzy search - codexlens search "authentcate" --method fts --use-fuzzy - """ - _configure_logging(verbose, json_mode) - search_path = path.expanduser().resolve() - - # Handle deprecated --mode parameter - actual_method = method - if mode is not None: - # Show deprecation warning - if not json_mode: - console.print("[yellow]Warning: --mode is deprecated, use --method instead.[/yellow]") - - # Map old mode values to new method values - mode_to_method = { - "auto": "auto", - "exact": "fts", - "fuzzy": "fts", # with use_fuzzy=True - "hybrid": "hybrid", - "vector": "vector", - "pure-vector": "vector", - } - - if mode in mode_to_method: - actual_method = mode_to_method[mode] - # Enable fuzzy for old fuzzy mode - if mode == "fuzzy": - use_fuzzy = True - else: - if json_mode: - print_json(success=False, error=f"Invalid deprecated mode: {mode}. Use --method instead.") - else: - console.print(f"[red]Invalid deprecated mode:[/red] {mode}") - console.print("[dim]Use --method with: fts, vector, hybrid, cascade[/dim]") - raise typer.Exit(code=1) - - # Configure search (load settings from file) - config = Config.load() - - # Validate method - simplified interface exposes only dense_rerank and fts - # Other methods (vector, hybrid, cascade) are hidden but still work for backward compatibility - valid_methods = ["auto", "fts", "dense_rerank", "vector", "hybrid", "cascade"] - if actual_method not in valid_methods: - if json_mode: - print_json(success=False, error=f"Invalid method: {actual_method}. 
Use 'auto', 'dense_rerank', or 'fts'.") - else: - console.print(f"[red]Invalid method:[/red] {actual_method}") - console.print("[dim]Use 'auto' (default), 'dense_rerank' (semantic), or 'fts' (exact keyword)[/dim]") - raise typer.Exit(code=1) - - resolved_method = ( - _auto_select_search_method(query) - if actual_method == "auto" - else actual_method - ) - display_method = resolved_method - execution_method = resolved_method - - # Map dense_rerank to cascade method internally - internal_cascade_strategy = cascade_strategy - if execution_method == "dense_rerank": - execution_method = "cascade" - internal_cascade_strategy = "dense_rerank" - - # Validate cascade_strategy if provided (for advanced users) - if internal_cascade_strategy is not None: - valid_strategies = ["binary", "hybrid", "binary_rerank", "dense_rerank", "staged"] - if internal_cascade_strategy not in valid_strategies: - if json_mode: - print_json(success=False, error=f"Invalid cascade strategy: {internal_cascade_strategy}. Must be one of: {', '.join(valid_strategies)}") - else: - console.print(f"[red]Invalid cascade strategy:[/red] {internal_cascade_strategy}") - console.print(f"[dim]Valid strategies: {', '.join(valid_strategies)}[/dim]") - raise typer.Exit(code=1) - - # Parse custom weights if provided - hybrid_weights = None - if weights: - try: - # Check if using key=value format (new) or legacy comma-separated format - if "=" in weights: - # New format: exact=0.3,fuzzy=0.1,vector=0.6 - weight_dict = {} - for pair in weights.split(","): - if "=" in pair: - key, val = pair.split("=", 1) - weight_dict[key.strip()] = float(val.strip()) - else: - raise ValueError("Mixed format not supported - use all key=value pairs") - - # Validate and normalize weights - weight_sum = sum(weight_dict.values()) - if abs(weight_sum - 1.0) > 0.01: - if not json_mode: - console.print(f"[yellow]Warning: Weights sum to {weight_sum:.2f}, should sum to 1.0. 
Normalizing...[/yellow]") - weight_dict = {k: v / weight_sum for k, v in weight_dict.items()} - - hybrid_weights = weight_dict - else: - # Legacy format: 0.3,0.1,0.6 (exact,fuzzy,vector) - weight_parts = [float(w.strip()) for w in weights.split(",")] - if len(weight_parts) == 3: - weight_sum = sum(weight_parts) - if abs(weight_sum - 1.0) > 0.01: - if not json_mode: - console.print(f"[yellow]Warning: Weights sum to {weight_sum:.2f}, should sum to 1.0. Normalizing...[/yellow]") - weight_parts = [w / weight_sum for w in weight_parts] - hybrid_weights = { - "exact": weight_parts[0], - "fuzzy": weight_parts[1], - "vector": weight_parts[2], - } - else: - if not json_mode: - console.print("[yellow]Warning: Invalid weights format. Using defaults.[/yellow]") - except ValueError as e: - if not json_mode: - console.print(f"[yellow]Warning: Invalid weights format ({e}). Using defaults.[/yellow]") - - registry: RegistryStore | None = None - try: - registry = RegistryStore() - registry.initialize() - mapper = PathMapper() - - engine = ChainSearchEngine(registry, mapper, config=config) - - # Optional staged cascade overrides (only meaningful for cascade strategy 'staged') - if staged_stage2_mode is not None: - stage2 = staged_stage2_mode.strip().lower() - if stage2 not in {"precomputed", "realtime", "static_global_graph"}: - msg = "Invalid --staged-stage2-mode. Must be: precomputed | realtime | static_global_graph." 
- if json_mode: - print_json(success=False, error=msg) - else: - console.print(f"[red]{msg}[/red]") - raise typer.Exit(code=1) - config.staged_stage2_mode = stage2 - - # Map method to SearchOptions flags - # fts: FTS-only search (optionally with fuzzy) - # vector: Pure vector semantic search - # hybrid: RRF fusion of sparse + dense - # cascade: Two-stage binary + dense retrieval - if execution_method == "fts": - hybrid_mode = False - enable_fuzzy = use_fuzzy - enable_vector = False - pure_vector = False - enable_cascade = False - elif execution_method == "vector": - hybrid_mode = True - enable_fuzzy = False - enable_vector = True - pure_vector = True - enable_cascade = False - elif execution_method == "hybrid": - hybrid_mode = True - enable_fuzzy = use_fuzzy - enable_vector = True - pure_vector = False - enable_cascade = False - elif execution_method == "cascade": - hybrid_mode = True - enable_fuzzy = False - enable_vector = True - pure_vector = False - enable_cascade = True - else: - raise ValueError(f"Invalid method: {execution_method}") - - # Parse exclude_extensions from comma-separated string - exclude_exts_list = None - if exclude_extensions: - exclude_exts_list = [ext.strip() for ext in exclude_extensions.split(',') if ext.strip()] - - options = SearchOptions( - depth=depth, - total_limit=limit, - offset=offset, - files_only=files_only, - code_only=code_only, - exclude_extensions=exclude_exts_list, - hybrid_mode=hybrid_mode, - enable_fuzzy=enable_fuzzy, - enable_vector=enable_vector, - pure_vector=pure_vector, - enable_cascade=enable_cascade, - hybrid_weights=hybrid_weights, - ) - - if files_only: - file_paths = engine.search_files_only(query, search_path, options) - payload = {"query": query, "count": len(file_paths), "files": file_paths} - if json_mode: - print_json(success=True, result=payload) - else: - for fp in file_paths: - console.print(fp) - else: - # Dispatch to cascade_search for cascade method - if execution_method == "cascade": - result = 
engine.cascade_search(query, search_path, k=limit, options=options, strategy=internal_cascade_strategy) - else: - result = engine.search(query, search_path, options) - effective_results = result.results - effective_files_matched = result.stats.files_matched - effective_time_ms = result.stats.time_ms - fallback_payload = None - if display_method == "fts" and not use_fuzzy and not effective_results: - fallback_payload = _filesystem_fallback_search( - query, - search_path, - limit=limit, - config=config, - code_only=code_only, - exclude_extensions=exclude_exts_list, - ) - if fallback_payload is not None: - effective_results = fallback_payload["results"] - effective_files_matched = len(effective_results) - effective_time_ms = result.stats.time_ms + float(fallback_payload["time_ms"]) - - results_list = [ - { - "path": r.path, - "score": r.score, - "excerpt": r.excerpt, - "content": r.content, # Full function/class body - "source": getattr(r, "search_source", None), - "symbol": getattr(r, "symbol", None), - } - for r in effective_results - ] - - payload = { - "query": query, - "method": display_method, - "count": len(results_list), - "results": results_list, - "stats": { - "dirs_searched": result.stats.dirs_searched, - "files_matched": effective_files_matched, - "time_ms": effective_time_ms, - }, - } - if fallback_payload is not None: - payload["fallback"] = fallback_payload["fallback"] - if json_mode: - print_json(success=True, result=payload) - else: - render_search_results(effective_results, verbose=verbose) - if fallback_payload is not None: - console.print("[yellow]No indexed matches found; showing filesystem fallback results (stale index suspected).[/yellow]") - console.print(f"[dim]Method: {display_method} | Searched {result.stats.dirs_searched} directories in {effective_time_ms:.1f}ms[/dim]") - - except SearchError as exc: - if json_mode: - print_json(success=False, error=f"Search error: {exc}") - else: - console.print(f"[red]Search failed (query):[/red] {exc}") 
- raise typer.Exit(code=1) - except StorageError as exc: - if json_mode: - print_json(success=False, error=f"Storage error: {exc}") - else: - console.print(f"[red]Search failed (storage):[/red] {exc}") - raise typer.Exit(code=1) - except CodexLensError as exc: - if json_mode: - print_json(success=False, error=str(exc)) - else: - console.print(f"[red]Search failed:[/red] {exc}") - raise typer.Exit(code=1) - finally: - if registry is not None: - registry.close() - - -@app.command() -def symbol( - name: str = typer.Argument(..., help="Symbol name to look up."), - path: Path = typer.Option(Path("."), "--path", "-p", help="Directory to search from."), - kind: Optional[str] = typer.Option( - None, - "--kind", - "-k", - help="Filter by kind (function|class|method).", - ), - limit: int = typer.Option(50, "--limit", "-n", min=1, max=500, help="Max symbols."), - depth: int = typer.Option(-1, "--depth", "-d", help="Search depth (-1 = unlimited)."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."), -) -> None: - """Look up symbols by name and optional kind.""" - _configure_logging(verbose, json_mode) - search_path = path.expanduser().resolve() - - registry: RegistryStore | None = None - try: - registry = RegistryStore() - registry.initialize() - mapper = PathMapper() - - engine = ChainSearchEngine(registry, mapper, config=config) - options = SearchOptions(depth=depth, total_limit=limit) - - syms = engine.search_symbols(name, search_path, kind=kind, options=options) - - payload = {"name": name, "kind": kind, "count": len(syms), "symbols": syms} - if json_mode: - print_json(success=True, result=payload) - else: - render_symbols(syms) - - except SearchError as exc: - if json_mode: - print_json(success=False, error=f"Search error: {exc}") - else: - console.print(f"[red]Symbol lookup failed (search):[/red] {exc}") - raise typer.Exit(code=1) - except StorageError as 
exc: - if json_mode: - print_json(success=False, error=f"Storage error: {exc}") - else: - console.print(f"[red]Symbol lookup failed (storage):[/red] {exc}") - raise typer.Exit(code=1) - except CodexLensError as exc: - if json_mode: - print_json(success=False, error=str(exc)) - else: - console.print(f"[red]Symbol lookup failed:[/red] {exc}") - raise typer.Exit(code=1) - finally: - if registry is not None: - registry.close() - - -@app.command() -def inspect( - file: Path = typer.Argument(..., exists=True, dir_okay=False, help="File to analyze."), - symbols: bool = typer.Option(False, "--symbols", help="Show discovered symbols (default)."), - no_symbols: bool = typer.Option(False, "--no-symbols", help="Hide discovered symbols."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."), -) -> None: - """Analyze a single file and display symbols.""" - _configure_logging(verbose, json_mode) - if symbols and no_symbols: - _fail_mutually_exclusive("--symbols", "--no-symbols", json_mode) - show_symbols = True if (symbols or not no_symbols) else False - config = Config.load() - factory = ParserFactory(config) - - file_path = file.expanduser().resolve() - try: - text = file_path.read_text(encoding="utf-8", errors="ignore") - language_id = config.language_for_path(file_path) or "unknown" - parser = factory.get_parser(language_id) - indexed = parser.parse(text, file_path) - payload = {"file": indexed, "content_lines": len(text.splitlines())} - if json_mode: - print_json(success=True, result=payload) - else: - if show_symbols: - render_file_inspect(indexed.path, indexed.language, indexed.symbols) - else: - render_status({"file": indexed.path, "language": indexed.language}) - except ParseError as exc: - if json_mode: - print_json(success=False, error=f"Parse error: {exc}") - else: - console.print(f"[red]Inspect failed (parse):[/red] {exc}") - raise typer.Exit(code=1) - 
except FileNotFoundError as exc: - if json_mode: - print_json(success=False, error=f"File not found: {exc}") - else: - console.print(f"[red]Inspect failed (file not found):[/red] {exc}") - raise typer.Exit(code=1) - except PermissionError as exc: - if json_mode: - print_json(success=False, error=f"Permission denied: {exc}") - else: - console.print(f"[red]Inspect failed (permission denied):[/red] {exc}") - raise typer.Exit(code=1) - except CodexLensError as exc: - if json_mode: - print_json(success=False, error=str(exc)) - else: - console.print(f"[red]Inspect failed:[/red] {exc}") - raise typer.Exit(code=1) - - -@app.command() -def status( - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."), -) -> None: - """Show index status and configuration.""" - _configure_logging(verbose, json_mode) - - registry: RegistryStore | None = None - try: - registry = RegistryStore() - registry.initialize() - mapper = PathMapper() - - # Get all projects - projects = registry.list_projects() - - # Calculate total stats - total_files = sum(p.total_files for p in projects) - total_dirs = sum(p.total_dirs for p in projects) - - # Get index root size - index_root = mapper.index_root - index_size = 0 - if index_root.exists(): - for f in index_root.rglob("*"): - if f.is_file(): - index_size += f.stat().st_size - - # Check schema version and enabled features - schema_version = None - has_dual_fts = False - if projects and index_root.exists(): - # Check first index database for features - index_files = list(index_root.rglob("_index.db")) - if index_files: - try: - with DirIndexStore(index_files[0]) as store: - with store._lock: - conn = store._get_connection() - schema_version = store._get_schema_version(conn) - # Check if dual FTS tables exist - cursor = conn.execute( - "SELECT name FROM sqlite_master WHERE type='table' AND name IN ('files_fts_exact', 'files_fts_fuzzy')" - ) - 
fts_tables = [row[0] for row in cursor.fetchall()] - has_dual_fts = len(fts_tables) == 2 - except Exception: - pass - - # Check embeddings coverage - embeddings_info = None - has_vector_search = False - try: - from codexlens.cli.embedding_manager import get_embeddings_status - - if index_root.exists(): - embed_status = get_embeddings_status(index_root) - if embed_status["success"]: - embeddings_info = embed_status["result"] - # Enable vector search if coverage >= 50% - has_vector_search = embeddings_info["coverage_percent"] >= 50.0 - except ImportError: - # Embedding manager not available - pass - except Exception as e: - logging.debug(f"Failed to get embeddings status: {e}") - - stats = { - "index_root": str(index_root), - "registry_path": str(_get_registry_path()), - "projects_count": len(projects), - "total_files": total_files, - "total_dirs": total_dirs, - "index_size_bytes": index_size, - "index_size_mb": round(index_size / (1024 * 1024), 2), - "schema_version": schema_version, - "features": { - "exact_fts": True, # Always available - "fuzzy_fts": has_dual_fts, - "hybrid_search": has_dual_fts, - "vector_search": has_vector_search, - }, - } - - # Add embeddings info if available - if embeddings_info: - stats["embeddings"] = embeddings_info - - if json_mode: - print_json(success=True, result=stats) - else: - console.print("[bold]CodexLens Status[/bold]") - console.print(f" Index Root: {stats['index_root']}") - console.print(f" Registry: {stats['registry_path']}") - console.print(f" Projects: {stats['projects_count']}") - console.print(f" Total Files: {stats['total_files']}") - console.print(f" Total Directories: {stats['total_dirs']}") - console.print(f" Index Size: {stats['index_size_mb']} MB") - if schema_version: - console.print(f" Schema Version: {schema_version}") - console.print("\n[bold]Search Backends:[/bold]") - console.print(f" Exact FTS: ✓ (unicode61)") - if has_dual_fts: - console.print(f" Fuzzy FTS: ✓ (trigram)") - console.print(f" Hybrid Search: ✓ 
(RRF fusion)") - else: - console.print(f" Fuzzy FTS: ✗ (run 'migrate' to enable)") - console.print(f" Hybrid Search: ✗ (run 'migrate' to enable)") - - if has_vector_search: - console.print(f" Vector Search: ✓ (embeddings available)") - else: - console.print(f" Vector Search: ✗ (no embeddings or coverage < 50%)") - - # Display embeddings statistics if available - if embeddings_info: - console.print("\n[bold]Embeddings Coverage:[/bold]") - console.print(f" Total Indexes: {embeddings_info['total_indexes']}") - console.print(f" Total Files: {embeddings_info['total_files']}") - console.print(f" Files with Embeddings: {embeddings_info['files_with_embeddings']}") - console.print(f" Coverage: {embeddings_info['coverage_percent']:.1f}%") - console.print(f" Total Chunks: {embeddings_info['total_chunks']}") - - # Display model information if available - model_info = embeddings_info.get('model_info') - if model_info: - console.print("\n[bold]Embedding Model:[/bold]") - console.print(f" Backend: [cyan]{model_info.get('backend', 'unknown')}[/cyan]") - console.print(f" Model: [cyan]{model_info.get('model_profile', 'unknown')}[/cyan] ({model_info.get('model_name', '')})") - console.print(f" Dimensions: {model_info.get('embedding_dim', 'unknown')}") - if model_info.get('updated_at'): - console.print(f" Last Updated: {model_info['updated_at']}") - - except StorageError as exc: - if json_mode: - print_json(success=False, error=f"Storage error: {exc}") - else: - console.print(f"[red]Status failed (storage):[/red] {exc}") - raise typer.Exit(code=1) - except CodexLensError as exc: - if json_mode: - print_json(success=False, error=str(exc)) - else: - console.print(f"[red]Status failed:[/red] {exc}") - raise typer.Exit(code=1) - finally: - if registry is not None: - registry.close() - - -@app.command(name="lsp-status") -def lsp_status( - path: Path = typer.Option(Path("."), "--path", "-p", help="Workspace root for LSP probing."), - probe_file: Optional[Path] = typer.Option( - None, - 
"--probe-file", - help="Optional file path to probe (starts the matching language server and prints capabilities).", - ), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."), -) -> None: - """Show standalone LSP configuration and optionally probe a language server. - - This exercises the existing LSP server selection/startup path in StandaloneLspManager. - """ - _configure_logging(verbose, json_mode) - - import asyncio - import shutil - - from codexlens.lsp.standalone_manager import StandaloneLspManager - - workspace_root = path.expanduser().resolve() - probe_path = probe_file.expanduser().resolve() if probe_file is not None else None - - async def _run(): - manager = StandaloneLspManager(workspace_root=str(workspace_root)) - await manager.start() - - servers = [] - for language_id, cfg in sorted(manager._configs.items()): # type: ignore[attr-defined] - cmd0 = cfg.command[0] if cfg.command else None - servers.append( - { - "language_id": language_id, - "display_name": cfg.display_name, - "extensions": list(cfg.extensions), - "command": list(cfg.command), - "command_available": bool(shutil.which(cmd0)) if cmd0 else False, - } - ) - - probe = None - if probe_path is not None: - state = await manager._get_server(str(probe_path)) - if state is None: - probe = { - "file": str(probe_path), - "ok": False, - "error": "No language server configured/available for this file.", - } - else: - probe = { - "file": str(probe_path), - "ok": True, - "language_id": state.config.language_id, - "display_name": state.config.display_name, - "initialized": bool(state.initialized), - "capabilities": state.capabilities, - } - - await manager.stop() - return {"workspace_root": str(workspace_root), "servers": servers, "probe": probe} - - try: - payload = asyncio.run(_run()) - except Exception as exc: - if json_mode: - print_json(success=False, error=f"LSP status failed: {exc}") 
- else: - console.print(f"[red]LSP status failed:[/red] {exc}") - raise typer.Exit(code=1) - - if json_mode: - print_json(success=True, result=payload) - return - - console.print("[bold]CodexLens LSP Status[/bold]") - console.print(f" Workspace: {payload['workspace_root']}") - console.print("\n[bold]Configured Servers:[/bold]") - for s in payload["servers"]: - ok = "✓" if s["command_available"] else "✗" - console.print(f" {ok} {s['display_name']} ({s['language_id']}) -> {s['command'][0] if s['command'] else ''}") - console.print(f" Extensions: {', '.join(s['extensions'])}") - - if payload["probe"] is not None: - probe = payload["probe"] - console.print("\n[bold]Probe:[/bold]") - if not probe.get("ok"): - console.print(f" ✗ {probe.get('file')}") - console.print(f" {probe.get('error')}") - else: - console.print(f" ✓ {probe.get('file')}") - console.print(f" Server: {probe.get('display_name')} ({probe.get('language_id')})") - console.print(f" Initialized: {probe.get('initialized')}") - - -@app.command(name="reranker-status") -def reranker_status( - probe: bool = typer.Option( - False, - "--probe", - help="Send a small rerank request to validate connectivity and credentials.", - ), - provider: Optional[str] = typer.Option( - None, - "--provider", - help="Reranker provider: siliconflow | cohere | jina (default: from env, else siliconflow).", - ), - api_base: Optional[str] = typer.Option( - None, - "--api-base", - help="Override API base URL (e.g. 
https://api.siliconflow.cn or https://api.cohere.ai).", - ), - model: Optional[str] = typer.Option( - None, - "--model", - help="Override reranker model name (provider-specific).", - ), - query: str = typer.Option("ping", "--query", help="Probe query text (used with --probe)."), - document: str = typer.Option("pong", "--document", help="Probe document text (used with --probe)."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."), -) -> None: - """Show reranker configuration and optionally probe the API backend. - - This is the fastest way to confirm that "重排" can actually execute end-to-end. - """ - _configure_logging(verbose, json_mode) - - import time - - from codexlens.env_config import load_global_env - from codexlens.semantic.reranker.api_reranker import ( - APIReranker, - _normalize_api_base_for_endpoint, - ) - - env = load_global_env() - - def _env_get(key: str) -> Optional[str]: - return ( - os.environ.get(key) - or os.environ.get(f"CODEXLENS_{key}") - or env.get(key) - or env.get(f"CODEXLENS_{key}") - ) - - effective_provider = (provider or _env_get("RERANKER_PROVIDER") or "siliconflow").strip() - effective_api_base = (api_base or _env_get("RERANKER_API_BASE") or "").strip() or None - effective_model = (model or _env_get("RERANKER_MODEL") or "").strip() or None - - # Do not leak secrets; only report whether a key is configured. 
- key_present = bool((_env_get("RERANKER_API_KEY") or "").strip()) - - provider_key = effective_provider.strip().lower() - defaults = getattr(APIReranker, "_PROVIDER_DEFAULTS", {}).get(provider_key, {}) - endpoint = defaults.get("endpoint", "/v1/rerank") - configured_base = effective_api_base or defaults.get("api_base") or "" - normalized_base = _normalize_api_base_for_endpoint(api_base=configured_base, endpoint=endpoint) - - payload: Dict[str, Any] = { - "provider": effective_provider, - "api_base": effective_api_base, - "endpoint": endpoint, - "normalized_api_base": normalized_base or None, - "request_url": f"{normalized_base}{endpoint}" if normalized_base else None, - "model": effective_model, - "api_key_configured": key_present, - "probe": None, - } - - if probe: - t0 = time.perf_counter() - try: - reranker = APIReranker( - provider=effective_provider, - api_base=effective_api_base, - model_name=effective_model, - ) - try: - scores = reranker.score_pairs([(query, document)]) - finally: - reranker.close() - resolved_base = getattr(reranker, "api_base", None) - resolved_endpoint = getattr(reranker, "endpoint", None) - request_url = ( - f"{resolved_base}{resolved_endpoint}" - if resolved_base and resolved_endpoint - else None - ) - payload["probe"] = { - "ok": True, - "latency_ms": (time.perf_counter() - t0) * 1000.0, - "score": float(scores[0]) if scores else None, - "normalized_api_base": resolved_base, - "request_url": request_url, - } - except Exception as exc: - payload["probe"] = { - "ok": False, - "latency_ms": (time.perf_counter() - t0) * 1000.0, - "error": f"{type(exc).__name__}: {exc}", - } - - if json_mode: - print_json(success=True, result=payload) - return - - console.print("[bold]CodexLens Reranker Status[/bold]") - console.print(f" Provider: {payload['provider']}") - console.print(f" API Base: {payload['api_base'] or '(default)'}") - if payload.get("normalized_api_base"): - console.print(f" API Base (normalized): {payload['normalized_api_base']}") - 
console.print(f" Endpoint: {payload.get('endpoint')}") - if payload.get("request_url"): - console.print(f" Request URL: {payload['request_url']}") - console.print(f" Model: {payload['model'] or '(default)'}") - console.print(f" API Key: {'set' if key_present else 'missing'}") - - if payload["probe"] is not None: - probe_payload = payload["probe"] - console.print("\n[bold]Probe:[/bold]") - if probe_payload.get("ok"): - console.print(f" ✓ OK ({probe_payload.get('latency_ms'):.1f}ms)") - console.print(f" Score: {probe_payload.get('score')}") - else: - console.print(f" ✗ Failed ({probe_payload.get('latency_ms'):.1f}ms)") - console.print(f" {probe_payload.get('error')}") - - -@app.command() -def projects( - action: str = typer.Argument("list", help="Action: list, show, remove"), - project_path: Optional[Path] = typer.Argument(None, help="Project path (for show/remove)."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."), -) -> None: - """Manage registered projects in the global registry. 
- - Actions: - - list: Show all registered projects - - show : Show details for a specific project - - remove : Remove a project from the registry - """ - _configure_logging(verbose, json_mode) - - registry: RegistryStore | None = None - try: - registry = RegistryStore() - registry.initialize() - - if action == "list": - project_list = registry.list_projects() - if json_mode: - result = [ - { - "id": p.id, - "source_root": str(p.source_root), - "index_root": str(p.index_root), - "total_files": p.total_files, - "total_dirs": p.total_dirs, - "status": p.status, - } - for p in project_list - ] - print_json(success=True, result=result) - else: - if not project_list: - console.print("[yellow]No projects registered.[/yellow]") - else: - table = Table(title="Registered Projects") - table.add_column("ID", style="dim") - table.add_column("Source Root") - table.add_column("Files", justify="right") - table.add_column("Dirs", justify="right") - table.add_column("Status") - - for p in project_list: - table.add_row( - str(p.id), - str(p.source_root), - str(p.total_files), - str(p.total_dirs), - p.status, - ) - console.print(table) - - elif action == "show": - if not project_path: - raise typer.BadParameter("Project path required for 'show' action") - - project_path = project_path.expanduser().resolve() - project_info = registry.get_project(project_path) - - if not project_info: - if json_mode: - print_json(success=False, error=f"Project not found: {project_path}") - else: - console.print(f"[red]Project not found:[/red] {project_path}") - raise typer.Exit(code=1) - - if json_mode: - result = { - "id": project_info.id, - "source_root": str(project_info.source_root), - "index_root": str(project_info.index_root), - "total_files": project_info.total_files, - "total_dirs": project_info.total_dirs, - "status": project_info.status, - "created_at": project_info.created_at, - "last_indexed": project_info.last_indexed, - } - print_json(success=True, result=result) - else: - 
console.print(f"[bold]Project:[/bold] {project_info.source_root}") - console.print(f" ID: {project_info.id}") - console.print(f" Index Root: {project_info.index_root}") - console.print(f" Files: {project_info.total_files}") - console.print(f" Directories: {project_info.total_dirs}") - console.print(f" Status: {project_info.status}") - - # Show directory breakdown - dirs = registry.get_project_dirs(project_info.id) - if dirs: - console.print(f"\n [bold]Indexed Directories:[/bold] {len(dirs)}") - for d in dirs[:10]: - console.print(f" - {d.source_path.name}/ ({d.files_count} files)") - if len(dirs) > 10: - console.print(f" ... and {len(dirs) - 10} more") - - elif action == "remove": - if not project_path: - raise typer.BadParameter("Project path required for 'remove' action") - - project_path = project_path.expanduser().resolve() - removed = registry.unregister_project(project_path) - - if removed: - mapper = PathMapper() - index_root = mapper.source_to_index_dir(project_path) - if index_root.exists(): - _remove_tree_best_effort(index_root) - - if json_mode: - print_json(success=True, result={"removed": str(project_path)}) - else: - console.print(f"[green]Removed:[/green] {project_path}") - else: - if json_mode: - print_json(success=False, error=f"Project not found: {project_path}") - else: - console.print(f"[yellow]Project not found:[/yellow] {project_path}") - - else: - raise typer.BadParameter(f"Unknown action: {action}. 
Use list, show, or remove.") - - except typer.BadParameter: - raise - except StorageError as exc: - if json_mode: - print_json(success=False, error=f"Storage error: {exc}") - else: - console.print(f"[red]Projects command failed (storage):[/red] {exc}") - raise typer.Exit(code=1) - except PermissionError as exc: - if json_mode: - print_json(success=False, error=f"Permission denied: {exc}") - else: - console.print(f"[red]Projects command failed (permission denied):[/red] {exc}") - raise typer.Exit(code=1) - except CodexLensError as exc: - if json_mode: - print_json(success=False, error=str(exc)) - else: - console.print(f"[red]Projects command failed:[/red] {exc}") - raise typer.Exit(code=1) - finally: - if registry is not None: - registry.close() - - -@app.command() -def config( - action: str = typer.Argument("show", help="Action: show, set, migrate"), - key: Optional[str] = typer.Argument(None, help="Config key (for set action)."), - value: Optional[str] = typer.Argument(None, help="Config value (for set action)."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."), -) -> None: - """Manage CodexLens configuration. 
- - Actions: - - show: Display current configuration - - set : Set configuration value - - migrate : Migrate indexes to new location - - Config keys: - - index_dir: Directory to store indexes (default: ~/.codexlens/indexes) - - reranker_backend: Reranker backend (onnx, api, litellm, legacy) - - reranker_model: Reranker model name - - reranker_enabled: Enable reranking (true/false) - - reranker_top_k: Number of results to rerank - - reranker_api_provider: API provider for reranker (siliconflow, cohere, jina) - - embedding_backend: Embedding backend (fastembed, litellm) - - embedding_model: Embedding model profile or name - """ - _configure_logging(verbose, json_mode) - - config_file = Path.home() / ".codexlens" / "config.json" - - def load_config() -> Dict[str, Any]: - if config_file.exists(): - return json.loads(config_file.read_text(encoding="utf-8")) - return {} - - def save_config(cfg: Dict[str, Any]) -> None: - config_file.parent.mkdir(parents=True, exist_ok=True) - config_file.write_text(json.dumps(cfg, indent=2), encoding="utf-8") - - try: - if action == "show": - cfg = load_config() - current_index_dir = os.getenv("CODEXLENS_INDEX_DIR") or cfg.get("index_dir") or str(Path.home() / ".codexlens" / "indexes") - - result = { - "config_file": str(config_file), - "index_dir": current_index_dir, - "env_override": os.getenv("CODEXLENS_INDEX_DIR"), - } - - # Load settings.json for reranker and other runtime settings - settings_file = Path.home() / ".codexlens" / "settings.json" - if settings_file.exists(): - try: - settings = json.loads(settings_file.read_text(encoding="utf-8")) - # Extract reranker settings (flat keys for CCW compatibility) - reranker = settings.get("reranker", {}) - if reranker.get("backend"): - result["reranker_backend"] = reranker["backend"] - if reranker.get("model"): - result["reranker_model"] = reranker["model"] - if reranker.get("enabled") is not None: - result["reranker_enabled"] = reranker["enabled"] - if reranker.get("top_k"): - 
result["reranker_top_k"] = reranker["top_k"] - if reranker.get("api_provider"): - result["reranker_api_provider"] = reranker["api_provider"] - # Extract embedding settings - embedding = settings.get("embedding", {}) - if embedding.get("backend"): - result["embedding_backend"] = embedding["backend"] - if embedding.get("model"): - result["embedding_model"] = embedding["model"] - if embedding.get("auto_embed_missing") is not None: - result["embedding_auto_embed_missing"] = embedding["auto_embed_missing"] - except (json.JSONDecodeError, OSError): - pass # Settings file not readable, continue with defaults - - # Load .env overrides from global ~/.codexlens/.env - env_overrides: Dict[str, str] = {} - try: - from codexlens.env_config import load_global_env - env_overrides = load_global_env() - except ImportError: - pass - - # Apply .env overrides (highest priority) and track them - if env_overrides.get("EMBEDDING_MODEL"): - result["embedding_model"] = env_overrides["EMBEDDING_MODEL"] - result["embedding_model_source"] = ".env" - if env_overrides.get("EMBEDDING_BACKEND"): - result["embedding_backend"] = env_overrides["EMBEDDING_BACKEND"] - result["embedding_backend_source"] = ".env" - auto_embed_missing_override = env_overrides.get("CODEXLENS_AUTO_EMBED_MISSING") or env_overrides.get("AUTO_EMBED_MISSING") - if auto_embed_missing_override: - result["embedding_auto_embed_missing"] = auto_embed_missing_override.lower() in ("true", "1", "yes", "on") - result["embedding_auto_embed_missing_source"] = ".env" - if env_overrides.get("RERANKER_MODEL"): - result["reranker_model"] = env_overrides["RERANKER_MODEL"] - result["reranker_model_source"] = ".env" - if env_overrides.get("RERANKER_BACKEND"): - result["reranker_backend"] = env_overrides["RERANKER_BACKEND"] - result["reranker_backend_source"] = ".env" - if env_overrides.get("RERANKER_ENABLED"): - result["reranker_enabled"] = env_overrides["RERANKER_ENABLED"].lower() in ("true", "1", "yes", "on") - 
result["reranker_enabled_source"] = ".env" - if env_overrides.get("RERANKER_PROVIDER") or os.getenv("RERANKER_PROVIDER"): - result["reranker_api_provider"] = env_overrides.get("RERANKER_PROVIDER") or os.getenv("RERANKER_PROVIDER") - - if json_mode: - print_json(success=True, result=result) - else: - console.print("[bold]CodexLens Configuration[/bold]") - console.print(f" Config File: {result['config_file']}") - console.print(f" Index Directory: {result['index_dir']}") - if result['env_override']: - console.print(f" [dim](Override via CODEXLENS_INDEX_DIR)[/dim]") - - # Show embedding settings - console.print(f"\n[bold]Embedding[/bold]") - backend = result.get('embedding_backend', 'fastembed') - backend_source = result.get('embedding_backend_source', 'settings.json') - console.print(f" Backend: {backend} [dim]({backend_source})[/dim]") - model = result.get('embedding_model', 'code') - model_source = result.get('embedding_model_source', 'settings.json') - console.print(f" Model: {model} [dim]({model_source})[/dim]") - auto_embed_missing = result.get("embedding_auto_embed_missing", True) - auto_embed_missing_source = result.get("embedding_auto_embed_missing_source", "settings.json") - console.print(f" Auto Embed Missing: {auto_embed_missing} [dim]({auto_embed_missing_source})[/dim]") - - # Show reranker settings - console.print(f"\n[bold]Reranker[/bold]") - backend = result.get('reranker_backend', 'fastembed') - backend_source = result.get('reranker_backend_source', 'settings.json') - console.print(f" Backend: {backend} [dim]({backend_source})[/dim]") - model = result.get('reranker_model', 'N/A') - model_source = result.get('reranker_model_source', 'settings.json') - console.print(f" Model: {model} [dim]({model_source})[/dim]") - enabled = result.get('reranker_enabled', False) - enabled_source = result.get('reranker_enabled_source', 'settings.json') - console.print(f" Enabled: {enabled} [dim]({enabled_source})[/dim]") - - elif action == "set": - if not key: - raise 
typer.BadParameter("Config key required for 'set' action") - if not value: - raise typer.BadParameter("Config value required for 'set' action") - - cfg = load_config() - - if key == "index_dir": - new_path = Path(value).expanduser().resolve() - cfg["index_dir"] = str(new_path) - save_config(cfg) - - if json_mode: - print_json(success=True, result={"key": key, "value": str(new_path)}) - else: - console.print(f"[green]Set {key}=[/green] {new_path}") - console.print("[yellow]Note: Existing indexes remain at old location. Use 'config migrate' to move them.[/yellow]") - - # Handle reranker and embedding settings (stored in settings.json) - elif key in ("reranker_backend", "reranker_model", "reranker_enabled", "reranker_top_k", - "embedding_backend", "embedding_model", "embedding_auto_embed_missing", "reranker_api_provider"): - settings_file = Path.home() / ".codexlens" / "settings.json" - settings_file.parent.mkdir(parents=True, exist_ok=True) - - # Load existing settings - settings: Dict[str, Any] = {} - if settings_file.exists(): - try: - settings = json.loads(settings_file.read_text(encoding="utf-8")) - except (json.JSONDecodeError, OSError): - pass - - # Ensure nested structures exist - if "reranker" not in settings: - settings["reranker"] = {} - if "embedding" not in settings: - settings["embedding"] = {} - - # Map flat keys to nested structure - if key == "reranker_backend": - settings["reranker"]["backend"] = value - elif key == "reranker_model": - settings["reranker"]["model"] = value - elif key == "reranker_enabled": - settings["reranker"]["enabled"] = value.lower() in ("true", "1", "yes") - elif key == "reranker_top_k": - settings["reranker"]["top_k"] = int(value) - elif key == "reranker_api_provider": - settings["reranker"]["api_provider"] = value - elif key == "embedding_backend": - settings["embedding"]["backend"] = value - elif key == "embedding_model": - settings["embedding"]["model"] = value - elif key == "embedding_auto_embed_missing": - 
settings["embedding"]["auto_embed_missing"] = value.lower() in ("true", "1", "yes", "on") - - # Save settings - settings_file.write_text(json.dumps(settings, indent=2), encoding="utf-8") - - if json_mode: - print_json(success=True, result={"key": key, "value": value}) - else: - console.print(f"[green]Set {key}=[/green] {value}") - else: - raise typer.BadParameter(f"Unknown config key: {key}") - - elif action == "migrate": - if not key: - raise typer.BadParameter("New path required for 'migrate' action") - - new_path = Path(key).expanduser().resolve() - mapper = PathMapper() - old_path = mapper.index_root - - if not old_path.exists(): - if json_mode: - print_json(success=False, error="No indexes to migrate") - else: - console.print("[yellow]No indexes to migrate.[/yellow]") - return - - # Create new directory - new_path.mkdir(parents=True, exist_ok=True) - - # Count items to migrate - items = list(old_path.iterdir()) - migrated = 0 - - with Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - BarColumn(), - TextColumn("{task.completed}/{task.total}"), - TimeElapsedColumn(), - console=console, - ) as progress: - task = progress.add_task("Migrating indexes", total=len(items)) - - for item in items: - dest = new_path / item.name - if item.is_dir(): - shutil.copytree(item, dest, dirs_exist_ok=True) - else: - shutil.copy2(item, dest) - migrated += 1 - progress.advance(task) - - # Update config - cfg = load_config() - cfg["index_dir"] = str(new_path) - save_config(cfg) - - # Update registry paths - registry = RegistryStore() - registry.initialize() - registry.update_index_paths(old_path, new_path) - registry.close() - - result = { - "migrated_from": str(old_path), - "migrated_to": str(new_path), - "items_migrated": migrated, - } - - if json_mode: - print_json(success=True, result=result) - else: - console.print(f"[green]Migrated {migrated} items to:[/green] {new_path}") - console.print("[dim]Old indexes can be manually deleted after 
verifying migration.[/dim]") - - else: - raise typer.BadParameter(f"Unknown action: {action}. Use show, set, or migrate.") - - except typer.BadParameter: - raise - except ConfigError as exc: - if json_mode: - print_json(success=False, error=f"Configuration error: {exc}") - else: - console.print(f"[red]Config command failed (config):[/red] {exc}") - raise typer.Exit(code=1) - except StorageError as exc: - if json_mode: - print_json(success=False, error=f"Storage error: {exc}") - else: - console.print(f"[red]Config command failed (storage):[/red] {exc}") - raise typer.Exit(code=1) - except PermissionError as exc: - if json_mode: - print_json(success=False, error=f"Permission denied: {exc}") - else: - console.print(f"[red]Config command failed (permission denied):[/red] {exc}") - raise typer.Exit(code=1) - except CodexLensError as exc: - if json_mode: - print_json(success=False, error=str(exc)) - else: - console.print(f"[red]Config command failed:[/red] {exc}") - raise typer.Exit(code=1) - - -@app.command() -def migrate( - path: Path = typer.Argument(Path("."), exists=True, file_okay=False, dir_okay=True, help="Project root to migrate."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."), -) -> None: - """Migrate project indexes to latest schema (Dual-FTS upgrade). - - Upgrades all _index.db files in the project to schema version 4, which includes: - - Dual FTS tables (exact + fuzzy) - - Encoding detection support - - Incremental indexing metadata - - This is a safe operation that preserves all existing data. - Progress is shown during migration. 
- """ - _configure_logging(verbose, json_mode) - base_path = path.expanduser().resolve() - - registry: RegistryStore | None = None - try: - registry = RegistryStore() - registry.initialize() - mapper = PathMapper() - - # Find project - project_info = registry.get_project(base_path) - if not project_info: - raise CodexLensError(f"No index found for: {base_path}. Run 'codex-lens init' first.") - - index_dir = mapper.source_to_index_dir(base_path) - if not index_dir.exists(): - raise CodexLensError(f"Index directory not found: {index_dir}") - - # Find all _index.db files - index_files = list(index_dir.rglob("_index.db")) - - if not index_files: - if json_mode: - print_json(success=True, result={"message": "No indexes to migrate", "migrated": 0}) - else: - console.print("[yellow]No indexes found to migrate.[/yellow]") - return - - migrated_count = 0 - error_count = 0 - already_migrated = 0 - - with Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - BarColumn(), - TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), - TextColumn("({task.completed}/{task.total})"), - TimeElapsedColumn(), - console=console, - ) as progress: - task = progress.add_task(f"Migrating {len(index_files)} indexes...", total=len(index_files)) - - for db_path in index_files: - try: - store = DirIndexStore(db_path) - - # Check current version - with store._lock: - conn = store._get_connection() - current_version = store._get_schema_version(conn) - - if current_version >= DirIndexStore.SCHEMA_VERSION: - already_migrated += 1 - if verbose: - progress.console.print(f"[dim]Already migrated: {db_path.parent.name}[/dim]") - elif current_version > 0: - # Apply migrations - store._apply_migrations(conn, current_version) - store._set_schema_version(conn, DirIndexStore.SCHEMA_VERSION) - conn.commit() - migrated_count += 1 - if verbose: - progress.console.print(f"[green]Migrated: {db_path.parent.name} (v{current_version} → v{DirIndexStore.SCHEMA_VERSION})[/green]") 
- else: - # New database, initialize directly - store.initialize() - migrated_count += 1 - - store.close() - - except Exception as e: - error_count += 1 - if verbose: - progress.console.print(f"[red]Error migrating {db_path}: {e}[/red]") - - progress.update(task, advance=1) - - result = { - "path": str(base_path), - "total_indexes": len(index_files), - "migrated": migrated_count, - "already_migrated": already_migrated, - "errors": error_count, - } - - if json_mode: - print_json(success=True, result=result) - else: - console.print(f"[green]Migration complete:[/green]") - console.print(f" Total indexes: {len(index_files)}") - console.print(f" Migrated: {migrated_count}") - console.print(f" Already up-to-date: {already_migrated}") - if error_count > 0: - console.print(f" [yellow]Errors: {error_count}[/yellow]") - - except StorageError as exc: - if json_mode: - print_json(success=False, error=f"Storage error: {exc}") - else: - console.print(f"[red]Migration failed (storage):[/red] {exc}") - raise typer.Exit(code=1) - except CodexLensError as exc: - if json_mode: - print_json(success=False, error=str(exc)) - else: - console.print(f"[red]Migration failed:[/red] {exc}") - raise typer.Exit(code=1) - finally: - if registry is not None: - registry.close() - - -@app.command() -def clean( - path: Optional[Path] = typer.Argument(None, help="Project path to clean (removes project index)."), - all_indexes: bool = typer.Option(False, "--all", "-a", help="Remove all indexes."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."), -) -> None: - """Remove CodexLens index data. - - Without arguments, shows current index size. - With path, removes that project's indexes. - With --all, removes all indexes (use with caution). 
- """ - _configure_logging(verbose, json_mode) - - try: - mapper = PathMapper() - index_root = mapper.index_root - - if all_indexes: - # Remove everything - if not index_root.exists(): - if json_mode: - print_json(success=True, result={"cleaned": None, "message": "No indexes to clean"}) - else: - console.print("[yellow]No indexes to clean.[/yellow]") - return - - # Calculate size before removal - total_size = 0 - for f in index_root.rglob("*"): - if f.is_file(): - total_size += f.stat().st_size - - # Remove registry first - registry_path = _get_registry_path() - if registry_path.exists(): - registry_path.unlink() - - # Remove all indexes - removal = _remove_tree_best_effort(index_root) - - result = { - "cleaned": str(index_root), - "size_freed_mb": round(total_size / (1024 * 1024), 2), - "partial": bool(removal["partial"]), - "locked_paths": removal["locked_paths"], - "remaining_path": removal["remaining_path"], - "errors": removal["errors"], - } - - if json_mode: - print_json(success=True, result=result) - else: - if result["partial"]: - console.print( - f"[yellow]Partially removed all indexes:[/yellow] {result['size_freed_mb']} MB freed" - ) - if result["locked_paths"]: - console.print( - f"[dim]Locked paths left behind: {len(result['locked_paths'])}[/dim]" - ) - else: - console.print(f"[green]Removed all indexes:[/green] {result['size_freed_mb']} MB freed") - - elif path: - # Remove specific project - project_path = path.expanduser().resolve() - project_index = mapper.source_to_index_dir(project_path) - - if not project_index.exists(): - if json_mode: - print_json(success=False, error=f"No index found for: {project_path}") - else: - console.print(f"[yellow]No index found for:[/yellow] {project_path}") - return - - # Calculate size - total_size = 0 - for f in project_index.rglob("*"): - if f.is_file(): - total_size += f.stat().st_size - - # Remove from registry - registry = RegistryStore() - registry.initialize() - registry.unregister_project(project_path) - 
registry.close() - - # Remove indexes - removal = _remove_tree_best_effort(project_index) - - result = { - "cleaned": str(project_path), - "index_path": str(project_index), - "size_freed_mb": round(total_size / (1024 * 1024), 2), - "partial": bool(removal["partial"]), - "locked_paths": removal["locked_paths"], - "remaining_path": removal["remaining_path"], - "errors": removal["errors"], - } - - if json_mode: - print_json(success=True, result=result) - else: - if result["partial"]: - console.print(f"[yellow]Partially removed indexes for:[/yellow] {project_path}") - if result["locked_paths"]: - console.print( - f"[dim]Locked paths left behind: {len(result['locked_paths'])}[/dim]" - ) - else: - console.print(f"[green]Removed indexes for:[/green] {project_path}") - console.print(f" Freed: {result['size_freed_mb']} MB") - - else: - # Show current status - if not index_root.exists(): - if json_mode: - print_json(success=True, result={"index_root": str(index_root), "exists": False}) - else: - console.print("[yellow]No indexes found.[/yellow]") - return - - total_size = 0 - for f in index_root.rglob("*"): - if f.is_file(): - total_size += f.stat().st_size - - registry = RegistryStore() - registry.initialize() - projects = registry.list_projects() - registry.close() - - result = { - "index_root": str(index_root), - "projects_count": len(projects), - "total_size_mb": round(total_size / (1024 * 1024), 2), - } - - if json_mode: - print_json(success=True, result=result) - else: - console.print("[bold]Index Status[/bold]") - console.print(f" Location: {result['index_root']}") - console.print(f" Projects: {result['projects_count']}") - console.print(f" Total Size: {result['total_size_mb']} MB") - console.print("\n[dim]Use 'clean ' to remove a specific project or 'clean --all' to remove everything.[/dim]") - - except StorageError as exc: - if json_mode: - print_json(success=False, error=f"Storage error: {exc}") - else: - console.print(f"[red]Clean failed (storage):[/red] {exc}") - 
raise typer.Exit(code=1) - except PermissionError as exc: - if json_mode: - print_json(success=False, error=f"Permission denied: {exc}") - else: - console.print(f"[red]Clean failed (permission denied):[/red] {exc}") - raise typer.Exit(code=1) - except CodexLensError as exc: - if json_mode: - print_json(success=False, error=str(exc)) - else: - console.print(f"[red]Clean failed:[/red] {exc}") - raise typer.Exit(code=1) - - -@app.command("semantic-list") -def semantic_list( - path: Path = typer.Option(Path("."), "--path", "-p", help="Project path to list metadata from."), - offset: int = typer.Option(0, "--offset", "-o", min=0, help="Number of records to skip."), - limit: int = typer.Option(50, "--limit", "-n", min=1, max=100, help="Maximum records to return."), - tool_filter: Optional[str] = typer.Option(None, "--tool", "-t", help="Filter by LLM tool (gemini/qwen)."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."), -) -> None: - """List semantic metadata entries for indexed files. - - Shows files that have LLM-generated summaries and keywords. - Results are aggregated from all index databases in the project. - """ - _configure_logging(verbose, json_mode) - base_path = path.expanduser().resolve() - - registry: Optional[RegistryStore] = None - try: - registry = RegistryStore() - registry.initialize() - mapper = PathMapper() - - project_info = registry.get_project(base_path) - if not project_info: - raise CodexLensError(f"No index found for: {base_path}. 
Run 'codex-lens init' first.") - - index_dir = Path(project_info.index_root) - if not index_dir.exists(): - raise CodexLensError(f"Index directory not found: {index_dir}") - - all_results: list = [] - total_count = 0 - - index_files = sorted(index_dir.rglob("_index.db")) - - for db_path in index_files: - try: - store = DirIndexStore(db_path) - store.initialize() - - results, count = store.list_semantic_metadata( - offset=0, - limit=1000, - llm_tool=tool_filter, - ) - - source_dir = mapper.index_to_source(db_path.parent) - for r in results: - r["source_dir"] = str(source_dir) - - all_results.extend(results) - total_count += count - - store.close() - except Exception as e: - if verbose: - console.print(f"[yellow]Warning: Error reading {db_path}: {e}[/yellow]") - - all_results.sort(key=lambda x: x["generated_at"], reverse=True) - paginated = all_results[offset : offset + limit] - - result = { - "path": str(base_path), - "total": total_count, - "offset": offset, - "limit": limit, - "count": len(paginated), - "entries": paginated, - } - - if json_mode: - print_json(success=True, result=result) - else: - if not paginated: - console.print("[yellow]No semantic metadata found.[/yellow]") - console.print("Run 'codex-lens enhance' to generate metadata for indexed files.") - else: - table = Table(title=f"Semantic Metadata ({total_count} total)") - table.add_column("File", style="cyan", max_width=40) - table.add_column("Language", style="dim") - table.add_column("Purpose", max_width=30) - table.add_column("Keywords", max_width=25) - table.add_column("Tool") - - for entry in paginated: - keywords_str = ", ".join(entry["keywords"][:3]) - if len(entry["keywords"]) > 3: - keywords_str += f" (+{len(entry['keywords']) - 3})" - - table.add_row( - entry["file_name"], - entry["language"] or "-", - (entry["purpose"] or "-")[:30], - keywords_str or "-", - entry["llm_tool"] or "-", - ) - - console.print(table) - - if total_count > len(paginated): - console.print( - f"[dim]Showing {offset + 
1}-{offset + len(paginated)} of {total_count}. " - "Use --offset and --limit for pagination.[/dim]" - ) - - except StorageError as exc: - if json_mode: - print_json(success=False, error=f"Storage error: {exc}") - else: - console.print(f"[red]Semantic-list failed (storage):[/red] {exc}") - raise typer.Exit(code=1) - except CodexLensError as exc: - if json_mode: - print_json(success=False, error=str(exc)) - else: - console.print(f"[red]Semantic-list failed:[/red] {exc}") - raise typer.Exit(code=1) - finally: - if registry is not None: - registry.close() - - -# ==================== Model Management Commands ==================== - -@app.command(name="model-list") -def model_list( - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), -) -> None: - """List available embedding models and their installation status. - - Shows 4 model profiles (fast, code, multilingual, balanced) with: - - Installation status - - Model size and dimensions - - Use case recommendations - """ - try: - from codexlens.cli.model_manager import list_models - - result = list_models() - - if json_mode: - print_json(**result) - else: - if not result["success"]: - console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}") - raise typer.Exit(code=1) - - data = result["result"] - models = data["models"] - cache_dir = data["cache_dir"] - cache_exists = data["cache_exists"] - - console.print("[bold]Available Embedding Models:[/bold]") - console.print(f"Cache directory: [dim]{cache_dir}[/dim] {'(exists)' if cache_exists else '(not found)'}\n") - - table = Table(show_header=True, header_style="bold") - table.add_column("Profile", style="cyan") - table.add_column("Model Name", style="blue") - table.add_column("Dims", justify="right") - table.add_column("Size (MB)", justify="right") - table.add_column("Status", justify="center") - table.add_column("Use Case", style="dim") - - for model in models: - status_icon = "[green]✓[/green]" if model["installed"] else 
"[dim]—[/dim]" - size_display = ( - f"{model['actual_size_mb']:.1f}" if model["installed"] - else f"~{model['estimated_size_mb']}" - ) - table.add_row( - model["profile"], - model["model_name"], - str(model["dimensions"]), - size_display, - status_icon, - model["use_case"][:40] + "..." if len(model["use_case"]) > 40 else model["use_case"], - ) - - console.print(table) - console.print("\n[dim]Use 'codexlens model-download ' to download a model[/dim]") - - except ImportError: - if json_mode: - print_json(success=False, error="fastembed not installed. Install with: pip install codexlens[semantic]") - else: - console.print("[red]Error:[/red] fastembed not installed") - console.print("[yellow]Install with:[/yellow] pip install codexlens[semantic]") - raise typer.Exit(code=1) - - -@app.command(name="model-download") -def model_download( - profile: str = typer.Argument(..., help="Model profile to download (fast, code, multilingual, balanced)."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), -) -> None: - """Download an embedding model by profile name. 
- - Example: - codexlens model-download code # Download code-optimized model - """ - try: - from codexlens.cli.model_manager import download_model - - if not json_mode: - console.print(f"[bold]Downloading model:[/bold] {profile}") - console.print("[dim]This may take a few minutes depending on your internet connection...[/dim]\n") - - # Create progress callback for non-JSON mode - progress_callback = None if json_mode else lambda msg: console.print(f"[cyan]{msg}[/cyan]") - - result = download_model(profile, progress_callback=progress_callback) - - if json_mode: - print_json(**result) - else: - if not result["success"]: - console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}") - raise typer.Exit(code=1) - - data = result["result"] - console.print(f"[green]✓[/green] Model downloaded successfully!") - console.print(f" Profile: {data['profile']}") - console.print(f" Model: {data['model_name']}") - console.print(f" Cache size: {data['cache_size_mb']:.1f} MB") - console.print(f" Location: [dim]{data['cache_path']}[/dim]") - - except ImportError: - if json_mode: - print_json(success=False, error="fastembed not installed. Install with: pip install codexlens[semantic]") - else: - console.print("[red]Error:[/red] fastembed not installed") - console.print("[yellow]Install with:[/yellow] pip install codexlens[semantic]") - raise typer.Exit(code=1) - - -@app.command(name="model-delete") -def model_delete( - profile: str = typer.Argument(..., help="Model profile to delete (fast, code, multilingual, balanced)."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), -) -> None: - """Delete a downloaded embedding model from cache. 
- - Example: - codexlens model-delete fast # Delete fast model - """ - from codexlens.cli.model_manager import delete_model - - if not json_mode: - console.print(f"[bold yellow]Deleting model:[/bold yellow] {profile}") - - result = delete_model(profile) - - if json_mode: - print_json(**result) - else: - if not result["success"]: - console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}") - raise typer.Exit(code=1) - - data = result["result"] - console.print(f"[green]✓[/green] Model deleted successfully!") - console.print(f" Profile: {data['profile']}") - console.print(f" Model: {data['model_name']}") - console.print(f" Freed space: {data['deleted_size_mb']:.1f} MB") - - -@app.command(name="model-download-custom") -def model_download_custom( - model_name: str = typer.Argument(..., help="Full HuggingFace model name (e.g., BAAI/bge-small-en-v1.5)."), - model_type: str = typer.Option("embedding", "--type", help="Model type: embedding or reranker."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), -) -> None: - """Download a custom HuggingFace model by name. - - This allows downloading any fastembed-compatible model from HuggingFace. 
- - Example: - codexlens model-download-custom BAAI/bge-small-en-v1.5 - codexlens model-download-custom BAAI/bge-reranker-base --type reranker - """ - try: - from codexlens.cli.model_manager import download_custom_model - - if not json_mode: - console.print(f"[bold]Downloading custom model:[/bold] {model_name}") - console.print(f"[dim]Model type: {model_type}[/dim]") - console.print("[dim]This may take a few minutes depending on your internet connection...[/dim]\n") - - progress_callback = None if json_mode else lambda msg: console.print(f"[cyan]{msg}[/cyan]") - - result = download_custom_model(model_name, model_type=model_type, progress_callback=progress_callback) - - if json_mode: - print_json(**result) - else: - if not result["success"]: - console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}") - raise typer.Exit(code=1) - - data = result["result"] - console.print(f"[green]✓[/green] Custom model downloaded successfully!") - console.print(f" Model: {data['model_name']}") - console.print(f" Type: {data['model_type']}") - console.print(f" Cache size: {data['cache_size_mb']:.1f} MB") - console.print(f" Location: [dim]{data['cache_path']}[/dim]") - - except ImportError: - if json_mode: - print_json(success=False, error="fastembed not installed. Install with: pip install codexlens[semantic]") - else: - console.print("[red]Error:[/red] fastembed not installed") - console.print("[yellow]Install with:[/yellow] pip install codexlens[semantic]") - raise typer.Exit(code=1) - - -@app.command(name="model-info") -def model_info( - profile: str = typer.Argument(..., help="Model profile to get info (fast, code, multilingual, balanced)."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), -) -> None: - """Get detailed information about a model profile. 
- - Example: - codexlens model-info code # Get code model details - """ - from codexlens.cli.model_manager import get_model_info - - result = get_model_info(profile) - - if json_mode: - print_json(**result) - else: - if not result["success"]: - console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}") - raise typer.Exit(code=1) - - data = result["result"] - console.print(f"[bold]Model Profile:[/bold] {data['profile']}") - console.print(f" Model name: {data['model_name']}") - console.print(f" Dimensions: {data['dimensions']}") - console.print(f" Status: {'[green]Installed[/green]' if data['installed'] else '[dim]Not installed[/dim]'}") - if data['installed'] and data['actual_size_mb']: - console.print(f" Cache size: {data['actual_size_mb']:.1f} MB") - console.print(f" Location: [dim]{data['cache_path']}[/dim]") - else: - console.print(f" Estimated size: ~{data['estimated_size_mb']} MB") - console.print(f"\n Description: {data['description']}") - console.print(f" Use case: {data['use_case']}") - - -# ==================== Reranker Model Management Commands ==================== - - -@app.command(name="reranker-model-list") -def reranker_model_list( - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), -) -> None: - """List available reranker models and their installation status. 
- - Shows reranker model profiles with: - - Installation status - - Model size - - Use case recommendations - """ - try: - from codexlens.cli.model_manager import list_reranker_models - - result = list_reranker_models() - - if json_mode: - print_json(**result) - else: - if not result["success"]: - console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}") - raise typer.Exit(code=1) - - data = result["result"] - models = data["models"] - cache_dir = data["cache_dir"] - cache_exists = data["cache_exists"] - - console.print("[bold]Available Reranker Models:[/bold]") - console.print(f"Cache directory: [dim]{cache_dir}[/dim] {'(exists)' if cache_exists else '(not found)'}\n") - - table = Table(show_header=True, header_style="bold") - table.add_column("Profile", style="cyan") - table.add_column("Model", style="dim") - table.add_column("Size", justify="right") - table.add_column("Status") - table.add_column("Description") - - for m in models: - status = "[green]✓ Installed[/green]" if m["installed"] else "[dim]Not installed[/dim]" - size = f"{m['actual_size_mb']:.1f} MB" if m["installed"] and m["actual_size_mb"] else f"~{m['estimated_size_mb']} MB" - rec = " [yellow]★[/yellow]" if m.get("recommended") else "" - table.add_row(m["profile"] + rec, m["model_name"], size, status, m["description"]) - - console.print(table) - console.print("\n[yellow]★[/yellow] = Recommended") - - except ImportError: - if json_mode: - print_json(success=False, error="fastembed reranker not available. 
Install with: pip install fastembed>=0.4.0") - else: - console.print("[red]Error:[/red] fastembed reranker not available") - console.print("Install with: [cyan]pip install fastembed>=0.4.0[/cyan]") - raise typer.Exit(code=1) - - -@app.command(name="reranker-model-download") -def reranker_model_download( - profile: str = typer.Argument(..., help="Reranker model profile to download."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), -) -> None: - """Download a reranker model by profile name. - - Example: - codexlens reranker-model-download ms-marco-mini # Download default reranker - """ - try: - from codexlens.cli.model_manager import download_reranker_model - - if not json_mode: - console.print(f"[bold]Downloading reranker model:[/bold] {profile}") - console.print("[dim]This may take a few minutes depending on your internet connection...[/dim]\n") - - progress_callback = None if json_mode else lambda msg: console.print(f"[cyan]{msg}[/cyan]") - - result = download_reranker_model(profile, progress_callback=progress_callback) - - if json_mode: - print_json(**result) - else: - if not result["success"]: - console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}") - raise typer.Exit(code=1) - - data = result["result"] - console.print(f"[green]✓[/green] Reranker model downloaded successfully!") - console.print(f" Profile: {data['profile']}") - console.print(f" Model: {data['model_name']}") - console.print(f" Cache size: {data['cache_size_mb']:.1f} MB") - console.print(f" Location: [dim]{data['cache_path']}[/dim]") - - except ImportError: - if json_mode: - print_json(success=False, error="fastembed reranker not available. 
Install with: pip install fastembed>=0.4.0") - else: - console.print("[red]Error:[/red] fastembed reranker not available") - console.print("Install with: [cyan]pip install fastembed>=0.4.0[/cyan]") - raise typer.Exit(code=1) - - -@app.command(name="reranker-model-delete") -def reranker_model_delete( - profile: str = typer.Argument(..., help="Reranker model profile to delete."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), -) -> None: - """Delete a downloaded reranker model from cache. - - Example: - codexlens reranker-model-delete ms-marco-mini # Delete reranker model - """ - from codexlens.cli.model_manager import delete_reranker_model - - if not json_mode: - console.print(f"[bold yellow]Deleting reranker model:[/bold yellow] {profile}") - - result = delete_reranker_model(profile) - - if json_mode: - print_json(**result) - else: - if not result["success"]: - console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}") - raise typer.Exit(code=1) - - data = result["result"] - console.print(f"[green]✓[/green] Reranker model deleted successfully!") - console.print(f" Profile: {data['profile']}") - console.print(f" Model: {data['model_name']}") - console.print(f" Freed space: {data['deleted_size_mb']:.1f} MB") - - -@app.command(name="reranker-model-info") -def reranker_model_info( - profile: str = typer.Argument(..., help="Reranker model profile to get info."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), -) -> None: - """Get detailed information about a reranker model profile. 
- - Example: - codexlens reranker-model-info ms-marco-mini # Get reranker model details - """ - from codexlens.cli.model_manager import get_reranker_model_info - - result = get_reranker_model_info(profile) - - if json_mode: - print_json(**result) - else: - if not result["success"]: - console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}") - raise typer.Exit(code=1) - - data = result["result"] - console.print(f"[bold]Reranker Model Profile:[/bold] {data['profile']}") - console.print(f" Model name: {data['model_name']}") - console.print(f" Status: {'[green]Installed[/green]' if data['installed'] else '[dim]Not installed[/dim]'}") - if data['installed'] and data['actual_size_mb']: - console.print(f" Cache size: {data['actual_size_mb']:.1f} MB") - console.print(f" Location: [dim]{data['cache_path']}[/dim]") - else: - console.print(f" Estimated size: ~{data['estimated_size_mb']} MB") - console.print(f" Recommended: {'[green]Yes[/green]' if data.get('recommended') else '[dim]No[/dim]'}") - console.print(f"\n Description: {data['description']}") - console.print(f" Use case: {data['use_case']}") - - -# ==================== Embedding Management Commands ==================== - -@app.command(name="embeddings-status", hidden=True, deprecated=True) -def embeddings_status( - path: Optional[Path] = typer.Argument( - None, - exists=True, - help="Path to specific _index.db file or directory containing indexes. If not specified, uses default index root.", - ), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), -) -> None: - """[Deprecated] Use 'codexlens index status' instead. - - Check embedding status for one or all indexes. 
- - Shows embedding statistics including: - - Number of chunks generated - - File coverage percentage - - Files missing embeddings - - Examples: - codexlens embeddings-status # Check all indexes - codexlens embeddings-status ~/.codexlens/indexes/project/_index.db # Check specific index - codexlens embeddings-status ~/projects/my-app # Check project (auto-finds index) - """ - _deprecated_command_warning("embeddings-status", "index status") - from codexlens.cli.embedding_manager import get_embedding_stats_summary, get_embeddings_status - - # Determine what to check - if path is None: - # Check all indexes in default root - index_root = _get_index_root() - result = get_embedding_stats_summary(index_root) - - if json_mode: - print_json(**result) - else: - if not result["success"]: - console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}") - raise typer.Exit(code=1) - - data = result["result"] - total = data["total_indexes"] - with_emb = data["indexes_with_embeddings"] - total_chunks = data["total_chunks"] - - console.print(f"[bold]Embedding Status Summary[/bold]") - console.print(f"Index root: [dim]{index_root}[/dim]\n") - console.print(f"Total indexes: {total}") - console.print(f"Indexes with embeddings: [{'green' if with_emb > 0 else 'yellow'}]{with_emb}[/]/{total}") - console.print(f"Total chunks: {total_chunks:,}\n") - - if data["indexes"]: - table = Table(show_header=True, header_style="bold") - table.add_column("Project", style="cyan") - table.add_column("Files", justify="right") - table.add_column("Chunks", justify="right") - table.add_column("Coverage", justify="right") - table.add_column("Status", justify="center") - - for idx_stat in data["indexes"]: - status_icon = "[green]✓[/green]" if idx_stat["has_embeddings"] else "[dim]—[/dim]" - coverage = f"{idx_stat['coverage_percent']:.1f}%" if idx_stat["has_embeddings"] else "—" - - table.add_row( - idx_stat["project"], - str(idx_stat["total_files"]), - f"{idx_stat['total_chunks']:,}" if 
idx_stat["has_embeddings"] else "0", - coverage, - status_icon, - ) - - console.print(table) - - else: - # Check specific index or find index for project - target_path = path.expanduser().resolve() - - if target_path.is_file() and target_path.name == "_index.db": - # Direct index file - index_path = target_path - elif target_path.is_dir(): - # Try to find index for this project - registry = RegistryStore() - try: - registry.initialize() - mapper = PathMapper() - index_path = mapper.source_to_index_db(target_path) - - if not index_path.exists(): - console.print(f"[red]Error:[/red] No index found for {target_path}") - console.print("Run 'codexlens init' first to create an index") - raise typer.Exit(code=1) - finally: - registry.close() - else: - console.print(f"[red]Error:[/red] Path must be _index.db file or directory") - raise typer.Exit(code=1) - - result = check_index_embeddings(index_path) - - if json_mode: - print_json(**result) - else: - if not result["success"]: - console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}") - raise typer.Exit(code=1) - - data = result["result"] - has_emb = data["has_embeddings"] - - console.print(f"[bold]Embedding Status[/bold]") - console.print(f"Index: [dim]{data['index_path']}[/dim]\n") - - if has_emb: - console.print(f"[green]✓[/green] Embeddings available") - console.print(f" Total chunks: {data['total_chunks']:,}") - console.print(f" Total files: {data['total_files']:,}") - console.print(f" Files with embeddings: {data['files_with_chunks']:,}/{data['total_files']}") - console.print(f" Coverage: {data['coverage_percent']:.1f}%") - - if data["files_without_chunks"] > 0: - console.print(f"\n[yellow]Warning:[/yellow] {data['files_without_chunks']} files missing embeddings") - if data["missing_files_sample"]: - console.print(" Sample missing files:") - for file in data["missing_files_sample"]: - console.print(f" [dim]{file}[/dim]") - else: - console.print(f"[yellow]—[/yellow] No embeddings found") - 
console.print(f" Total files indexed: {data['total_files']:,}") - console.print("\n[dim]Generate embeddings with:[/dim]") - console.print(f" [cyan]codexlens embeddings-generate {index_path}[/cyan]") - - -@index_app.command("embeddings") -def index_embeddings( - path: Path = typer.Argument( - ..., - exists=True, - help="Path to _index.db file or project directory.", - ), - backend: str = typer.Option( - "fastembed", - "--backend", - "-b", - help="Embedding backend: fastembed (local) or litellm (remote API).", - ), - model: str = typer.Option( - "code", - "--model", - "-m", - help="Model: profile name for fastembed (fast/code/multilingual/balanced) or model name for litellm (e.g. text-embedding-3-small).", - ), - force: bool = typer.Option( - False, - "--force", - "-f", - help="Force regeneration even if embeddings exist.", - ), - chunk_size: int = typer.Option( - 2000, - "--chunk-size", - help="Maximum chunk size in characters.", - ), - max_workers: int = typer.Option( - 1, - "--max-workers", - "-w", - min=1, - help="Max concurrent API calls. Recommended: 4-8 for litellm backend. Default: 1 (sequential).", - ), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output."), - centralized: bool = typer.Option( - False, - "--centralized", - "-c", - help="Use centralized vector storage (default).", - ), - distributed: bool = typer.Option( - False, - "--distributed", - "-d", - help="Use distributed per-directory indexes.", - ), -) -> None: - """Generate semantic embeddings for code search. - - Creates vector embeddings for all files in an index to enable - semantic search capabilities. Embeddings are stored in the same - database as the FTS index. 
- - Storage Modes: - - Default: Per-directory HNSW indexes alongside _index.db files - - Centralized: Single HNSW index at project root (_vectors.hnsw) - - Embedding Backend Options: - - fastembed: Local ONNX-based embeddings (default, no API calls) - - litellm: Remote API embeddings via ccw-litellm (requires API keys) - - Model Options: - For fastembed backend (profiles): - - fast: BAAI/bge-small-en-v1.5 (384 dims, ~80MB) - - code: jinaai/jina-embeddings-v2-base-code (768 dims, ~150MB) [recommended] - - multilingual: intfloat/multilingual-e5-large (1024 dims, ~1GB) - - balanced: mixedbread-ai/mxbai-embed-large-v1 (1024 dims, ~600MB) - - For litellm backend (model names): - - text-embedding-3-small, text-embedding-3-large (OpenAI) - - text-embedding-ada-002 (OpenAI legacy) - - Any model supported by ccw-litellm - - Examples: - codexlens index embeddings ~/projects/my-app # Auto-find index (fastembed, code profile) - codexlens index embeddings ~/.codexlens/indexes/project/_index.db # Specific index - codexlens index embeddings ~/projects/my-app --backend litellm --model text-embedding-3-small # Use LiteLLM - codexlens index embeddings ~/projects/my-app --model fast --force # Regenerate with fast profile - codexlens index embeddings ~/projects/my-app --centralized # Centralized vector storage - """ - _configure_logging(verbose, json_mode) - if centralized and distributed: - _fail_mutually_exclusive("--centralized", "--distributed", json_mode) - use_centralized = not distributed - - from codexlens.cli.embedding_manager import ( - generate_embeddings, - generate_dense_embeddings_centralized, - scan_for_model_conflicts, - check_global_model_lock, - set_locked_model_config, - ) - - # Validate backend - valid_backends = ["fastembed", "litellm"] - if backend not in valid_backends: - error_msg = f"Invalid backend: {backend}. 
Must be one of: {', '.join(valid_backends)}" - if json_mode: - print_json(success=False, error=error_msg) - else: - console.print(f"[red]Error:[/red] {error_msg}") - console.print(f"[dim]Valid backends: {', '.join(valid_backends)}[/dim]") - raise typer.Exit(code=1) - - # Resolve path - target_path = path.expanduser().resolve() - - # Determine index path or root for centralized mode - index_path = None - index_root = None - - if target_path.is_file() and target_path.name == "_index.db": - # Direct index file - index_path = target_path - index_root = target_path.parent - elif target_path.is_dir(): - # Directory: Find index location from registry - registry = RegistryStore() - try: - registry.initialize() - mapper = PathMapper() - index_path = mapper.source_to_index_db(target_path) - - if not index_path.exists(): - console.print(f"[red]Error:[/red] No index found for {target_path}") - console.print("Run 'codexlens init' first to create an index") - raise typer.Exit(code=1) - index_root = index_path.parent # Use index directory for both modes - finally: - registry.close() - else: - console.print(f"[red]Error:[/red] Path must be _index.db file or directory") - raise typer.Exit(code=1) - - # Progress callback - def progress_update(msg: str): - if not json_mode and verbose: - console.print(f" {msg}") - - console.print(f"[bold]Generating embeddings[/bold]") - if centralized: - effective_root = index_root if index_root else (index_path.parent if index_path else target_path) - console.print(f"Index root: [dim]{effective_root}[/dim]") - console.print(f"Mode: [green]Centralized[/green]") - else: - console.print(f"Index: [dim]{index_path}[/dim]") - console.print(f"Backend: [cyan]{backend}[/cyan]") - console.print(f"Model: [cyan]{model}[/cyan]") - if max_workers > 1: - console.print(f"Concurrency: [cyan]{max_workers} workers[/cyan]") - console.print() - - # Check global model lock (prevents mixing different models) - if not force: - lock_result = check_global_model_lock(backend, 
model) - if lock_result["has_conflict"]: - locked = lock_result["locked_config"] - if json_mode: - print_json( - success=False, - error="Global model lock conflict", - code="MODEL_LOCKED", - locked_config=locked, - target_config=lock_result["target_config"], - hint="Use --force to override the lock and switch to a different model (will regenerate all embeddings)", - ) - raise typer.Exit(code=1) - else: - console.print("[red]⛔ Global Model Lock Active[/red]") - console.print(f" Locked model: [cyan]{locked['backend']}/{locked['model']}[/cyan]") - console.print(f" Requested: [yellow]{backend}/{model}[/yellow]") - console.print(f" Locked at: {locked.get('locked_at', 'unknown')}") - console.print() - console.print("[dim]All indexes must use the same embedding model.[/dim]") - console.print("[dim]Use --force to switch models (will regenerate all embeddings).[/dim]") - raise typer.Exit(code=1) - - # Pre-check for model conflicts (only if not forcing) - if not force: - # Determine the index root for conflict scanning - scan_root = index_root if index_root else (index_path.parent if index_path else None) - - if scan_root: - conflict_result = scan_for_model_conflicts(scan_root, backend, model) - - if conflict_result["has_conflict"]: - existing = conflict_result["existing_config"] - conflict_count = len(conflict_result["conflicts"]) - - if json_mode: - # JSON mode: return structured error for UI handling - print_json( - success=False, - error="Model conflict detected", - code="MODEL_CONFLICT", - existing_config=existing, - target_config=conflict_result["target_config"], - conflict_count=conflict_count, - conflicts=conflict_result["conflicts"][:5], # Show first 5 conflicts - hint="Use --force to overwrite existing embeddings with the new model", - ) - raise typer.Exit(code=1) - else: - # Interactive mode: show warning and ask for confirmation - console.print("[yellow]⚠ Model Conflict Detected[/yellow]") - console.print(f" Existing: 
[red]{existing['backend']}/{existing['model']}[/red] ({existing.get('embedding_dim', '?')} dim)") - console.print(f" Requested: [green]{backend}/{model}[/green]") - console.print(f" Affected indexes: [yellow]{conflict_count}[/yellow]") - console.print() - console.print("[dim]Mixing different embedding models in the same index is not supported.[/dim]") - console.print("[dim]Overwriting will delete all existing embeddings and regenerate with the new model.[/dim]") - console.print() - - # Ask for confirmation - if typer.confirm("Overwrite existing embeddings with the new model?", default=False): - force = True - console.print("[green]Confirmed.[/green] Proceeding with overwrite...\n") - else: - console.print("[yellow]Cancelled.[/yellow] Use --force to skip this prompt.") - raise typer.Exit(code=0) - - if use_centralized: - # Centralized mode: single HNSW index at project root - if not index_root: - index_root = index_path.parent if index_path else target_path - result = generate_dense_embeddings_centralized( - index_root, - embedding_backend=backend, - model_profile=model, - force=force, - chunk_size=chunk_size, - progress_callback=progress_update, - max_workers=max_workers, - ) - else: - result = generate_embeddings( - index_path, - embedding_backend=backend, - model_profile=model, - force=force, - chunk_size=chunk_size, - progress_callback=progress_update, - max_workers=max_workers, - ) - - if json_mode: - print_json(**result) - else: - if not result["success"]: - error_msg = _extract_embedding_error(result) - console.print(f"[red]Error:[/red] {error_msg}") - - # Provide helpful hints - if "already has" in error_msg: - console.print("\n[dim]Use --force to regenerate existing embeddings[/dim]") - elif "fastembed not available" in error_msg or "Semantic search not available" in error_msg: - console.print("\n[dim]Install semantic dependencies:[/dim]") - console.print(" [cyan]pip install codexlens[semantic][/cyan]") - elif "ccw-litellm not available" in error_msg: - 
console.print("\n[dim]Install LiteLLM backend dependencies:[/dim]") - console.print(" [cyan]pip install ccw-litellm[/cyan]") - - raise typer.Exit(code=1) - - data = result["result"] - - # Set global model lock after successful generation - # This prevents using different models for future indexes - set_locked_model_config(backend, model) - - if centralized: - # Centralized mode output - elapsed = data.get("elapsed_time", 0) - console.print(f"[green]v[/green] Centralized embeddings generated successfully!") - console.print(f" Model: {data.get('model_name', model)}") - console.print(f" Chunks created: {data['chunks_created']:,}") - console.print(f" Files processed: {data['files_processed']}") - if data.get("files_failed", 0) > 0: - console.print(f" [yellow]Files failed: {data['files_failed']}[/yellow]") - console.print(f" Central index: {data.get('central_index_path', 'N/A')}") - console.print(f" Time: {elapsed:.1f}s") - else: - # Single index mode output - elapsed = data["elapsed_time"] - - console.print(f"[green]v[/green] Embeddings generated successfully!") - console.print(f" Model: {data['model_name']}") - console.print(f" Chunks created: {data['chunks_created']:,}") - console.print(f" Files processed: {data['files_processed']}") - - if data["files_failed"] > 0: - console.print(f" [yellow]Files failed: {data['files_failed']}[/yellow]") - if data["failed_files"]: - console.print(" [dim]First failures:[/dim]") - for file_path, error in data["failed_files"]: - console.print(f" [dim]{file_path}: {error}[/dim]") - - console.print(f" Time: {elapsed:.1f}s") - - console.print("\n[dim]Use vector search with:[/dim]") - console.print(" [cyan]codexlens search 'your query' --mode pure-vector[/cyan]") - - -# ==================== GPU Management Commands ==================== - -@app.command(name="gpu-list") -def gpu_list( - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), -) -> None: - """List available GPU devices for embedding acceleration. 
- - Shows all detected GPU devices with their capabilities and selection status. - Discrete GPUs (NVIDIA, AMD) are automatically preferred over integrated GPUs. - - Examples: - codexlens gpu-list # List all GPUs - codexlens gpu-list --json # JSON output for scripting - """ - from codexlens.semantic.gpu_support import get_gpu_devices, detect_gpu, get_selected_device_id - - gpu_info = detect_gpu() - devices = get_gpu_devices() - selected_id = get_selected_device_id() - - if json_mode: - print_json( - success=True, - result={ - "devices": devices, - "selected_device_id": selected_id, - "gpu_available": gpu_info.gpu_available, - "providers": gpu_info.onnx_providers, - } - ) - else: - if not devices: - console.print("[yellow]No GPU devices detected[/yellow]") - console.print(f"ONNX Providers: [dim]{', '.join(gpu_info.onnx_providers)}[/dim]") - return - - console.print("[bold]Available GPU Devices[/bold]\n") - - table = Table(show_header=True, header_style="bold") - table.add_column("ID", justify="center") - table.add_column("Name") - table.add_column("Vendor", justify="center") - table.add_column("Type", justify="center") - table.add_column("Status", justify="center") - - for dev in devices: - type_str = "[green]Discrete[/green]" if dev["is_discrete"] else "[dim]Integrated[/dim]" - vendor_color = { - "nvidia": "green", - "amd": "red", - "intel": "blue" - }.get(dev["vendor"], "white") - vendor_str = f"[{vendor_color}]{dev['vendor'].upper()}[/{vendor_color}]" - - status_parts = [] - if dev["is_preferred"]: - status_parts.append("[cyan]Auto[/cyan]") - if dev["is_selected"]: - status_parts.append("[green]✓ Selected[/green]") - - status_str = " ".join(status_parts) if status_parts else "[dim]—[/dim]" - - table.add_row( - str(dev["device_id"]), - dev["name"], - vendor_str, - type_str, - status_str, - ) - - console.print(table) - console.print(f"\nONNX Providers: [dim]{', '.join(gpu_info.onnx_providers)}[/dim]") - console.print("\n[dim]Select GPU with:[/dim]") - 
console.print(" [cyan]codexlens gpu-select [/cyan]") - - -@app.command(name="gpu-select") -def gpu_select( - device_id: int = typer.Argument( - ..., - help="GPU device ID to use for embeddings. Use 'codexlens gpu-list' to see available IDs.", - ), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), -) -> None: - """Select a specific GPU device for embedding generation. - - By default, CodexLens automatically selects the most powerful GPU (discrete over integrated). - Use this command to override the selection. - - Examples: - codexlens gpu-select 1 # Use GPU device 1 - codexlens gpu-select 0 --json # Select GPU 0 with JSON output - """ - from codexlens.semantic.gpu_support import set_selected_device_id, get_gpu_devices - from codexlens.semantic.embedder import clear_embedder_cache - - devices = get_gpu_devices() - valid_ids = [dev["device_id"] for dev in devices] - - if device_id not in valid_ids: - if json_mode: - print_json(success=False, error=f"Invalid device_id {device_id}. 
Valid IDs: {valid_ids}") - else: - console.print(f"[red]Error:[/red] Invalid device_id {device_id}") - console.print(f"Valid IDs: {valid_ids}") - console.print("\n[dim]Use 'codexlens gpu-list' to see available devices[/dim]") - raise typer.Exit(code=1) - - success = set_selected_device_id(device_id) - - if success: - # Clear embedder cache to force reload with new GPU - clear_embedder_cache() - - device_name = next((dev["name"] for dev in devices if dev["device_id"] == device_id), "Unknown") - - if json_mode: - print_json( - success=True, - result={ - "device_id": device_id, - "device_name": device_name, - "message": f"GPU selection set to device {device_id}: {device_name}", - } - ) - else: - console.print(f"[green]✓[/green] GPU selection updated") - console.print(f" Device ID: {device_id}") - console.print(f" Device: [cyan]{device_name}[/cyan]") - console.print("\n[dim]New embeddings will use this GPU[/dim]") - else: - if json_mode: - print_json(success=False, error="Failed to set GPU selection") - else: - console.print("[red]Error:[/red] Failed to set GPU selection") - raise typer.Exit(code=1) - - -@app.command(name="gpu-reset") -def gpu_reset( - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), -) -> None: - """Reset GPU selection to automatic detection. - - Clears any manual GPU selection and returns to automatic selection - (discrete GPU preferred over integrated). 
- - Examples: - codexlens gpu-reset # Reset to auto-detection - """ - from codexlens.semantic.gpu_support import set_selected_device_id, detect_gpu - from codexlens.semantic.embedder import clear_embedder_cache - - set_selected_device_id(None) - clear_embedder_cache() - - gpu_info = detect_gpu(force_refresh=True) - - if json_mode: - print_json( - success=True, - result={ - "message": "GPU selection reset to auto-detection", - "preferred_device_id": gpu_info.preferred_device_id, - "preferred_device_name": gpu_info.gpu_name, - } - ) - else: - console.print("[green]✓[/green] GPU selection reset to auto-detection") - if gpu_info.preferred_device_id is not None: - console.print(f" Auto-selected device: {gpu_info.preferred_device_id}") - console.print(f" Device: [cyan]{gpu_info.gpu_name}[/cyan]") - - - - - - - -# ==================== Watch Command ==================== - -@app.command() -def watch( - path: Path = typer.Argument(Path("."), exists=True, file_okay=False, dir_okay=True, help="Project root to watch."), - language: Optional[List[str]] = typer.Option(None, "--language", "-l", help="Languages to watch (comma-separated)."), - debounce: int = typer.Option(1000, "--debounce", "-d", min=100, max=10000, help="Debounce interval in milliseconds."), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."), -) -> None: - """Watch a directory for file changes and incrementally update the index. - - Monitors the specified directory for file system changes (create, modify, delete) - and automatically updates the CodexLens index. The directory must already be indexed - using 'codexlens init' before watching. - - Examples: - # Watch current directory - codexlens watch . - - # Watch with custom debounce interval - codexlens watch . --debounce 2000 - - # Watch only Python and JavaScript files - codexlens watch . --language python,javascript - - Press Ctrl+C to stop watching. 
- """ - _configure_logging(verbose) - watch_path = path.expanduser().resolve() - - registry: RegistryStore | None = None - try: - # Validate that path is indexed - registry = RegistryStore() - registry.initialize() - mapper = PathMapper() - - project_record = registry.find_by_source_path(str(watch_path)) - if not project_record: - console.print(f"[red]Error:[/red] Directory is not indexed: {watch_path}") - console.print("[dim]Run 'codexlens init' first to create an index.[/dim]") - raise typer.Exit(code=1) - - # Parse languages - languages = _parse_languages(language) - - # Create watcher config - watcher_config = WatcherConfig( - debounce_ms=debounce, - languages=languages, - ) - - # Display startup message - console.print(f"[green]Starting watcher for:[/green] {watch_path}") - console.print(f"[dim]Debounce interval: {debounce}ms[/dim]") - if languages: - console.print(f"[dim]Watching languages: {', '.join(languages)}[/dim]") - console.print("[dim]Press Ctrl+C to stop[/dim]\n") - - # Create and start watcher manager - watch_config = Config.load() - manager = WatcherManager( - root_path=watch_path, - config=watch_config, - watcher_config=watcher_config, - on_indexed=lambda result: _display_index_result(result), - ) - - manager.start() - manager.wait() - - except KeyboardInterrupt: - console.print("\n[yellow]Stopping watcher...[/yellow]") - except CodexLensError as exc: - console.print(f"[red]Watch failed:[/red] {exc}") - raise typer.Exit(code=1) - except Exception as exc: - console.print(f"[red]Unexpected error:[/red] {exc}") - raise typer.Exit(code=1) - finally: - if registry is not None: - registry.close() - - -def _display_index_result(result) -> None: - """Display indexing result in real-time.""" - if result.files_indexed > 0 or result.files_removed > 0: - parts = [] - if result.files_indexed > 0: - parts.append(f"[green]✓ Indexed {result.files_indexed} file(s)[/green]") - if result.files_removed > 0: - parts.append(f"[yellow]✗ Removed {result.files_removed} 
file(s)[/yellow]") - console.print(" | ".join(parts)) - - if result.errors: - for error in result.errors[:3]: # Show max 3 errors - console.print(f" [red]Error:[/red] {error}") - if len(result.errors) > 3: - console.print(f" [dim]... and {len(result.errors) - 3} more errors[/dim]") - - - -# ==================== Cascade Index Commands ==================== - - -def get_binary_index_path(db_path: Path) -> Path: - """Get the path for binary ANN index file. - - Args: - db_path: Path to the _index.db file - - Returns: - Path to the binary index file (_index_binary.bin) - """ - return db_path.parent / f"{db_path.stem}_binary.bin" - - -@index_app.command("binary") -def index_binary( - path: Annotated[Path, typer.Argument(help="Directory to index")], - force: Annotated[bool, typer.Option("--force", "-f", help="Force regenerate")] = False, - batch_size: Annotated[int, typer.Option("--batch-size", "-b", help="Batch size for embedding")] = 32, - json_mode: Annotated[bool, typer.Option("--json", help="Output JSON response")] = False, - verbose: Annotated[bool, typer.Option("--verbose", "-v", help="Enable verbose logging")] = False, -) -> None: - """Generate cascade embeddings (binary + dense) for two-stage retrieval. - - Cascade retrieval uses a two-stage approach: - 1. Binary search (fast, 32 bytes/vector) -> coarse filtering - 2. Dense rerank (precise, 8KB/vector) -> final results - - This command: - - Finds all _index.db files in the directory - - Generates binary (256-dim) and dense (2048-dim) embeddings for each chunk - - Stores embeddings in the database (embedding_binary, embedding_dense columns) - - Creates a BinaryANNIndex file for fast coarse retrieval - - Examples: - codexlens index binary ~/projects/my-app - codexlens index binary . --force - codexlens index binary . 
--batch-size 64 --verbose - """ - _configure_logging(verbose, json_mode) - - target_path = path.expanduser().resolve() - - # Find index database(s) - if target_path.is_file() and target_path.name == "_index.db": - index_dbs = [target_path] - elif target_path.is_dir(): - # Check local .codexlens/_index.db first - local_index = target_path / ".codexlens" / "_index.db" - if local_index.exists(): - index_dbs = [local_index] - else: - # Find via registry - registry = RegistryStore() - try: - registry.initialize() - mapper = PathMapper() - index_db = mapper.source_to_index_db(target_path) - if not index_db.exists(): - if json_mode: - print_json(success=False, error=f"No index found for {target_path}") - else: - console.print(f"[red]Error:[/red] No index found for {target_path}") - console.print("Run 'codexlens init' first to create an index") - raise typer.Exit(code=1) - # Find all _index.db files under the index root - index_root = index_db.parent - index_dbs = list(index_root.rglob("_index.db")) - finally: - registry.close() - else: - if json_mode: - print_json(success=False, error="Path must be _index.db file or indexed directory") - else: - console.print("[red]Error:[/red] Path must be _index.db file or indexed directory") - raise typer.Exit(code=1) - - if not index_dbs: - if json_mode: - print_json(success=False, error="No index databases found") - else: - console.print("[yellow]No index databases found[/yellow]") - raise typer.Exit(code=1) - - # Import cascade embedding backend - try: - from codexlens.indexing.embedding import CascadeEmbeddingBackend - from codexlens.semantic.ann_index import BinaryANNIndex - from codexlens.indexing.embedding import pack_binary_embedding - except ImportError as e: - error_msg = f"Cascade embedding dependencies not available: {e}" - if json_mode: - print_json(success=False, error=error_msg) - else: - console.print(f"[red]Error:[/red] {error_msg}") - console.print("[dim]Install with: pip install codexlens[semantic][/dim]") - raise 
typer.Exit(code=1) - - if not json_mode: - console.print(f"[bold]Generating cascade embeddings[/bold]") - console.print(f"Path: [dim]{target_path}[/dim]") - console.print(f"Index databases: [cyan]{len(index_dbs)}[/cyan]") - console.print(f"Batch size: [cyan]{batch_size}[/cyan]") - console.print() - - # Initialize cascade embedding backend - try: - cascade_backend = CascadeEmbeddingBackend() - except Exception as e: - error_msg = f"Failed to initialize cascade embedding backend: {e}" - if json_mode: - print_json(success=False, error=error_msg) - else: - console.print(f"[red]Error:[/red] {error_msg}") - raise typer.Exit(code=1) - - # Process statistics - total_chunks_processed = 0 - total_indexes_processed = 0 - total_indexes_successful = 0 - total_binary_indexes_created = 0 - errors_list: List[str] = [] - - # Process each index database - with Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - BarColumn(), - TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), - TextColumn("({task.completed}/{task.total})"), - TimeElapsedColumn(), - console=console, - disable=json_mode, - ) as progress: - db_task = progress.add_task("Processing indexes...", total=len(index_dbs)) - - for db_path in index_dbs: - total_indexes_processed += 1 - index_name = db_path.parent.name - - try: - # Open the index store - store = DirIndexStore(db_path) - store.initialize() - - # Get connection for direct queries - conn = store._get_connection() - - # Ensure cascade columns exist in semantic_chunks table - try: - conn.execute("ALTER TABLE semantic_chunks ADD COLUMN embedding_binary BLOB") - except Exception: - pass # Column already exists - try: - conn.execute("ALTER TABLE semantic_chunks ADD COLUMN embedding_dense BLOB") - except Exception: - pass # Column already exists - conn.commit() - - # Check if semantic_chunks table exists and has data - try: - cursor = conn.execute("SELECT COUNT(*) FROM semantic_chunks") - chunk_count = cursor.fetchone()[0] - 
except Exception: - # semantic_chunks table doesn't exist or is empty - chunk_count = 0 - - if chunk_count == 0: - if verbose and not json_mode: - console.print(f" [dim]Skipping {index_name}: no chunks found[/dim]") - progress.advance(db_task) - store.close() - continue - - # Check if embeddings already exist (unless force) - if not force: - cursor = conn.execute( - "SELECT COUNT(*) FROM semantic_chunks WHERE embedding_binary IS NOT NULL" - ) - existing_count = cursor.fetchone()[0] - if existing_count > 0: - if verbose and not json_mode: - console.print(f" [dim]Skipping {index_name}: embeddings exist (use --force to regenerate)[/dim]") - progress.advance(db_task) - store.close() - continue - - # If force, clear existing cascade embeddings - if force: - conn.execute( - "UPDATE semantic_chunks SET embedding_binary = NULL, embedding_dense = NULL" - ) - conn.commit() - - # Get all chunks - cursor = conn.execute("SELECT id, content FROM semantic_chunks") - chunks = cursor.fetchall() - - if not chunks: - progress.advance(db_task) - store.close() - continue - - if verbose and not json_mode: - console.print(f" Processing {index_name}: {len(chunks)} chunks") - - # Process in batches - chunk_task = progress.add_task( - f" {index_name}", total=len(chunks) - ) - - # Prepare for BinaryANNIndex - binary_index_path = get_binary_index_path(db_path) - binary_ann_index = BinaryANNIndex(db_path, dim=256) - - for i in range(0, len(chunks), batch_size): - batch_chunks = chunks[i:i + batch_size] - batch_ids = [c[0] for c in batch_chunks] - batch_contents = [c[1] for c in batch_chunks] - - # Generate cascade embeddings - binary_embeddings, dense_embeddings = cascade_backend.encode_cascade( - batch_contents, batch_size=batch_size - ) - - # Pack binary embeddings and convert dense to bytes - packed_binaries = [] - dense_bytes_list = [] - - for j in range(len(batch_ids)): - # Pack binary embedding (256 bits -> 32 bytes) - packed_binary = pack_binary_embedding(binary_embeddings[j]) - 
packed_binaries.append(packed_binary) - - # Convert dense embedding to bytes - import numpy as np - dense_blob = dense_embeddings[j].astype(np.float32).tobytes() - dense_bytes_list.append(dense_blob) - - # Update database - for j, chunk_id in enumerate(batch_ids): - conn.execute( - """ - UPDATE semantic_chunks - SET embedding_binary = ?, embedding_dense = ? - WHERE id = ? - """, - (packed_binaries[j], dense_bytes_list[j], chunk_id) - ) - - # Add to binary ANN index - binary_ann_index.add_vectors(batch_ids, packed_binaries) - - conn.commit() - total_chunks_processed += len(batch_ids) - progress.advance(chunk_task, len(batch_ids)) - - # Save binary ANN index - binary_ann_index.save() - total_binary_indexes_created += 1 - - progress.remove_task(chunk_task) - store.close() - total_indexes_successful += 1 - - except Exception as e: - error_msg = f"{index_name}: {e}" - errors_list.append(error_msg) - if verbose and not json_mode: - console.print(f" [red]Error processing {index_name}:[/red] {e}") - - progress.advance(db_task) - - # Build result - result = { - "path": str(target_path), - "indexes_processed": total_indexes_processed, - "indexes_successful": total_indexes_successful, - "chunks_processed": total_chunks_processed, - "binary_indexes_created": total_binary_indexes_created, - "errors": len(errors_list), - "error_details": errors_list[:5] if errors_list else [], - } - - if json_mode: - print_json(success=True, result=result) - else: - console.print(f"\n[green]Cascade indexing complete[/green]") - console.print(f" Indexes processed: {total_indexes_processed}") - console.print(f" Indexes successful: {total_indexes_successful}") - console.print(f" Chunks processed: {total_chunks_processed:,}") - console.print(f" Binary indexes created: {total_binary_indexes_created}") - if errors_list: - console.print(f" [yellow]Errors: {len(errors_list)}[/yellow]") - for err in errors_list[:3]: - console.print(f" [dim]{err}[/dim]") - if len(errors_list) > 3: - console.print(f" 
[dim]... and {len(errors_list) - 3} more[/dim]") - - -@index_app.command("binary-mmap") -def index_binary_mmap( - path: Annotated[Path, typer.Argument(help="Project directory (indexed) or _index.db file")], - force: Annotated[bool, typer.Option("--force", "-f", help="Force rebuild binary mmap + metadata")] = False, - embedding_dim: Annotated[Optional[int], typer.Option("--embedding-dim", help="Only use embeddings with this dimension (e.g. 768)")] = None, - json_mode: Annotated[bool, typer.Option("--json", help="Output JSON response")] = False, - verbose: Annotated[bool, typer.Option("--verbose", "-v", help="Enable verbose logging")] = False, -) -> None: - """Build centralized `_binary_vectors.mmap` from existing embeddings (no model calls). - - This command enables the staged binary coarse search without regenerating - embeddings and without triggering global model locks. It: - - scans distributed semantic_chunks.embedding blobs under the index root - - assigns global chunk_ids - - writes `/_binary_vectors.mmap` (+ `.meta.json`) - - writes `/_vectors_meta.db` (chunk_metadata + binary_vectors) - """ - _configure_logging(verbose, json_mode) - - from codexlens.cli.embedding_manager import build_centralized_binary_vectors_from_existing - - target_path = path.expanduser().resolve() - - # Resolve index_root similar to other index commands. 
- if target_path.is_file() and target_path.name == "_index.db": - index_root = target_path.parent - else: - registry = RegistryStore() - try: - registry.initialize() - mapper = PathMapper() - index_db = mapper.source_to_index_db(target_path) - if not index_db.exists(): - msg = f"No index found for {target_path}" - if json_mode: - print_json(success=False, error=msg) - else: - console.print(f"[red]Error:[/red] {msg}") - console.print("Run `codexlens index init` first to create an index.") - raise typer.Exit(code=1) - index_root = index_db.parent - finally: - registry.close() - - def progress_update(message: str) -> None: - if json_mode: - return - console.print(f"[dim]{message}[/dim]") - - result = build_centralized_binary_vectors_from_existing( - index_root, - force=force, - embedding_dim=embedding_dim, - progress_callback=progress_update, - ) - - if json_mode: - print_json(**result) - return - - if not result.get("success"): - console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}") - hint = result.get("hint") - if hint: - console.print(f"[dim]{hint}[/dim]") - raise typer.Exit(code=1) - - data = result.get("result", {}) - console.print("\n[green]Binary mmap build complete[/green]") - console.print(f" Index root: {data.get('index_root')}") - console.print(f" Chunks written: {data.get('chunks_written'):,}") - console.print(f" Binary mmap: {data.get('binary_mmap')}") - console.print(f" Meta DB: {data.get('vectors_meta_db')}") - - -# ==================== Index Status Command ==================== - -@index_app.command("status") -def index_status( - path: Optional[Path] = typer.Argument( - None, - help="Path to project directory or _index.db file. If not specified, uses default index root.", - ), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output."), -) -> None: - """Show comprehensive index status (embeddings). 
- - Shows combined status for all index types: - - Dense vector embeddings (HNSW) - - Binary cascade embeddings - - Examples: - codexlens index status # Check all indexes - codexlens index status ~/projects/my-app # Check specific project - codexlens index status --json # JSON output - """ - _configure_logging(verbose, json_mode) - - from codexlens.cli.embedding_manager import get_embedding_stats_summary, get_embeddings_status - - # Determine target path and index root - if path is None: - index_root = _get_index_root() - target_path = None - else: - target_path = path.resolve() - if target_path.is_file() and target_path.name == "_index.db": - index_root = target_path.parent - elif target_path.is_dir(): - # Try to find index for this project - registry = RegistryStore() - try: - registry.initialize() - mapper = PathMapper() - index_path = mapper.source_to_index_db(target_path) - if index_path.exists(): - index_root = index_path.parent - else: - if json_mode: - print_json(success=False, error=f"No index found for {target_path}") - else: - console.print(f"[red]Error:[/red] No index found for {target_path}") - console.print("Run 'codexlens index init' first to create an index") - raise typer.Exit(code=1) - finally: - registry.close() - else: - if json_mode: - print_json(success=False, error="Path must be _index.db file or directory") - else: - console.print(f"[red]Error:[/red] Path must be _index.db file or directory") - raise typer.Exit(code=1) - - # Get embeddings status - embeddings_result = get_embeddings_status(index_root) - embeddings_summary_result = get_embedding_stats_summary(index_root) - - # Build combined result - result = { - "index_root": str(index_root), - # Keep "embeddings" backward-compatible as the subtree summary payload. 
- "embeddings": embeddings_summary_result.get("result") if embeddings_summary_result.get("success") else None, - "embeddings_error": embeddings_summary_result.get("error") if not embeddings_summary_result.get("success") else None, - "embeddings_status": embeddings_result.get("result") if embeddings_result.get("success") else None, - "embeddings_status_error": embeddings_result.get("error") if not embeddings_result.get("success") else None, - "embeddings_summary": embeddings_summary_result.get("result") if embeddings_summary_result.get("success") else None, - "embeddings_summary_error": embeddings_summary_result.get("error") if not embeddings_summary_result.get("success") else None, - } - - if json_mode: - print_json(success=True, result=result) - else: - console.print(f"[bold]Index Status[/bold]") - console.print(f"Index root: [dim]{index_root}[/dim]\n") - - # Embeddings section - console.print("[bold]Dense Embeddings (HNSW):[/bold]") - if embeddings_result.get("success"): - data = embeddings_result["result"] - root = data.get("root") or data - subtree = data.get("subtree") or {} - centralized = data.get("centralized") or {} - - console.print(f" Root files: {root.get('total_files', 0)}") - console.print( - f" Root files with embeddings: " - f"[{'green' if root.get('has_embeddings') else 'yellow'}]{root.get('files_with_embeddings', 0)}[/]" - f"/{root.get('total_files', 0)}" - ) - console.print(f" Root coverage: {root.get('coverage_percent', 0):.1f}%") - console.print(f" Root chunks: {root.get('total_chunks', 0):,}") - console.print(f" Root storage mode: {root.get('storage_mode', 'none')}") - console.print( - f" Centralized dense: " - f"{'ready' if centralized.get('dense_ready') else ('present' if centralized.get('dense_index_exists') else 'missing')}" - ) - console.print( - f" Centralized binary: " - f"{'ready' if centralized.get('binary_ready') else ('present' if centralized.get('binary_index_exists') else 'missing')}" - ) - - subtree_total = 
subtree.get("total_indexes", 0) - subtree_with_embeddings = subtree.get("indexes_with_embeddings", 0) - subtree_chunks = subtree.get("total_chunks", 0) - if subtree_total: - console.print("\n[bold]Subtree Summary:[/bold]") - console.print(f" Total indexes: {subtree_total}") - console.print( - f" Indexes with embeddings: " - f"[{'green' if subtree_with_embeddings > 0 else 'yellow'}]{subtree_with_embeddings}[/]/{subtree_total}" - ) - console.print(f" Total chunks: {subtree_chunks:,}") - else: - console.print(f" [yellow]--[/yellow] {embeddings_result.get('error', 'Not available')}") - - -# ==================== Index Update Command ==================== - -@index_app.command("update") -def index_update( - file_path: Path = typer.Argument(..., exists=True, file_okay=True, dir_okay=False, help="Path to the file to update in the index."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."), -) -> None: - """Update the index for a single file incrementally. - - This is a lightweight command designed for use in hooks (e.g., Claude Code PostToolUse). - It updates only the specified file without scanning the entire directory. - - The file's parent directory must already be indexed via 'codexlens index init'. 
- - Examples: - codexlens index update src/main.py # Update single file - codexlens index update ./foo.ts --json # JSON output for hooks - """ - _configure_logging(verbose, json_mode) - - from codexlens.watcher.incremental_indexer import IncrementalIndexer - - registry: RegistryStore | None = None - indexer: IncrementalIndexer | None = None - - try: - registry = RegistryStore() - registry.initialize() - mapper = PathMapper() - config = Config.load() - - resolved_path = file_path.resolve() - - # Check if project is indexed - source_root = mapper.get_project_root(resolved_path) - if not source_root or not registry.get_project(source_root): - error_msg = f"Project containing file is not indexed: {file_path}" - if json_mode: - print_json(success=False, error=error_msg) - else: - console.print(f"[red]Error:[/red] {error_msg}") - console.print("[dim]Run 'codexlens index init' on the project root first.[/dim]") - raise typer.Exit(code=1) - - indexer = IncrementalIndexer(registry, mapper, config) - result = indexer._index_file(resolved_path) - - if result.success: - if json_mode: - print_json(success=True, result={ - "path": str(result.path), - "symbols_count": result.symbols_count, - "status": "updated", - }) - else: - console.print(f"[green]✓[/green] Updated index for [bold]{result.path.name}[/bold] ({result.symbols_count} symbols)") - else: - error_msg = result.error or f"Failed to update index for {file_path}" - if json_mode: - print_json(success=False, error=error_msg) - else: - console.print(f"[red]Error:[/red] {error_msg}") - raise typer.Exit(code=1) - - except CodexLensError as exc: - if json_mode: - print_json(success=False, error=str(exc)) - else: - console.print(f"[red]Update failed:[/red] {exc}") - raise typer.Exit(code=1) - finally: - if indexer: - indexer.close() - if registry: - registry.close() - - -# ==================== Index All Command ==================== - -@index_app.command("all") -def index_all( - path: Path = typer.Argument(Path("."), exists=True, 
file_okay=False, dir_okay=True, help="Project root to index."), - language: Optional[List[str]] = typer.Option( - None, - "--language", - "-l", - help="Limit indexing to specific languages (repeat or comma-separated).", - ), - workers: Optional[int] = typer.Option(None, "--workers", "-w", min=1, help="Parallel worker processes."), - force: bool = typer.Option(False, "--force", "-f", help="Force full reindex."), - backend: str = typer.Option("fastembed", "--backend", "-b", help="Embedding backend: fastembed or litellm."), - model: str = typer.Option("code", "--model", "-m", help="Embedding model profile or name."), - max_workers: int = typer.Option(1, "--max-workers", min=1, help="Max concurrent API calls."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."), -) -> None: - """Run all indexing operations in sequence (init, embeddings). - - This is a convenience command that runs the complete indexing pipeline: - 1. FTS index initialization (index init) - 2. Dense vector embeddings (index embeddings) - - Examples: - codexlens index all ~/projects/my-app - codexlens index all . --force - codexlens index all . 
--backend litellm --model text-embedding-3-small - """ - _configure_logging(verbose, json_mode) - - base_path = path.expanduser().resolve() - results = { - "path": str(base_path), - "steps": {}, - } - - # Step 1: Run init - if not json_mode: - console.print(f"[bold]Step 1/2: Initializing FTS index...[/bold]") - - try: - # Import and call the init function directly - from codexlens.config import Config - from codexlens.storage.index_tree import IndexTreeBuilder - - config = Config.load() - languages = _parse_languages(language) - registry = RegistryStore() - registry.initialize() - mapper = PathMapper() - - builder = IndexTreeBuilder(registry, mapper, config, incremental=not force) - build_result = builder.build( - source_root=base_path, - languages=languages, - workers=workers, - force_full=force, - ) - - results["steps"]["init"] = { - "success": True, - "files_indexed": build_result.total_files, - "dirs_indexed": build_result.total_dirs, - "index_root": str(build_result.index_root), - } - - if not json_mode: - console.print(f" [green]OK[/green] Indexed {build_result.total_files} files in {build_result.total_dirs} directories") - - index_root = Path(build_result.index_root) - registry.close() - - except Exception as e: - results["steps"]["init"] = {"success": False, "error": str(e)} - if json_mode: - print_json(success=False, result=results, error=f"Init failed: {e}") - else: - console.print(f" [red]Error:[/red] {e}") - raise typer.Exit(code=1) - - # Step 2: Generate embeddings - if not json_mode: - console.print(f"\n[bold]Step 2/2: Generating dense embeddings...[/bold]") - - try: - from codexlens.cli.embedding_manager import generate_dense_embeddings_centralized - - def progress_update(msg: str): - if not json_mode and verbose: - console.print(f" {msg}") - - embed_result = generate_dense_embeddings_centralized( - index_root, - embedding_backend=backend, - model_profile=model, - force=force, - chunk_size=2000, - progress_callback=progress_update, - 
max_workers=max_workers, - ) - - if embed_result["success"]: - data = embed_result["result"] - results["steps"]["embeddings"] = { - "success": True, - "chunks_created": data.get("chunks_created", 0), - "files_processed": data.get("files_processed", 0), - } - if not json_mode: - console.print(f" [green]OK[/green] Generated {data.get('chunks_created', 0)} chunks for {data.get('files_processed', 0)} files") - else: - results["steps"]["embeddings"] = { - "success": False, - "error": embed_result.get("error"), - } - if not json_mode: - console.print(f" [yellow]Warning:[/yellow] {embed_result.get('error', 'Unknown error')}") - - except Exception as e: - results["steps"]["embeddings"] = {"success": False, "error": str(e)} - if not json_mode: - console.print(f" [yellow]Warning:[/yellow] {e}") - - # Summary - if json_mode: - print_json(success=True, result=results) - else: - console.print(f"\n[bold]Indexing Complete[/bold]") - init_ok = results["steps"].get("init", {}).get("success", False) - emb_ok = results["steps"].get("embeddings", {}).get("success", False) - console.print(f" FTS Index: {'[green]OK[/green]' if init_ok else '[red]Failed[/red]'}") - console.print(f" Embeddings: {'[green]OK[/green]' if emb_ok else '[yellow]Partial/Skipped[/yellow]'}") - - -# ==================== Index Migration Commands ==================== - -# Index version for migration tracking (file-based version marker) -INDEX_FORMAT_VERSION = "2.0" -INDEX_VERSION_FILE = "_index_version.txt" - - -def _get_index_version(index_root: Path) -> Optional[str]: - """Read index format version from version marker file. 
- - Args: - index_root: Root directory of the index - - Returns: - Version string if file exists, None otherwise - """ - version_file = index_root / INDEX_VERSION_FILE - if version_file.exists(): - try: - return version_file.read_text(encoding="utf-8").strip() - except Exception: - return None - return None - - -def _set_index_version(index_root: Path, version: str) -> None: - """Write index format version to version marker file. - - Args: - index_root: Root directory of the index - version: Version string to write - """ - version_file = index_root / INDEX_VERSION_FILE - version_file.write_text(version, encoding="utf-8") - - -def _discover_distributed_hnsw(index_root: Path) -> List[Dict[str, Any]]: - """Discover distributed HNSW index files. - - Scans for .hnsw files that are stored alongside _index.db files. - This is the old distributed format that needs migration. - - Args: - index_root: Root directory to scan - - Returns: - List of dicts with hnsw_path, size_bytes - """ - results = [] - - for hnsw_path in index_root.rglob("*.hnsw"): - try: - size = hnsw_path.stat().st_size - results.append({ - "hnsw_path": hnsw_path, - "size_bytes": size, - }) - except Exception: - pass - - return results - - -def _check_centralized_storage(index_root: Path) -> Dict[str, Any]: - """Check for centralized storage files. 
- - Args: - index_root: Root directory to check - - Returns: - Dict with has_vectors, vector_stats - """ - from codexlens.config import VECTORS_HNSW_NAME - - vectors_hnsw = index_root / VECTORS_HNSW_NAME - - result = { - "has_vectors": vectors_hnsw.exists(), - "vectors_path": str(vectors_hnsw) if vectors_hnsw.exists() else None, - "vector_stats": None, - } - - # Get vector stats if exists - if vectors_hnsw.exists(): - try: - result["vector_stats"] = { - "size_bytes": vectors_hnsw.stat().st_size, - } - except Exception: - pass - - return result - - -@index_app.command("migrate") -def index_migrate_cmd( - path: Annotated[Optional[str], typer.Argument(help="Project path to migrate")] = None, - dry_run: Annotated[bool, typer.Option("--dry-run", help="Show what would be migrated without making changes")] = False, - force: Annotated[bool, typer.Option("--force", help="Force migration even if already migrated")] = False, - json_mode: Annotated[bool, typer.Option("--json", help="Output JSON response")] = False, - verbose: Annotated[bool, typer.Option("--verbose", "-v", help="Enable verbose output")] = False, -) -> None: - """Migrate old distributed index to new centralized architecture. - - This command upgrades indexes from the old distributed storage format - (where vectors were stored in each _index.db) to the new centralized - format (single _vectors.hnsw at index root). - - Migration Steps: - 1. Detect if migration is needed (check version marker) - 2. Discover distributed .hnsw files - 3. Report current status - 4. Create version marker (unless --dry-run) - - Use --dry-run to preview what would be migrated without making changes. - Use --force to re-run migration even if version marker exists. - - Note: For full data migration (vectors consolidation), run: - codexlens index embeddings --force - - Examples: - codexlens index migrate ~/projects/my-app --dry-run - codexlens index migrate . 
--force - codexlens index migrate --json - """ - _configure_logging(verbose, json_mode) - - # Resolve target path - if path: - target_path = Path(path).expanduser().resolve() - else: - target_path = Path.cwd() - - if not target_path.exists(): - if json_mode: - print_json(success=False, error=f"Path does not exist: {target_path}") - else: - console.print(f"[red]Error:[/red] Path does not exist: {target_path}") - raise typer.Exit(code=1) - - # Find index root - registry: RegistryStore | None = None - index_root: Optional[Path] = None - - try: - registry = RegistryStore() - registry.initialize() - mapper = PathMapper() - - # Check if path is a project with an index - project_info = registry.get_project(target_path) - if project_info: - index_root = Path(project_info.index_root) - else: - # Try to find index via mapper - index_db = mapper.source_to_index_db(target_path) - if index_db.exists(): - index_root = index_db.parent - finally: - if registry: - registry.close() - - if not index_root or not index_root.exists(): - if json_mode: - print_json(success=False, error=f"No index found for: {target_path}") - else: - console.print(f"[red]Error:[/red] No index found for: {target_path}") - console.print("[dim]Run 'codexlens init' first to create an index.[/dim]") - raise typer.Exit(code=1) - - if not json_mode: - console.print(f"[bold]Index Migration Check[/bold]") - console.print(f"Source path: [dim]{target_path}[/dim]") - console.print(f"Index root: [dim]{index_root}[/dim]") - if dry_run: - console.print("[yellow]Mode: DRY RUN (no changes will be made)[/yellow]") - console.print() - - # Check current version - current_version = _get_index_version(index_root) - needs_migration = current_version is None or (force and current_version != INDEX_FORMAT_VERSION) - - if current_version and current_version >= INDEX_FORMAT_VERSION and not force: - result = { - "path": str(target_path), - "index_root": str(index_root), - "current_version": current_version, - "target_version": 
INDEX_FORMAT_VERSION, - "needs_migration": False, - "message": "Index is already at the latest version", - } - - if json_mode: - print_json(success=True, result=result) - else: - console.print(f"[green]OK[/green] Index is already at version {current_version}") - console.print("[dim]No migration needed. Use --force to re-run migration.[/dim]") - return - - # Discover distributed data - distributed_hnsw = _discover_distributed_hnsw(index_root) - centralized = _check_centralized_storage(index_root) - - # Count all _index.db files - all_index_dbs = list(index_root.rglob("_index.db")) - - # Build migration report - migration_report = { - "path": str(target_path), - "index_root": str(index_root), - "dry_run": dry_run, - "current_version": current_version, - "target_version": INDEX_FORMAT_VERSION, - "needs_migration": needs_migration, - "discovery": { - "total_index_dbs": len(all_index_dbs), - "distributed_hnsw_count": len(distributed_hnsw), - "distributed_hnsw_total_bytes": sum(d["size_bytes"] for d in distributed_hnsw), - }, - "centralized": centralized, - "recommendations": [], - } - - # Generate recommendations - if distributed_hnsw and not centralized["has_vectors"]: - migration_report["recommendations"].append( - f"Run 'codexlens embeddings-generate {target_path} --recursive --force' to consolidate vector data" - ) - - if not distributed_hnsw: - migration_report["recommendations"].append( - "No distributed data found. Index may already be using centralized storage." 
- ) - - if json_mode: - # Perform migration action (set version marker) unless dry-run - if not dry_run and needs_migration: - _set_index_version(index_root, INDEX_FORMAT_VERSION) - migration_report["migrated"] = True - migration_report["new_version"] = INDEX_FORMAT_VERSION - else: - migration_report["migrated"] = False - - print_json(success=True, result=migration_report) - else: - # Display discovery results - console.print("[bold]Discovery Results:[/bold]") - console.print(f" Total _index.db files: {len(all_index_dbs)}") - console.print() - - # Distributed HNSW - console.print("[bold]Distributed HNSW Files:[/bold]") - if distributed_hnsw: - total_size = sum(d["size_bytes"] for d in distributed_hnsw) - console.print(f" Found {len(distributed_hnsw)} .hnsw files") - console.print(f" Total size: {total_size / (1024 * 1024):.1f} MB") - if verbose: - for d in distributed_hnsw[:5]: - console.print(f" [dim]{d['hnsw_path'].name}: {d['size_bytes'] / 1024:.1f} KB[/dim]") - if len(distributed_hnsw) > 5: - console.print(f" [dim]... 
and {len(distributed_hnsw) - 5} more[/dim]") - else: - console.print(" [dim]None found (already centralized or not generated)[/dim]") - console.print() - - # Centralized storage status - console.print("[bold]Centralized Storage:[/bold]") - if centralized["has_vectors"]: - stats = centralized.get("vector_stats") or {} - size_mb = stats.get("size_bytes", 0) / (1024 * 1024) - console.print(f" [green]OK[/green] _vectors.hnsw exists ({size_mb:.1f} MB)") - else: - console.print(f" [yellow]--[/yellow] _vectors.hnsw not found") - console.print() - - # Migration action - if not dry_run and needs_migration: - _set_index_version(index_root, INDEX_FORMAT_VERSION) - console.print(f"[green]OK[/green] Version marker created: {INDEX_FORMAT_VERSION}") - elif dry_run: - console.print(f"[yellow]DRY RUN:[/yellow] Would create version marker: {INDEX_FORMAT_VERSION}") - - # Recommendations - if migration_report["recommendations"]: - console.print("\n[bold]Recommendations:[/bold]") - for rec in migration_report["recommendations"]: - console.print(f" [cyan]>[/cyan] {rec}") - - -# ==================== Deprecated Command Aliases ==================== -# These commands maintain backward compatibility with the old CLI structure. -# They display deprecation warnings and delegate to the new `index` subcommands. 
- - -@app.command("embeddings-generate", hidden=True, deprecated=True) -def embeddings_generate_deprecated( - path: Path = typer.Argument( - ..., - exists=True, - help="Path to _index.db file or project directory.", - ), - backend: str = typer.Option( - "fastembed", - "--backend", - "-b", - help="Embedding backend: fastembed (local) or litellm (remote API).", - ), - model: str = typer.Option( - "code", - "--model", - "-m", - help="Model: profile name for fastembed or model name for litellm.", - ), - force: bool = typer.Option( - False, - "--force", - "-f", - help="Force regeneration even if embeddings exist.", - ), - chunk_size: int = typer.Option( - 2000, - "--chunk-size", - help="Maximum chunk size in characters.", - ), - max_workers: int = typer.Option( - 1, - "--max-workers", - "-w", - min=1, - help="Max concurrent API calls.", - ), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output."), - centralized: bool = typer.Option( - False, - "--centralized", - "-c", - help="Use centralized vector storage (default).", - ), - distributed: bool = typer.Option( - False, - "--distributed", - "-d", - help="Use distributed per-directory indexes.", - ), -) -> None: - """[Deprecated] Use 'codexlens index embeddings' instead.""" - _deprecated_command_warning("embeddings-generate", "index embeddings") - if centralized and distributed: - _fail_mutually_exclusive("--centralized", "--distributed", json_mode) - index_embeddings( - path=path, - backend=backend, - model=model, - force=force, - chunk_size=chunk_size, - max_workers=max_workers, - json_mode=json_mode, - verbose=verbose, - centralized=centralized, - distributed=distributed, - ) - - -@app.command("init", hidden=True, deprecated=True) -def init_deprecated( - path: Path = typer.Argument(Path("."), exists=True, file_okay=False, dir_okay=True, help="Project root to index."), - language: Optional[List[str]] = 
typer.Option(None, "--language", "-l", help="Limit indexing to specific languages."), - workers: Optional[int] = typer.Option(None, "--workers", "-w", min=1, help="Parallel worker processes."), - force: bool = typer.Option(False, "--force", "-f", help="Force full reindex."), - no_embeddings: bool = typer.Option(False, "--no-embeddings", help="Skip automatic embedding generation."), - backend: str = typer.Option("fastembed", "--backend", "-b", help="Embedding backend."), - model: str = typer.Option("code", "--model", "-m", help="Embedding model."), - max_workers: int = typer.Option(1, "--max-workers", min=1, help="Max concurrent API calls."), - json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), - verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."), -) -> None: - """[Deprecated] Use 'codexlens index init' instead.""" - _deprecated_command_warning("init", "index init") - index_init( - path=path, - language=language, - workers=workers, - force=force, - no_embeddings=no_embeddings, - backend=backend, - model=model, - max_workers=max_workers, - json_mode=json_mode, - verbose=verbose, - ) - - - -@app.command("cascade-index", hidden=True, deprecated=True) -def cascade_index_deprecated( - path: Annotated[Path, typer.Argument(help="Directory to index")], - force: Annotated[bool, typer.Option("--force", "-f", help="Force regenerate")] = False, - batch_size: Annotated[int, typer.Option("--batch-size", "-b", help="Batch size for embedding")] = 32, - json_mode: Annotated[bool, typer.Option("--json", help="Output JSON response")] = False, - verbose: Annotated[bool, typer.Option("--verbose", "-v", help="Enable verbose logging")] = False, -) -> None: - """[Deprecated] Use 'codexlens index binary' instead.""" - _deprecated_command_warning("cascade-index", "index binary") - index_binary( - path=path, - force=force, - batch_size=batch_size, - json_mode=json_mode, - verbose=verbose, - ) - - -@app.command("index-migrate", 
hidden=True, deprecated=True) -def index_migrate_deprecated( - path: Annotated[Optional[str], typer.Argument(help="Project path to migrate")] = None, - dry_run: Annotated[bool, typer.Option("--dry-run", help="Show what would be migrated")] = False, - force: Annotated[bool, typer.Option("--force", help="Force migration")] = False, - json_mode: Annotated[bool, typer.Option("--json", help="Output JSON response")] = False, - verbose: Annotated[bool, typer.Option("--verbose", "-v", help="Enable verbose output")] = False, -) -> None: - """[Deprecated] Use 'codexlens index migrate' instead.""" - _deprecated_command_warning("index-migrate", "index migrate") - index_migrate_cmd( - path=path, - dry_run=dry_run, - force=force, - json_mode=json_mode, - verbose=verbose, - ) - - -# ==================== DeepWiki Commands ==================== - -deepwiki_app = typer.Typer(help="DeepWiki documentation generation commands") -app.add_typer(deepwiki_app, name="deepwiki") - - -@deepwiki_app.command("generate") -def deepwiki_generate( - path: Annotated[Path, typer.Argument(help="File or directory to generate docs for")] = Path("."), - force: Annotated[bool, typer.Option("--force", "-f", help="Force regeneration")] = False, - json_mode: Annotated[bool, typer.Option("--json", help="Output JSON response")] = False, - verbose: Annotated[bool, typer.Option("--verbose", "-v", help="Enable verbose logging")] = False, -) -> None: - """Generate DeepWiki documentation for source files. - - Scans source code, extracts symbols, and generates Markdown documentation - with incremental updates using SHA256 hashes for change detection. 
- - Examples: - codexlens deepwiki generate ./src - codexlens deepwiki generate ./src/auth.py - """ - from codexlens.tools.deepwiki_generator import DeepWikiGenerator - - _configure_logging(verbose, json_mode) - - path = Path(path).resolve() - if not path.exists(): - msg = f"Path not found: {path}" - if json_mode: - print_json(success=False, error=msg) - else: - console.print(f"[red]Error:[/red] {msg}") - raise typer.Exit(code=1) - - try: - generator = DeepWikiGenerator() - result = generator.run(path) - - if json_mode: - print_json(success=True, result=result) - else: - console.print(f"[green]DeepWiki generation complete:[/green]") - console.print(f" Files processed: {result['processed_files']}/{result['total_files']}") - console.print(f" Symbols found: {result['total_symbols']}") - console.print(f" Docs generated: {result['docs_generated']}") - if result['skipped_files'] > 0: - console.print(f" Files skipped (unchanged): {result['skipped_files']}") - - except Exception as e: - msg = f"DeepWiki generation failed: {e}" - if json_mode: - print_json(success=False, error=msg) - else: - console.print(f"[red]Error:[/red] {msg}") - raise typer.Exit(code=1) - - -@deepwiki_app.command("status") -def deepwiki_status( - json_mode: Annotated[bool, typer.Option("--json", help="Output JSON response")] = False, - verbose: Annotated[bool, typer.Option("--verbose", "-v", help="Enable verbose logging")] = False, -) -> None: - """Show DeepWiki documentation status. - - Displays statistics about indexed files and generated documentation. 
- """ - from codexlens.storage.deepwiki_store import DeepWikiStore - - _configure_logging(verbose, json_mode) - - try: - store = DeepWikiStore() - stats = store.get_stats() - - if json_mode: - print_json(success=True, result=stats) - else: - console.print("[cyan]DeepWiki Status:[/cyan]") - console.print(f" Files tracked: {stats.get('files_count', 0)}") - console.print(f" Symbols indexed: {stats.get('symbols_count', 0)}") - console.print(f" Docs generated: {stats.get('docs_count', 0)}") - - except Exception as e: - msg = f"Failed to get DeepWiki status: {e}" - if json_mode: - print_json(success=False, error=msg) - else: - console.print(f"[red]Error:[/red] {msg}") - raise typer.Exit(code=1) diff --git a/codex-lens/src/codexlens/cli/embedding_manager.py b/codex-lens/src/codexlens/cli/embedding_manager.py deleted file mode 100644 index 8bbb3a74..00000000 --- a/codex-lens/src/codexlens/cli/embedding_manager.py +++ /dev/null @@ -1,2377 +0,0 @@ -"""Embedding Manager - Manage semantic embeddings for code indexes. - -This module provides functions for generating and managing semantic embeddings -for code indexes, supporting both fastembed and litellm backends. - -Example Usage: - Generate embeddings for a single index: - - >>> from pathlib import Path - >>> from codexlens.cli.embedding_manager import generate_embeddings - >>> result = generate_embeddings( - ... index_path=Path("path/to/_index.db"), - ... force=True - ... ) - >>> if result["success"]: - ... print(f"Generated {result['total_chunks_created']} embeddings") - - Generate embeddings for an entire project with centralized index: - - >>> from codexlens.cli.embedding_manager import generate_dense_embeddings_centralized - >>> result = generate_dense_embeddings_centralized( - ... index_root=Path("path/to/project"), - ... force=True, - ... progress_callback=lambda msg: print(msg) - ... 
) - - Check if embeddings exist: - - >>> from codexlens.cli.embedding_manager import check_index_embeddings - >>> status = check_index_embeddings(Path("path/to/_index.db")) - >>> print(status["result"]["has_embeddings"]) - -Backward Compatibility: - The deprecated `discover_all_index_dbs()` function is maintained for compatibility. - `generate_embeddings_recursive()` is deprecated but functional; use - `generate_dense_embeddings_centralized()` instead. - The `EMBEDDING_BATCH_SIZE` constant is kept as a reference but actual batch size - is calculated dynamically via `calculate_dynamic_batch_size()`. -""" - -import gc -import json -import logging -import sqlite3 -import time -from concurrent.futures import ThreadPoolExecutor, as_completed -from itertools import islice -from pathlib import Path -from typing import Any, Dict, Generator, List, Optional, Tuple - -from codexlens.storage.index_filters import filter_index_paths - -try: - from codexlens.semantic import SEMANTIC_AVAILABLE, is_embedding_backend_available -except ImportError: - SEMANTIC_AVAILABLE = False - def is_embedding_backend_available(_backend: str): # type: ignore[no-redef] - return False, "codexlens.semantic not available" - -try: - from codexlens.semantic.vector_store import VectorStore -except ImportError: # pragma: no cover - VectorStore = None # type: ignore[assignment] - -try: - from codexlens.config import ( - BINARY_VECTORS_MMAP_NAME, - VECTORS_HNSW_NAME, - VECTORS_META_DB_NAME, - ) -except ImportError: - VECTORS_HNSW_NAME = "_vectors.hnsw" - VECTORS_META_DB_NAME = "_vectors_meta.db" - BINARY_VECTORS_MMAP_NAME = "_binary_vectors.mmap" - -try: - from codexlens.search.ranking import get_file_category -except ImportError: - def get_file_category(path: str): # type: ignore[no-redef] - """Fallback: map common extensions to category.""" - ext = Path(path).suffix.lower() - code_exts = {".py", ".js", ".jsx", ".ts", ".tsx", ".java", ".go", ".c", ".cpp", ".rs"} - doc_exts = {".md", ".mdx", ".txt", ".rst"} 
- if ext in code_exts: - return "code" - elif ext in doc_exts: - return "doc" - return None - -logger = logging.getLogger(__name__) - -# Embedding batch size - larger values improve throughput on modern hardware -# Benchmark: 256 gives ~2.35x speedup over 64 with DirectML GPU acceleration -EMBEDDING_BATCH_SIZE = 256 - - -def calculate_dynamic_batch_size(config, embedder) -> int: - """Calculate batch size dynamically based on model token capacity. - - This function computes an optimal batch size by considering: - - Maximum chunk character size from parsing rules - - Estimated tokens per chunk (chars / chars_per_token_estimate) - - Model's maximum token capacity - - Utilization factor (default 80% to leave headroom) - - Args: - config: Config object with api_batch_size_* settings. - embedder: Embedding model object with max_tokens property. - - Returns: - int: Calculated batch size, clamped to [1, api_batch_size_max]. - """ - # If dynamic calculation is disabled, return static value - if not getattr(config, 'api_batch_size_dynamic', False): - return getattr(config, 'api_batch_size', 8) - - # Get maximum chunk character size from ALL parsing rules (not just default) - # This ensures we use the worst-case chunk size across all languages - parsing_rules = getattr(config, 'parsing_rules', {}) - all_max_chunk_chars = [ - rule.get('max_chunk_chars', 0) - for rule in parsing_rules.values() - if isinstance(rule, dict) - ] - max_chunk_chars = max(all_max_chunk_chars) if all_max_chunk_chars else 4000 - if max_chunk_chars <= 0: - max_chunk_chars = 4000 # Final fallback - - # Get characters per token estimate - chars_per_token = getattr(config, 'chars_per_token_estimate', 4) - if chars_per_token <= 0: - chars_per_token = 4 # Safe default - - # Estimate tokens per chunk - estimated_tokens_per_chunk = max_chunk_chars / chars_per_token - - # Prevent division by zero - if estimated_tokens_per_chunk <= 0: - return getattr(config, 'api_batch_size', 8) - - # Get model's maximum token 
capacity - model_max_tokens = getattr(embedder, 'max_tokens', 8192) - - # Get utilization factor (default 80%, max 95% to leave safety margin) - utilization_factor = getattr(config, 'api_batch_size_utilization_factor', 0.8) - if utilization_factor <= 0 or utilization_factor > 0.95: - if utilization_factor > 0.95: - logger.warning( - "Utilization factor %.2f exceeds safe limit 0.95. " - "Token estimation is approximate, high values risk API errors. " - "Clamping to 0.95.", - utilization_factor - ) - utilization_factor = 0.95 - else: - utilization_factor = 0.8 - - # Calculate safe token limit - safe_token_limit = model_max_tokens * utilization_factor - - # Calculate dynamic batch size - dynamic_batch_size = int(safe_token_limit / estimated_tokens_per_chunk) - - # Get maximum batch size limit - batch_size_max = getattr(config, 'api_batch_size_max', 2048) - - # Clamp to [1, batch_size_max] - result = max(1, min(dynamic_batch_size, batch_size_max)) - - logger.debug( - "Dynamic batch size calculated: %d (max_chunk_chars=%d, chars_per_token=%d, " - "model_max_tokens=%d, utilization=%.1f%%, limit=%d)", - result, max_chunk_chars, chars_per_token, model_max_tokens, - utilization_factor * 100, batch_size_max - ) - - return result - - -def _build_categories_from_batch(chunk_batch: List[Tuple[Any, str]]) -> List[str]: - """Build categories list from chunk batch for index-level category filtering. 
- - Args: - chunk_batch: List of (chunk, file_path) tuples - - Returns: - List of category strings ('code' or 'doc'), defaulting to 'code' for unknown - """ - categories = [] - for _, file_path in chunk_batch: - cat = get_file_category(file_path) - categories.append(cat if cat else "code") # Default to 'code' for unknown extensions - return categories - - -def _cleanup_fastembed_resources() -> None: - """Best-effort cleanup for fastembed/ONNX resources (no-op for other backends).""" - try: - from codexlens.semantic.embedder import clear_embedder_cache - clear_embedder_cache() - except (ImportError, AttributeError): - # Expected when semantic module unavailable or cache function doesn't exist - pass - except Exception as exc: - # Log unexpected errors but don't fail cleanup - logger.debug(f"Unexpected error during fastembed cleanup: {exc}") - - -def _generate_chunks_from_cursor( - cursor, - chunker, - path_column: str, - file_batch_size: int, - failed_files: List[Tuple[str, str]], -) -> Generator[Tuple, None, Tuple[int, int]]: - """Generator that yields chunks from database cursor in a streaming fashion. - - This avoids loading all chunks into memory at once, significantly reducing - peak memory usage for large codebases. 
- - Args: - cursor: SQLite cursor with file data - chunker: Chunker instance for splitting files - path_column: Column name for file path - file_batch_size: Number of files to fetch at a time - failed_files: List to append failed files to - - Yields: - (chunk, file_path) tuples - - Returns: - (total_files_processed, batch_count) after iteration completes - """ - total_files = 0 - batch_count = 0 - - while True: - file_batch = cursor.fetchmany(file_batch_size) - if not file_batch: - break - - batch_count += 1 - - for file_row in file_batch: - file_path = file_row[path_column] - content = file_row["content"] - language = file_row["language"] or "python" - - try: - chunks = chunker.chunk_sliding_window( - content, - file_path=file_path, - language=language - ) - if chunks: - total_files += 1 - for chunk in chunks: - yield (chunk, file_path) - except (OSError, UnicodeDecodeError) as e: - # File access or encoding errors - logger.error(f"Failed to read file {file_path}: {e}") - failed_files.append((file_path, f"File read error: {e}")) - except ValueError as e: - # Chunking configuration errors - logger.error(f"Chunking config error for {file_path}: {e}") - failed_files.append((file_path, f"Chunking error: {e}")) - except Exception as e: - # Other unexpected errors - logger.error(f"Unexpected error processing {file_path}: {e}") - failed_files.append((file_path, f"Unexpected error: {e}")) - - -def _create_token_aware_batches( - chunk_generator: Generator, - max_tokens_per_batch: int = 8000, -) -> Generator[List[Tuple], None, None]: - """Group chunks by total token count instead of fixed count. - - Uses fast token estimation (len(content) // 4) for efficiency. - Yields batches when approaching the token limit. 
- - Args: - chunk_generator: Generator yielding (chunk, file_path) tuples - max_tokens_per_batch: Maximum tokens per batch (default: 8000) - - Yields: - List of (chunk, file_path) tuples representing a batch - """ - current_batch = [] - current_tokens = 0 - - for chunk, file_path in chunk_generator: - # Fast token estimation: len(content) // 4 - chunk_tokens = len(chunk.content) // 4 - - # If adding this chunk would exceed limit and we have items, yield current batch - if current_tokens + chunk_tokens > max_tokens_per_batch and current_batch: - yield current_batch - current_batch = [] - current_tokens = 0 - - # Add chunk to current batch - current_batch.append((chunk, file_path)) - current_tokens += chunk_tokens - - # Yield final batch if not empty - if current_batch: - yield current_batch - - -def _get_path_column(conn: sqlite3.Connection) -> str: - """Detect whether files table uses 'path' or 'full_path' column. - - Args: - conn: SQLite connection to the index database - - Returns: - Column name ('path' or 'full_path') - - Raises: - ValueError: If neither column exists in files table - """ - cursor = conn.execute("PRAGMA table_info(files)") - columns = {row[1] for row in cursor.fetchall()} - if 'full_path' in columns: - return 'full_path' - elif 'path' in columns: - return 'path' - raise ValueError("files table has neither 'path' nor 'full_path' column") - - -def check_index_embeddings(index_path: Path) -> Dict[str, any]: - """Check if an index has embeddings and return statistics. 
- - Args: - index_path: Path to _index.db file - - Returns: - Dictionary with embedding statistics and status - """ - if not index_path.exists(): - return { - "success": False, - "error": f"Index not found: {index_path}", - } - - try: - with sqlite3.connect(index_path) as conn: - # Check if semantic_chunks table exists - cursor = conn.execute( - "SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_chunks'" - ) - table_exists = cursor.fetchone() is not None - - if not table_exists: - # Count total indexed files even without embeddings - cursor = conn.execute("SELECT COUNT(*) FROM files") - total_files = cursor.fetchone()[0] - - return { - "success": True, - "result": { - "has_embeddings": False, - "total_chunks": 0, - "total_files": total_files, - "files_with_chunks": 0, - "files_without_chunks": total_files, - "coverage_percent": 0.0, - "missing_files_sample": [], - "index_path": str(index_path), - }, - } - - # Count total chunks - cursor = conn.execute("SELECT COUNT(*) FROM semantic_chunks") - total_chunks = cursor.fetchone()[0] - - # Count total indexed files - cursor = conn.execute("SELECT COUNT(*) FROM files") - total_files = cursor.fetchone()[0] - - # Count files with embeddings - cursor = conn.execute( - "SELECT COUNT(DISTINCT file_path) FROM semantic_chunks" - ) - files_with_chunks = cursor.fetchone()[0] - - # Get a sample of files without embeddings - path_column = _get_path_column(conn) - cursor = conn.execute(f""" - SELECT {path_column} - FROM files - WHERE {path_column} NOT IN ( - SELECT DISTINCT file_path FROM semantic_chunks - ) - LIMIT 5 - """) - missing_files = [row[0] for row in cursor.fetchall()] - - return { - "success": True, - "result": { - "has_embeddings": total_chunks > 0, - "total_chunks": total_chunks, - "total_files": total_files, - "files_with_chunks": files_with_chunks, - "files_without_chunks": total_files - files_with_chunks, - "coverage_percent": round((files_with_chunks / total_files * 100) if total_files > 0 else 0, 
1), - "missing_files_sample": missing_files, - "index_path": str(index_path), - }, - } - - except Exception as e: - return { - "success": False, - "error": f"Failed to check embeddings: {str(e)}", - } - - -def _sqlite_table_exists(conn: sqlite3.Connection, table_name: str) -> bool: - """Return whether a SQLite table exists.""" - cursor = conn.execute( - "SELECT name FROM sqlite_master WHERE type='table' AND name=?", - (table_name,), - ) - return cursor.fetchone() is not None - - -def _sqlite_count_rows(conn: sqlite3.Connection, table_name: str) -> int: - """Return row count for a table, or 0 when the table is absent.""" - if not _sqlite_table_exists(conn, table_name): - return 0 - cursor = conn.execute(f"SELECT COUNT(*) FROM {table_name}") - return int(cursor.fetchone()[0] or 0) - - -def _sqlite_count_distinct_rows(conn: sqlite3.Connection, table_name: str, column_name: str) -> int: - """Return distinct row count for a table column, or 0 when the table is absent.""" - if not _sqlite_table_exists(conn, table_name): - return 0 - cursor = conn.execute(f"SELECT COUNT(DISTINCT {column_name}) FROM {table_name}") - return int(cursor.fetchone()[0] or 0) - - -def _get_model_info_from_index(index_path: Path) -> Optional[Dict[str, Any]]: - """Read embedding model metadata from an index if available.""" - try: - with sqlite3.connect(index_path) as conn: - if not _sqlite_table_exists(conn, "embeddings_config"): - return None - from codexlens.semantic.vector_store import VectorStore - with VectorStore(index_path) as vs: - config = vs.get_model_config() - if not config: - return None - return { - "model_profile": config.get("model_profile"), - "model_name": config.get("model_name"), - "embedding_dim": config.get("embedding_dim"), - "backend": config.get("backend"), - "created_at": config.get("created_at"), - "updated_at": config.get("updated_at"), - } - except Exception: - return None - - -def _inspect_centralized_embeddings(index_root: Path) -> Dict[str, Any]: - """Inspect 
centralized vector artifacts stored directly at the current root.""" - dense_index_path = index_root / VECTORS_HNSW_NAME - meta_db_path = index_root / VECTORS_META_DB_NAME - binary_index_path = index_root / BINARY_VECTORS_MMAP_NAME - - result: Dict[str, Any] = { - "index_root": str(index_root), - "dense_index_path": str(dense_index_path) if dense_index_path.exists() else None, - "binary_index_path": str(binary_index_path) if binary_index_path.exists() else None, - "meta_db_path": str(meta_db_path) if meta_db_path.exists() else None, - "dense_index_exists": dense_index_path.exists(), - "binary_index_exists": binary_index_path.exists(), - "meta_db_exists": meta_db_path.exists(), - "chunk_metadata_rows": 0, - "binary_vector_rows": 0, - "files_with_embeddings": 0, - "dense_ready": False, - "binary_ready": False, - "usable": False, - } - - if not meta_db_path.exists(): - return result - - try: - with sqlite3.connect(meta_db_path) as conn: - result["chunk_metadata_rows"] = _sqlite_count_rows(conn, "chunk_metadata") - result["binary_vector_rows"] = _sqlite_count_rows(conn, "binary_vectors") - result["files_with_embeddings"] = _sqlite_count_distinct_rows(conn, "chunk_metadata", "file_path") - except Exception as exc: - result["error"] = f"Failed to inspect centralized metadata: {exc}" - return result - - result["dense_ready"] = result["dense_index_exists"] and result["chunk_metadata_rows"] > 0 - result["binary_ready"] = ( - result["binary_index_exists"] - and result["chunk_metadata_rows"] > 0 - and result["binary_vector_rows"] > 0 - ) - result["usable"] = result["dense_ready"] or result["binary_ready"] - return result - - -def _get_embedding_defaults() -> tuple[str, str, bool, List, str, float]: - """Get default embedding settings from config. 
- - Returns: - Tuple of (backend, model, use_gpu, endpoints, strategy, cooldown) - """ - try: - from codexlens.config import Config - config = Config.load() - return ( - config.embedding_backend, - config.embedding_model, - config.embedding_use_gpu, - config.embedding_endpoints, - config.embedding_strategy, - config.embedding_cooldown, - ) - except (ImportError, AttributeError, OSError, ValueError) as exc: - # Config not available or malformed - use defaults - logger.debug(f"Using default embedding config (config load failed): {exc}") - return "fastembed", "code", True, [], "latency_aware", 60.0 - except Exception as exc: - # Unexpected error - still use defaults but log - logger.warning(f"Unexpected error loading embedding config: {exc}") - return "fastembed", "code", True, [], "latency_aware", 60.0 - - -def _apply_embedding_config_defaults( - embedding_backend: Optional[str], - model_profile: Optional[str], - use_gpu: Optional[bool], - endpoints: Optional[List], - strategy: Optional[str], - cooldown: Optional[float], -) -> tuple[str, str, bool, List, str, float]: - """Apply config defaults to embedding parameters. - - This helper function reduces code duplication across embedding generation - functions by centralizing the default value application logic. - - Args: - embedding_backend: Embedding backend (fastembed/litellm) or None for default - model_profile: Model profile/name or None for default - use_gpu: GPU flag or None for default - endpoints: API endpoints list or None for default - strategy: Selection strategy or None for default - cooldown: Cooldown seconds or None for default - - Returns: - Tuple of (backend, model, use_gpu, endpoints, strategy, cooldown) with - defaults applied where None was passed. 
- """ - (default_backend, default_model, default_gpu, - default_endpoints, default_strategy, default_cooldown) = _get_embedding_defaults() - - backend = embedding_backend if embedding_backend is not None else default_backend - model = model_profile if model_profile is not None else default_model - gpu = use_gpu if use_gpu is not None else default_gpu - eps = endpoints if endpoints is not None else default_endpoints - strat = strategy if strategy is not None else default_strategy - cool = cooldown if cooldown is not None else default_cooldown - - return backend, model, gpu, eps, strat, cool - - -def _calculate_max_workers( - embedding_backend: str, - endpoints: Optional[List], - max_workers: Optional[int], -) -> int: - """Calculate optimal max_workers based on backend and endpoint count. - - Args: - embedding_backend: The embedding backend being used - endpoints: List of API endpoints (for litellm multi-endpoint mode) - max_workers: Explicitly specified max_workers or None for auto-calculation - - Returns: - Calculated or specified max_workers value - """ - if max_workers is not None: - return max_workers - - endpoint_count = len(endpoints) if endpoints else 1 - - # Set dynamic max_workers default based on backend type and endpoint count - # - FastEmbed: CPU-bound, sequential is optimal (1 worker) - # - LiteLLM single endpoint: 4 workers default - # - LiteLLM multi-endpoint: workers = endpoint_count * 2 (to saturate all APIs) - if embedding_backend == "litellm": - if endpoint_count > 1: - return endpoint_count * 2 # No cap, scale with endpoints - else: - return 4 - else: - return 1 - - -def _initialize_embedder_and_chunker( - embedding_backend: str, - model_profile: str, - use_gpu: bool, - endpoints: Optional[List], - strategy: str, - cooldown: float, - chunk_size: int, - overlap: int, -) -> tuple: - """Initialize embedder and chunker for embedding generation. 
- - This helper function reduces code duplication by centralizing embedder - and chunker initialization logic. - - Args: - embedding_backend: The embedding backend (fastembed/litellm) - model_profile: Model profile or name - use_gpu: Whether to use GPU acceleration - endpoints: Optional API endpoints for load balancing - strategy: Selection strategy for multi-endpoint mode - cooldown: Cooldown seconds for rate-limited endpoints - chunk_size: Maximum chunk size in characters - overlap: Overlap size in characters - - Returns: - Tuple of (embedder, chunker, endpoint_count) - - Raises: - ValueError: If embedding_backend is invalid - """ - from codexlens.semantic.factory import get_embedder as get_embedder_factory - from codexlens.semantic.chunker import Chunker, ChunkConfig - from codexlens.config import Config - - # Initialize embedder using factory (supports fastembed, litellm, and rotational) - # For fastembed: model_profile is a profile name (fast/code/multilingual/balanced) - # For litellm: model_profile is a model name (e.g., qwen3-embedding) - # For multi-endpoint: endpoints list enables load balancing - if embedding_backend == "fastembed": - embedder = get_embedder_factory(backend="fastembed", profile=model_profile, use_gpu=use_gpu) - elif embedding_backend == "litellm": - embedder = get_embedder_factory( - backend="litellm", - model=model_profile, - endpoints=endpoints if endpoints else None, - strategy=strategy, - cooldown=cooldown, - ) - else: - raise ValueError(f"Invalid embedding backend: {embedding_backend}. 
Must be 'fastembed' or 'litellm'.") - - # skip_token_count=True: Use fast estimation (len/4) instead of expensive tiktoken - # This significantly reduces CPU usage with minimal impact on metadata accuracy - # Load chunk stripping config from settings - chunk_cfg = Config.load() - chunker = Chunker(config=ChunkConfig( - max_chunk_size=chunk_size, - overlap=overlap, - skip_token_count=True, - strip_comments=getattr(chunk_cfg, 'chunk_strip_comments', True), - strip_docstrings=getattr(chunk_cfg, 'chunk_strip_docstrings', True), - )) - - endpoint_count = len(endpoints) if endpoints else 1 - return embedder, chunker, endpoint_count - - -def generate_embeddings( - index_path: Path, - embedding_backend: Optional[str] = None, - model_profile: Optional[str] = None, - force: bool = False, - chunk_size: int = 2000, - overlap: int = 200, - progress_callback: Optional[callable] = None, - use_gpu: Optional[bool] = None, - max_tokens_per_batch: Optional[int] = None, - max_workers: Optional[int] = None, - endpoints: Optional[List] = None, - strategy: Optional[str] = None, - cooldown: Optional[float] = None, -) -> Dict[str, any]: - """Generate embeddings for an index using memory-efficient batch processing. - - This function processes files in small batches to keep memory usage under 2GB, - regardless of the total project size. Supports concurrent API calls for - LiteLLM backend to improve throughput. - - Args: - index_path: Path to _index.db file. - embedding_backend: Embedding backend to use (fastembed or litellm). - Defaults to config setting. - model_profile: Model profile for fastembed (fast, code, multilingual, balanced) - or model name for litellm (e.g., qwen3-embedding). - Defaults to config setting. - force: If True, regenerate even if embeddings exist. - chunk_size: Maximum chunk size in characters. - overlap: Overlap size in characters for sliding window chunking (default: 200). - progress_callback: Optional callback for progress updates. 
- use_gpu: Whether to use GPU acceleration (fastembed only). - Defaults to config setting. - max_tokens_per_batch: Maximum tokens per batch for token-aware batching. - If None, attempts to get from embedder.max_tokens, - then falls back to 8000. If set, overrides automatic detection. - max_workers: Maximum number of concurrent API calls. - If None, uses dynamic defaults based on backend and endpoint count. - endpoints: Optional list of endpoint configurations for multi-API load balancing. - Each dict has keys: model, api_key, api_base, weight. - strategy: Selection strategy for multi-endpoint mode (round_robin, latency_aware). - cooldown: Default cooldown seconds for rate-limited endpoints. - - Returns: - Dict[str, any]: Result dictionary with generation statistics. - Contains keys: success, error (if failed), files_processed, - total_chunks_created, execution_time, etc. - - Raises: - ValueError: If embedding_backend is invalid. - ImportError: If semantic module is not available. - """ - # Apply config defaults - embedding_backend, model_profile, use_gpu, endpoints, strategy, cooldown = \ - _apply_embedding_config_defaults( - embedding_backend, model_profile, use_gpu, endpoints, strategy, cooldown - ) - - # Calculate max_workers - max_workers = _calculate_max_workers(embedding_backend, endpoints, max_workers) - - backend_available, backend_error = is_embedding_backend_available(embedding_backend) - if not backend_available: - return {"success": False, "error": backend_error or "Embedding backend not available"} - - if not index_path.exists(): - return { - "success": False, - "error": f"Index not found: {index_path}", - } - - # Check existing chunks - status = check_index_embeddings(index_path) - if not status["success"]: - return status - - existing_chunks = status["result"]["total_chunks"] - - if existing_chunks > 0 and not force: - return { - "success": False, - "error": f"Index already has {existing_chunks} chunks. 
Use --force to regenerate.", - "existing_chunks": existing_chunks, - } - - if force and existing_chunks > 0: - if progress_callback: - progress_callback(f"Clearing {existing_chunks} existing chunks...") - - try: - with sqlite3.connect(index_path) as conn: - conn.execute("DELETE FROM semantic_chunks") - conn.commit() - except sqlite3.DatabaseError as e: - return { - "success": False, - "error": f"Database error clearing chunks: {str(e)}", - } - except Exception as e: - return { - "success": False, - "error": f"Failed to clear existing chunks: {str(e)}", - } - - # Initialize embedder and chunker using helper - try: - embedder, chunker, endpoint_count = _initialize_embedder_and_chunker( - embedding_backend, model_profile, use_gpu, endpoints, strategy, cooldown, - chunk_size, overlap - ) - - # Log embedder info with endpoint count for multi-endpoint mode - if progress_callback: - if endpoint_count > 1: - progress_callback(f"Using {endpoint_count} API endpoints with {strategy} strategy") - progress_callback(f"Using model: {embedder.model_name} ({embedder.embedding_dim} dimensions)") - - # Calculate dynamic batch size based on model capacity - from codexlens.config import Config - batch_config = Config.load() - effective_batch_size = calculate_dynamic_batch_size(batch_config, embedder) - - if progress_callback and batch_config.api_batch_size_dynamic: - progress_callback(f"Dynamic batch size: {effective_batch_size} (model max_tokens={getattr(embedder, 'max_tokens', 8192)})") - - except (ImportError, ValueError) as e: - # Missing dependency or invalid configuration - return { - "success": False, - "error": f"Failed to initialize embedding components: {str(e)}", - } - except Exception as e: - # Other unexpected errors - return { - "success": False, - "error": f"Unexpected error initializing components: {str(e)}", - } - - # --- STREAMING PROCESSING --- - # Process files in batches to control memory usage - start_time = time.time() - failed_files = [] - total_chunks_created = 
0 - total_files_processed = 0 - FILE_BATCH_SIZE = 100 # Process 100 files at a time - # effective_batch_size is calculated above (dynamic or EMBEDDING_BATCH_SIZE fallback) - - try: - if VectorStore is None: - return { - "success": False, - "error": "Semantic search not available (VectorStore import failed). Install with: pip install codexlens[semantic]", - } - with VectorStore(index_path) as vector_store: - # Check model compatibility with existing embeddings - if not force: - is_compatible, warning = vector_store.check_model_compatibility( - model_profile, embedder.model_name, embedder.embedding_dim - ) - if not is_compatible: - return { - "success": False, - "error": warning, - } - - # Set/update model configuration for this index - vector_store.set_model_config( - model_profile, embedder.model_name, embedder.embedding_dim, backend=embedding_backend - ) - # Use bulk insert mode for efficient batch ANN index building - # This defers ANN updates until end_bulk_insert() is called - with vector_store.bulk_insert(): - with sqlite3.connect(index_path) as conn: - conn.row_factory = sqlite3.Row - path_column = _get_path_column(conn) - - # Get total file count for progress reporting - total_files = conn.execute("SELECT COUNT(*) FROM files").fetchone()[0] - if total_files == 0: - return {"success": False, "error": "No files found in index"} - - if progress_callback: - # Format must match Node.js parseProgressLine: "Processing N files" with 'embed' keyword - progress_callback(f"Processing {total_files} files for embeddings in batches of {FILE_BATCH_SIZE}...") - - cursor = conn.execute(f"SELECT {path_column}, content, language FROM files") - - # --- STREAMING GENERATOR APPROACH --- - # Instead of accumulating all chunks from 100 files, we use a generator - # that yields chunks on-demand, keeping memory usage low and constant. 
- chunk_generator = _generate_chunks_from_cursor( - cursor, chunker, path_column, FILE_BATCH_SIZE, failed_files - ) - - # Determine max tokens per batch - # Priority: explicit parameter > embedder.max_tokens > default 8000 - if max_tokens_per_batch is None: - max_tokens_per_batch = getattr(embedder, 'max_tokens', 8000) - - # Create token-aware batches or fall back to fixed-size batching - if max_tokens_per_batch: - batch_generator = _create_token_aware_batches( - chunk_generator, max_tokens_per_batch - ) - else: - # Fallback to fixed-size batching for backward compatibility - def fixed_size_batches(): - while True: - batch = list(islice(chunk_generator, effective_batch_size)) - if not batch: - break - yield batch - batch_generator = fixed_size_batches() - - batch_number = 0 - files_seen = set() - - def compute_embeddings_only(batch_data: Tuple[int, List[Tuple]]): - """Compute embeddings for a batch (no DB write) with retry logic. - - Args: - batch_data: Tuple of (batch_number, chunk_batch) - - Returns: - Tuple of (batch_num, chunk_batch, embeddings_numpy, batch_files, error) - """ - import random - - batch_num, chunk_batch = batch_data - batch_files = set() - for _, file_path in chunk_batch: - batch_files.add(file_path) - - max_retries = 5 - base_delay = 2.0 - - for attempt in range(max_retries + 1): - try: - batch_contents = [chunk.content for chunk, _ in chunk_batch] - embeddings_numpy = embedder.embed_to_numpy(batch_contents, batch_size=effective_batch_size) - return batch_num, chunk_batch, embeddings_numpy, batch_files, None - - except Exception as e: - error_str = str(e).lower() - # Check for retryable errors (rate limit, connection, backend issues) - # Note: Some backends (e.g., ModelScope) return 400 with nested 500 errors - is_retryable = any(x in error_str for x in [ - "429", "rate limit", "connection", "timeout", - "502", "503", "504", "service unavailable", - "500", "400", "badrequesterror", "internal server error", - "11434" # Ollama port - indicates 
backend routing issue - ]) - - if attempt < max_retries and is_retryable: - sleep_time = base_delay * (2 ** attempt) + random.uniform(0, 0.5) - logger.warning(f"Batch {batch_num} failed (attempt {attempt+1}/{max_retries+1}). " - f"Retrying in {sleep_time:.1f}s. Error: {e}") - time.sleep(sleep_time) - continue - - error_msg = f"Batch {batch_num}: {str(e)}" - logger.error(f"Failed to compute embeddings for batch {batch_num}: {str(e)}") - return batch_num, chunk_batch, None, batch_files, error_msg - - # Should not reach here, but just in case - return batch_num, chunk_batch, None, batch_files, f"Batch {batch_num}: Max retries exceeded" - - # Process batches based on max_workers setting - if max_workers <= 1: - # Sequential processing - stream directly from generator (no pre-materialization) - for chunk_batch in batch_generator: - batch_number += 1 - - # Track files in this batch - batch_files = set() - for _, file_path in chunk_batch: - batch_files.add(file_path) - - # Retry logic for transient backend errors - max_retries = 5 - base_delay = 2.0 - success = False - - for attempt in range(max_retries + 1): - try: - # Generate embeddings - batch_contents = [chunk.content for chunk, _ in chunk_batch] - embeddings_numpy = embedder.embed_to_numpy(batch_contents, batch_size=effective_batch_size) - - # Store embeddings with category - categories = _build_categories_from_batch(chunk_batch) - vector_store.add_chunks_batch_numpy(chunk_batch, embeddings_numpy, categories=categories) - - files_seen.update(batch_files) - total_chunks_created += len(chunk_batch) - total_files_processed = len(files_seen) - success = True - break - - except Exception as e: - error_str = str(e).lower() - # Check for retryable errors (rate limit, connection, backend issues) - is_retryable = any(x in error_str for x in [ - "429", "rate limit", "connection", "timeout", - "502", "503", "504", "service unavailable", - "500", "400", "badrequesterror", "internal server error", - "11434" # Ollama port - 
indicates backend routing issue - ]) - - if attempt < max_retries and is_retryable: - import random - sleep_time = base_delay * (2 ** attempt) + random.uniform(0, 0.5) - logger.warning(f"Batch {batch_number} failed (attempt {attempt+1}/{max_retries+1}). " - f"Retrying in {sleep_time:.1f}s. Error: {e}") - time.sleep(sleep_time) - continue - - logger.error(f"Failed to process batch {batch_number}: {str(e)}") - files_seen.update(batch_files) - break - - if success and progress_callback and batch_number % 10 == 0: - progress_callback(f" Batch {batch_number}: {total_chunks_created} chunks, {total_files_processed} files") - else: - # Concurrent processing - main thread iterates batches (SQLite safe), - # workers compute embeddings (parallel), main thread writes to DB (serial) - if progress_callback: - progress_callback(f"Processing with {max_workers} concurrent embedding workers...") - - with ThreadPoolExecutor(max_workers=max_workers) as executor: - pending_futures = {} # future -> (batch_num, chunk_batch) - completed_batches = 0 - last_reported_batch = 0 - - def process_completed_futures(): - """Process any completed futures and write to DB.""" - nonlocal total_chunks_created, total_files_processed, completed_batches, last_reported_batch - done_futures = [f for f in pending_futures if f.done()] - for f in done_futures: - try: - batch_num, chunk_batch, embeddings_numpy, batch_files, error = f.result() - if embeddings_numpy is not None and error is None: - # Write to DB in main thread (no contention) - categories = _build_categories_from_batch(chunk_batch) - vector_store.add_chunks_batch_numpy(chunk_batch, embeddings_numpy, categories=categories) - total_chunks_created += len(chunk_batch) - files_seen.update(batch_files) - total_files_processed = len(files_seen) - completed_batches += 1 - except Exception as e: - logger.error(f"Future raised exception: {e}") - completed_batches += 1 - del pending_futures[f] - - # Report progress based on completed batches (every 5 
batches) - if progress_callback and completed_batches >= last_reported_batch + 5: - progress_callback(f" Batch {completed_batches}: {total_chunks_created} chunks, {total_files_processed} files") - last_reported_batch = completed_batches - - # Iterate batches in main thread (SQLite cursor is main-thread bound) - for chunk_batch in batch_generator: - batch_number += 1 - - # Submit compute task to worker pool - future = executor.submit(compute_embeddings_only, (batch_number, chunk_batch)) - pending_futures[future] = batch_number - - # Process any completed futures to free memory and write to DB - process_completed_futures() - - # Backpressure: wait if too many pending - while len(pending_futures) >= max_workers * 2: - process_completed_futures() - if len(pending_futures) >= max_workers * 2: - time.sleep(0.1) # time is imported at module level - - # Wait for remaining futures - for future in as_completed(list(pending_futures.keys())): - try: - batch_num, chunk_batch, embeddings_numpy, batch_files, error = future.result() - if embeddings_numpy is not None and error is None: - categories = _build_categories_from_batch(chunk_batch) - vector_store.add_chunks_batch_numpy(chunk_batch, embeddings_numpy, categories=categories) - total_chunks_created += len(chunk_batch) - files_seen.update(batch_files) - total_files_processed = len(files_seen) - completed_batches += 1 - - # Report progress for remaining batches - if progress_callback and completed_batches >= last_reported_batch + 5: - progress_callback(f" Batch {completed_batches}: {total_chunks_created} chunks, {total_files_processed} files") - last_reported_batch = completed_batches - except Exception as e: - logger.error(f"Future raised exception: {e}") - - # Notify before ANN index finalization (happens when bulk_insert context exits) - if progress_callback: - progress_callback(f"Finalizing index... 
Building ANN index for {total_chunks_created} chunks") - - except Exception as e: - # Cleanup on error to prevent process hanging - try: - _cleanup_fastembed_resources() - gc.collect() - except Exception as cleanup_exc: - logger.debug(f"Cleanup error during exception handling: {cleanup_exc}") - return {"success": False, "error": f"Failed to read or process files: {str(e)}"} - - elapsed_time = time.time() - start_time - - # Final cleanup: release ONNX resources to allow process exit - # This is critical - without it, ONNX Runtime threads prevent Python from exiting - try: - _cleanup_fastembed_resources() - gc.collect() - except Exception as cleanup_exc: - logger.debug(f"Cleanup error during finalization: {cleanup_exc}") - - return { - "success": True, - "result": { - "chunks_created": total_chunks_created, - "files_processed": total_files_processed, - "files_failed": len(failed_files), - "elapsed_time": elapsed_time, - "model_profile": model_profile, - "model_name": embedder.model_name, - "failed_files": failed_files[:5], # First 5 failures - "index_path": str(index_path), - }, - } - - -def _discover_index_dbs_internal(index_root: Path) -> List[Path]: - """Internal helper to find all _index.db files (no deprecation warning). - - Used internally by generate_dense_embeddings_centralized. - - Args: - index_root: Root directory to scan for _index.db files - - Returns: - Sorted list of paths to _index.db files - """ - if not index_root.exists(): - return [] - - return sorted(filter_index_paths(index_root.rglob("_index.db"), index_root)) - - -def build_centralized_binary_vectors_from_existing( - index_root: Path, - *, - force: bool = False, - embedding_dim: Optional[int] = None, - progress_callback: Optional[callable] = None, -) -> Dict[str, Any]: - """Build centralized binary vectors + metadata from existing semantic_chunks embeddings. 
- - This is a fast-path for enabling the staged binary coarse search without - regenerating embeddings (and without triggering global model locks). - - It scans all distributed `_index.db` files under `index_root`, reads - existing `semantic_chunks.embedding` blobs, assigns new global chunk_ids, - and writes: - - `/_binary_vectors.mmap` (+ `.meta.json`) - - `/_vectors_meta.db` (chunk_metadata + binary_vectors) - """ - from codexlens.config import BINARY_VECTORS_MMAP_NAME, VECTORS_META_DB_NAME - from codexlens.storage.vector_meta_store import VectorMetadataStore - - index_root = Path(index_root).resolve() - vectors_meta_path = index_root / VECTORS_META_DB_NAME - mmap_path = index_root / BINARY_VECTORS_MMAP_NAME - meta_path = mmap_path.with_suffix(".meta.json") - - index_files = _discover_index_dbs_internal(index_root) - if not index_files: - return {"success": False, "error": f"No _index.db files found under {index_root}"} - - if progress_callback: - progress_callback(f"Scanning {len(index_files)} index databases for existing embeddings...") - - # First pass: detect embedding dims present. - dims_seen: Dict[int, int] = {} - selected_config: Optional[Dict[str, Any]] = None - - for index_path in index_files: - try: - with sqlite3.connect(index_path) as conn: - conn.row_factory = sqlite3.Row - has_table = conn.execute( - "SELECT 1 FROM sqlite_master WHERE type='table' AND name='semantic_chunks'" - ).fetchone() - if not has_table: - continue - - dim_row = conn.execute( - "SELECT backend, model_profile, model_name, embedding_dim FROM embeddings_config WHERE id=1" - ).fetchone() - if dim_row and dim_row[3]: - dim_val = int(dim_row[3]) - dims_seen[dim_val] = dims_seen.get(dim_val, 0) + 1 - if selected_config is None: - selected_config = { - "backend": dim_row[0], - "model_profile": dim_row[1], - "model_name": dim_row[2], - "embedding_dim": dim_val, - } - - # We count per-dim later after selecting a target dim. 
- except (sqlite3.DatabaseError, ValueError, TypeError): - # Skip corrupted or malformed indexes - continue - - if not dims_seen: - return {"success": False, "error": "No embeddings_config found under index_root"} - - if embedding_dim is None: - # Default: pick the most common embedding dim across indexes. - embedding_dim = max(dims_seen.items(), key=lambda kv: kv[1])[0] - - embedding_dim = int(embedding_dim) - - if progress_callback and len(dims_seen) > 1: - progress_callback(f"Mixed embedding dims detected, selecting dim={embedding_dim} (seen={dims_seen})") - - # Re-detect the selected model config for this dim (do not reuse an arbitrary first-seen config). - selected_config = None - - # Second pass: count only chunks matching selected dim. - total_chunks = 0 - for index_path in index_files: - try: - with sqlite3.connect(index_path) as conn: - conn.row_factory = sqlite3.Row - has_table = conn.execute( - "SELECT 1 FROM sqlite_master WHERE type='table' AND name='semantic_chunks'" - ).fetchone() - if not has_table: - continue - - dim_row = conn.execute( - "SELECT backend, model_profile, model_name, embedding_dim FROM embeddings_config WHERE id=1" - ).fetchone() - dim_val = int(dim_row[3]) if dim_row and dim_row[3] else None - if dim_val != embedding_dim: - continue - - if selected_config is None: - selected_config = { - "backend": dim_row[0], - "model_profile": dim_row[1], - "model_name": dim_row[2], - "embedding_dim": dim_val, - } - - row = conn.execute( - "SELECT COUNT(*) FROM semantic_chunks WHERE embedding IS NOT NULL AND length(embedding) > 0" - ).fetchone() - total_chunks += int(row[0] if row else 0) - except (sqlite3.DatabaseError, ValueError, TypeError): - # Skip corrupted or malformed indexes - continue - - if not total_chunks: - return { - "success": False, - "error": f"No existing embeddings found for embedding_dim={embedding_dim}", - "dims_seen": dims_seen, - } - - if progress_callback: - progress_callback(f"Found {total_chunks} embedded chunks 
(dim={embedding_dim}). Building binary vectors...") - - # Prepare output files / DB. - try: - import numpy as np - except ImportError as exc: - return {"success": False, "error": f"numpy required to build binary vectors: {exc}"} - - store = VectorMetadataStore(vectors_meta_path) - store._ensure_schema() - - if force: - try: - store.clear() - except Exception: - pass - try: - store.clear_binary_vectors() - except Exception: - pass - try: - if mmap_path.exists(): - mmap_path.unlink() - except Exception: - pass - try: - if meta_path.exists(): - meta_path.unlink() - except Exception: - pass - - bytes_per_vec = (int(embedding_dim) + 7) // 8 - mmap = np.memmap( - str(mmap_path), - dtype=np.uint8, - mode="w+", - shape=(int(total_chunks), int(bytes_per_vec)), - ) - - chunk_ids: List[int] = [] - chunks_batch: List[Dict[str, Any]] = [] - bin_ids_batch: List[int] = [] - bin_vecs_batch: List[bytes] = [] - batch_limit = 500 - - global_id = 1 - write_idx = 0 - - skipped_indexes: Dict[str, int] = {} - for index_path in index_files: - try: - with sqlite3.connect(index_path) as conn: - conn.row_factory = sqlite3.Row - has_table = conn.execute( - "SELECT 1 FROM sqlite_master WHERE type='table' AND name='semantic_chunks'" - ).fetchone() - if not has_table: - continue - - dim_row = conn.execute( - "SELECT embedding_dim FROM embeddings_config WHERE id=1" - ).fetchone() - dim_val = int(dim_row[0]) if dim_row and dim_row[0] else None - if dim_val != embedding_dim: - skipped_indexes[str(index_path)] = dim_val or -1 - continue - - rows = conn.execute( - "SELECT file_path, content, embedding, metadata, category FROM semantic_chunks " - "WHERE embedding IS NOT NULL AND length(embedding) > 0" - ).fetchall() - - for row in rows: - emb = np.frombuffer(row["embedding"], dtype=np.float32) - if emb.size != int(embedding_dim): - continue - - packed = np.packbits((emb > 0).astype(np.uint8)) - if packed.size != bytes_per_vec: - continue - - mmap[write_idx] = packed - write_idx += 1 - - cid = 
global_id - global_id += 1 - chunk_ids.append(cid) - - meta_raw = row["metadata"] - meta_dict: Dict[str, Any] = {} - if meta_raw: - try: - meta_dict = json.loads(meta_raw) if isinstance(meta_raw, str) else dict(meta_raw) - except Exception: - meta_dict = {} - - chunks_batch.append( - { - "chunk_id": cid, - "file_path": row["file_path"], - "content": row["content"], - "start_line": meta_dict.get("start_line"), - "end_line": meta_dict.get("end_line"), - "category": row["category"], - "metadata": meta_dict, - "source_index_db": str(index_path), - } - ) - - bin_ids_batch.append(cid) - bin_vecs_batch.append(packed.tobytes()) - - if len(chunks_batch) >= batch_limit: - store.add_chunks(chunks_batch) - store.add_binary_vectors(bin_ids_batch, bin_vecs_batch) - chunks_batch = [] - bin_ids_batch = [] - bin_vecs_batch = [] - - except Exception: - continue - - if chunks_batch: - store.add_chunks(chunks_batch) - store.add_binary_vectors(bin_ids_batch, bin_vecs_batch) - - mmap.flush() - del mmap - - # If we skipped inconsistent vectors, truncate metadata to actual write count. - chunk_ids = chunk_ids[:write_idx] - - # Write sidecar metadata. 
- with open(meta_path, "w", encoding="utf-8") as f: - json.dump( - { - "shape": [int(write_idx), int(bytes_per_vec)], - "chunk_ids": chunk_ids, - "embedding_dim": int(embedding_dim), - "backend": (selected_config or {}).get("backend"), - "model_profile": (selected_config or {}).get("model_profile"), - "model_name": (selected_config or {}).get("model_name"), - }, - f, - ) - - if progress_callback: - progress_callback(f"Binary vectors ready: {mmap_path} (rows={write_idx})") - - return { - "success": True, - "result": { - "index_root": str(index_root), - "index_files_scanned": len(index_files), - "chunks_total": int(total_chunks), - "chunks_written": int(write_idx), - "embedding_dim": int(embedding_dim), - "bytes_per_vector": int(bytes_per_vec), - "skipped_indexes": len(skipped_indexes), - "vectors_meta_db": str(vectors_meta_path), - "binary_mmap": str(mmap_path), - "binary_meta_json": str(meta_path), - }, - } - - -def discover_all_index_dbs(index_root: Path) -> List[Path]: - """Recursively find all _index.db files in an index tree. - - .. deprecated:: - This function is deprecated. Use centralized indexing with - ``generate_dense_embeddings_centralized`` instead, which handles - index discovery internally. - - Args: - index_root: Root directory to scan for _index.db files - - Returns: - Sorted list of paths to _index.db files - """ - import warnings - warnings.warn( - "discover_all_index_dbs is deprecated. Use centralized indexing with " - "generate_dense_embeddings_centralized instead.", - DeprecationWarning, - stacklevel=2 - ) - return _discover_index_dbs_internal(index_root) - - -def find_all_indexes(scan_dir: Path) -> List[Path]: - """Find all _index.db files in directory tree. 
- - Args: - scan_dir: Directory to scan - - Returns: - List of paths to _index.db files - """ - if not scan_dir.exists(): - return [] - - return _discover_index_dbs_internal(scan_dir) - - - -def generate_embeddings_recursive( - index_root: Path, - embedding_backend: Optional[str] = None, - model_profile: Optional[str] = None, - force: bool = False, - chunk_size: int = 2000, - overlap: int = 200, - progress_callback: Optional[callable] = None, - use_gpu: Optional[bool] = None, - max_tokens_per_batch: Optional[int] = None, - max_workers: Optional[int] = None, - endpoints: Optional[List] = None, - strategy: Optional[str] = None, - cooldown: Optional[float] = None, -) -> Dict[str, any]: - """Generate embeddings for all index databases in a project recursively. - - .. deprecated:: - This function is deprecated. Use ``generate_dense_embeddings_centralized`` - instead, which creates a single centralized vector index for the entire project - rather than per-directory indexes. - - Args: - index_root: Root index directory containing _index.db files - embedding_backend: Embedding backend to use (fastembed or litellm). - Defaults to config setting. - model_profile: Model profile for fastembed (fast, code, multilingual, balanced) - or model name for litellm (e.g., qwen3-embedding). - Defaults to config setting. - force: If True, regenerate even if embeddings exist - chunk_size: Maximum chunk size in characters - overlap: Overlap size in characters for sliding window chunking (default: 200) - progress_callback: Optional callback for progress updates - use_gpu: Whether to use GPU acceleration (fastembed only). - Defaults to config setting. - max_tokens_per_batch: Maximum tokens per batch for token-aware batching. - If None, attempts to get from embedder.max_tokens, - then falls back to 8000. If set, overrides automatic detection. - max_workers: Maximum number of concurrent API calls. - If None, uses dynamic defaults based on backend and endpoint count. 
- endpoints: Optional list of endpoint configurations for multi-API load balancing. - strategy: Selection strategy for multi-endpoint mode. - cooldown: Default cooldown seconds for rate-limited endpoints. - - Returns: - Aggregated result dictionary with generation statistics - """ - import warnings - warnings.warn( - "generate_embeddings_recursive is deprecated. Use " - "generate_dense_embeddings_centralized instead for centralized indexing.", - DeprecationWarning, - stacklevel=2 - ) - - # Apply config defaults - embedding_backend, model_profile, use_gpu, endpoints, strategy, cooldown = \ - _apply_embedding_config_defaults( - embedding_backend, model_profile, use_gpu, endpoints, strategy, cooldown - ) - - # Calculate max_workers - max_workers = _calculate_max_workers(embedding_backend, endpoints, max_workers) - - # Discover all _index.db files (using internal helper to avoid double deprecation warning) - index_files = _discover_index_dbs_internal(index_root) - - if not index_files: - return { - "success": False, - "error": f"No index databases found in {index_root}", - } - - if progress_callback: - progress_callback(f"Found {len(index_files)} index databases to process") - - # Process each index database - all_results = [] - total_chunks = 0 - total_files_processed = 0 - total_files_failed = 0 - - for idx, index_path in enumerate(index_files, 1): - if progress_callback: - try: - rel_path = index_path.relative_to(index_root) - except ValueError: - rel_path = index_path - # Format: "Processing file X/Y: path" to match Node.js parseProgressLine - progress_callback(f"Processing file {idx}/{len(index_files)}: {rel_path}") - - result = generate_embeddings( - index_path, - embedding_backend=embedding_backend, - model_profile=model_profile, - force=force, - chunk_size=chunk_size, - overlap=overlap, - progress_callback=None, # Don't cascade callbacks - use_gpu=use_gpu, - max_tokens_per_batch=max_tokens_per_batch, - max_workers=max_workers, - endpoints=endpoints, - 
strategy=strategy, - cooldown=cooldown, - ) - - all_results.append({ - "path": str(index_path), - "success": result["success"], - "result": result.get("result"), - "error": result.get("error"), - }) - - if result["success"]: - data = result["result"] - total_chunks += data["chunks_created"] - total_files_processed += data["files_processed"] - total_files_failed += data["files_failed"] - - successful = sum(1 for r in all_results if r["success"]) - - # Final cleanup after processing all indexes - # Each generate_embeddings() call does its own cleanup, but do a final one to be safe - try: - _cleanup_fastembed_resources() - gc.collect() - except Exception: - pass - - return { - "success": successful > 0, - "result": { - "indexes_processed": len(index_files), - "indexes_successful": successful, - "indexes_failed": len(index_files) - successful, - "total_chunks_created": total_chunks, - "total_files_processed": total_files_processed, - "total_files_failed": total_files_failed, - "model_profile": model_profile, - "details": all_results, - }, - } - - -def generate_dense_embeddings_centralized( - index_root: Path, - embedding_backend: Optional[str] = None, - model_profile: Optional[str] = None, - force: bool = False, - chunk_size: int = 2000, - overlap: int = 200, - progress_callback: Optional[callable] = None, - use_gpu: Optional[bool] = None, - max_tokens_per_batch: Optional[int] = None, - max_workers: Optional[int] = None, - endpoints: Optional[List] = None, - strategy: Optional[str] = None, - cooldown: Optional[float] = None, -) -> Dict[str, any]: - """Generate dense embeddings with centralized vector storage. - - This function creates a single HNSW index at the project root instead of - per-directory indexes. All chunks from all _index.db files are combined - into one central _vectors.hnsw file. 
- - Target architecture: - / - |-- _vectors.hnsw # Centralized dense vector ANN index - |-- src/ - |-- _index.db # No longer contains .hnsw file - - Args: - index_root: Root index directory containing _index.db files - embedding_backend: Embedding backend (fastembed or litellm) - model_profile: Model profile or name - force: If True, regenerate even if embeddings exist - chunk_size: Maximum chunk size in characters - overlap: Overlap size in characters - progress_callback: Optional callback for progress updates - use_gpu: Whether to use GPU acceleration - max_tokens_per_batch: Maximum tokens per batch - max_workers: Maximum concurrent workers - endpoints: Multi-endpoint configurations - strategy: Endpoint selection strategy - cooldown: Rate-limit cooldown seconds - - Returns: - Result dictionary with generation statistics - """ - from codexlens.config import VECTORS_HNSW_NAME - - # Apply config defaults - embedding_backend, model_profile, use_gpu, endpoints, strategy, cooldown = \ - _apply_embedding_config_defaults( - embedding_backend, model_profile, use_gpu, endpoints, strategy, cooldown - ) - - # Calculate max_workers - max_workers = _calculate_max_workers(embedding_backend, endpoints, max_workers) - - backend_available, backend_error = is_embedding_backend_available(embedding_backend) - if not backend_available: - return {"success": False, "error": backend_error or "Embedding backend not available"} - - # Discover all _index.db files - index_files = _discover_index_dbs_internal(index_root) - - if not index_files: - return { - "success": False, - "error": f"No index databases found in {index_root}", - } - - if progress_callback: - progress_callback(f"Found {len(index_files)} index databases for centralized embedding") - - # Pre-calculate estimated chunk count for HNSW capacity - # This avoids expensive resize operations during indexing - estimated_total_files = 0 - for index_path in index_files: - try: - with sqlite3.connect(index_path) as conn: - cursor = 
conn.execute("SELECT COUNT(*) FROM files") - estimated_total_files += cursor.fetchone()[0] - except Exception: - pass - # Heuristic: ~15 chunks per file on average - estimated_chunks = max(100000, estimated_total_files * 15) - - if progress_callback: - progress_callback(f"Estimated {estimated_total_files} files, ~{estimated_chunks} chunks") - - # Check for existing centralized index - central_hnsw_path = index_root / VECTORS_HNSW_NAME - if central_hnsw_path.exists() and not force: - return { - "success": False, - "error": f"Centralized vector index already exists at {central_hnsw_path}. Use --force to regenerate.", - } - - # Initialize embedder and chunker using helper - try: - from codexlens.semantic.ann_index import ANNIndex - - embedder, chunker, endpoint_count = _initialize_embedder_and_chunker( - embedding_backend, model_profile, use_gpu, endpoints, strategy, cooldown, - chunk_size, overlap - ) - - # Load chunk stripping config for batch size calculation - from codexlens.config import Config - batch_config = Config.load() - - if progress_callback: - if endpoint_count > 1: - progress_callback(f"Using {endpoint_count} API endpoints with {strategy} strategy") - progress_callback(f"Using model: {embedder.model_name} ({embedder.embedding_dim} dimensions)") - - # Calculate dynamic batch size based on model capacity - effective_batch_size = calculate_dynamic_batch_size(batch_config, embedder) - - if progress_callback and batch_config.api_batch_size_dynamic: - progress_callback(f"Dynamic batch size: {effective_batch_size} (model max_tokens={getattr(embedder, 'max_tokens', 8192)})") - - except Exception as e: - return { - "success": False, - "error": f"Failed to initialize components: {str(e)}", - } - - # Create centralized ANN index with pre-calculated capacity - # Using estimated_chunks avoids expensive resize operations during indexing - central_ann_index = ANNIndex.create_central( - index_root=index_root, - dim=embedder.embedding_dim, - 
initial_capacity=estimated_chunks, - auto_save=False, - ) - - # Process all index databases - start_time = time.time() - failed_files = [] - total_chunks_created = 0 - total_files_processed = 0 - all_chunk_ids = [] - all_embeddings = [] - - # Track chunk ID to file_path mapping for metadata - chunk_id_to_info: Dict[int, Dict[str, Any]] = {} - next_chunk_id = 1 - # Track current index_path for source_index_db field - current_index_path: Optional[str] = None - - for idx, index_path in enumerate(index_files, 1): - if progress_callback: - try: - rel_path = index_path.relative_to(index_root) - except ValueError: - rel_path = index_path - progress_callback(f"Processing {idx}/{len(index_files)}: {rel_path}") - - # Track current index_path for source_index_db - current_index_path = str(index_path) - - try: - with sqlite3.connect(index_path) as conn: - conn.row_factory = sqlite3.Row - path_column = _get_path_column(conn) - - # Get files from this index - cursor = conn.execute(f"SELECT {path_column}, content, language FROM files") - file_rows = cursor.fetchall() - - for file_row in file_rows: - file_path = file_row[path_column] - content = file_row["content"] - language = file_row["language"] or "python" - - try: - chunks = chunker.chunk_sliding_window( - content, - file_path=file_path, - language=language - ) - - if not chunks: - continue - - total_files_processed += 1 - - # Generate embeddings for this file's chunks - batch_contents = [chunk.content for chunk in chunks] - embeddings_numpy = embedder.embed_to_numpy(batch_contents, batch_size=effective_batch_size) - - # Assign chunk IDs and store embeddings - for i, chunk in enumerate(chunks): - chunk_id = next_chunk_id - next_chunk_id += 1 - - all_chunk_ids.append(chunk_id) - all_embeddings.append(embeddings_numpy[i]) - - # Store metadata for later retrieval - chunk_id_to_info[chunk_id] = { - "file_path": file_path, - "content": chunk.content, - "metadata": chunk.metadata, - "category": get_file_category(file_path) or 
"code", - "source_index_db": current_index_path, - } - total_chunks_created += 1 - - except Exception as e: - logger.error(f"Failed to process {file_path}: {e}") - failed_files.append((file_path, str(e))) - - except Exception as e: - logger.error(f"Failed to read index {index_path}: {e}") - failed_files.append((str(index_path), str(e))) - - # Add all embeddings to centralized ANN index - if all_embeddings: - if progress_callback: - progress_callback(f"Building centralized ANN index with {len(all_embeddings)} vectors...") - - try: - import numpy as np - embeddings_matrix = np.vstack(all_embeddings) - central_ann_index.add_vectors(all_chunk_ids, embeddings_matrix) - central_ann_index.save() - - if progress_callback: - progress_callback(f"Saved centralized index to {central_hnsw_path}") - - except Exception as e: - return { - "success": False, - "error": f"Failed to build centralized ANN index: {str(e)}", - } - - # Store chunk metadata in a centralized metadata database - vectors_meta_path = index_root / VECTORS_META_DB_NAME - if chunk_id_to_info: - if progress_callback: - progress_callback(f"Storing {len(chunk_id_to_info)} chunk metadata records...") - - try: - from codexlens.storage.vector_meta_store import VectorMetadataStore - - with VectorMetadataStore(vectors_meta_path) as meta_store: - # Convert chunk_id_to_info dict to list of dicts for batch insert - chunks_to_store = [] - for cid, info in chunk_id_to_info.items(): - metadata = info.get("metadata", {}) - chunks_to_store.append({ - "chunk_id": cid, - "file_path": info["file_path"], - "content": info["content"], - "start_line": metadata.get("start_line"), - "end_line": metadata.get("end_line"), - "category": info.get("category"), - "metadata": metadata, - "source_index_db": info.get("source_index_db"), - }) - - meta_store.add_chunks(chunks_to_store) - - if progress_callback: - progress_callback(f"Saved metadata to {vectors_meta_path}") - - except Exception as e: - logger.warning("Failed to store vector 
metadata: %s", e) - # Non-fatal: continue without centralized metadata - - # --- Binary Vector Generation for Cascade Search (Memory-Mapped) --- - binary_success = False - binary_count = 0 - try: - from codexlens.config import Config, BINARY_VECTORS_MMAP_NAME - config = Config.load() - - if getattr(config, 'enable_binary_cascade', True) and all_embeddings: - import numpy as np - - if progress_callback: - progress_callback(f"Generating binary vectors for {len(all_embeddings)} chunks...") - - # Binarize dense vectors: sign(x) -> 1 if x > 0, 0 otherwise - # Pack into bytes for efficient storage and Hamming distance computation - embeddings_matrix = np.vstack(all_embeddings) - binary_matrix = (embeddings_matrix > 0).astype(np.uint8) - - # Pack bits into bytes (8 bits per byte) - vectorized for all rows - packed_matrix = np.packbits(binary_matrix, axis=1) - binary_count = len(packed_matrix) - - # Save as memory-mapped file for efficient loading - binary_mmap_path = index_root / BINARY_VECTORS_MMAP_NAME - mmap_array = np.memmap( - str(binary_mmap_path), - dtype=np.uint8, - mode='w+', - shape=packed_matrix.shape - ) - mmap_array[:] = packed_matrix - mmap_array.flush() - del mmap_array # Close the memmap - - # Save metadata (shape and chunk_ids) to sidecar JSON - import json - meta_path = binary_mmap_path.with_suffix('.meta.json') - with open(meta_path, 'w') as f: - json.dump({ - 'shape': list(packed_matrix.shape), - 'chunk_ids': all_chunk_ids, - 'embedding_dim': embeddings_matrix.shape[1], - }, f) - - # Also store in DB for backward compatibility - from codexlens.storage.vector_meta_store import VectorMetadataStore - binary_packed_bytes = [row.tobytes() for row in packed_matrix] - with VectorMetadataStore(vectors_meta_path) as meta_store: - meta_store.add_binary_vectors(all_chunk_ids, binary_packed_bytes) - - binary_success = True - if progress_callback: - progress_callback(f"Generated {binary_count} binary vectors ({embeddings_matrix.shape[1]} dims -> 
{packed_matrix.shape[1]} bytes, mmap: {binary_mmap_path.name})") - - except Exception as e: - logger.warning("Binary vector generation failed: %s", e) - # Non-fatal: continue without binary vectors - - elapsed_time = time.time() - start_time - - # Cleanup - try: - _cleanup_fastembed_resources() - gc.collect() - except Exception: - pass - - return { - "success": True, - "result": { - "chunks_created": total_chunks_created, - "files_processed": total_files_processed, - "files_failed": len(failed_files), - "elapsed_time": elapsed_time, - "model_profile": model_profile, - "model_name": embedder.model_name, - "central_index_path": str(central_hnsw_path), - "failed_files": failed_files[:5], - "binary_success": binary_success, - "binary_count": binary_count, - }, - } - - -def get_embeddings_status(index_root: Path) -> Dict[str, any]: - """Get comprehensive embeddings coverage status for all indexes. - - Args: - index_root: Root index directory - - Returns: - Aggregated status with coverage statistics, model info, and timestamps - """ - index_files = _discover_index_dbs_internal(index_root) - centralized = _inspect_centralized_embeddings(index_root) - root_index_path = index_root / "_index.db" - root_index_exists = root_index_path.exists() - - if not index_files: - root_result = { - "index_path": str(root_index_path), - "exists": root_index_exists, - "total_files": 0, - "files_with_embeddings": 0, - "files_without_embeddings": 0, - "total_chunks": 0, - "coverage_percent": 0.0, - "has_embeddings": False, - "storage_mode": "none", - } - subtree_result = { - "total_indexes": 0, - "total_files": 0, - "files_with_embeddings": 0, - "files_without_embeddings": 0, - "total_chunks": 0, - "coverage_percent": 0.0, - "indexes_with_embeddings": 0, - "indexes_without_embeddings": 0, - } - return { - "success": True, - "result": { - "total_indexes": 0, - "total_files": 0, - "files_with_embeddings": 0, - "files_without_embeddings": 0, - "total_chunks": 0, - "coverage_percent": 0.0, - 
"indexes_with_embeddings": 0, - "indexes_without_embeddings": 0, - "model_info": None, - "root": root_result, - "subtree": subtree_result, - "centralized": centralized, - }, - } - - subtree_total_files = 0 - subtree_files_with_embeddings = 0 - subtree_total_chunks = 0 - subtree_indexes_with_embeddings = 0 - subtree_model_info = None - latest_updated_at = None - - for index_path in index_files: - status = check_index_embeddings(index_path) - if not status["success"]: - continue - - result = status["result"] - subtree_total_files += result["total_files"] - subtree_files_with_embeddings += result["files_with_chunks"] - subtree_total_chunks += result["total_chunks"] - - if not result["has_embeddings"]: - continue - - subtree_indexes_with_embeddings += 1 - candidate_model_info = _get_model_info_from_index(index_path) - if not candidate_model_info: - continue - if subtree_model_info is None: - subtree_model_info = candidate_model_info - latest_updated_at = candidate_model_info.get("updated_at") - continue - candidate_updated_at = candidate_model_info.get("updated_at") - if candidate_updated_at and (latest_updated_at is None or candidate_updated_at > latest_updated_at): - latest_updated_at = candidate_updated_at - - if subtree_model_info and latest_updated_at: - subtree_model_info["updated_at"] = latest_updated_at - - root_total_files = 0 - root_files_with_embeddings = 0 - root_total_chunks = 0 - root_has_embeddings = False - root_storage_mode = "none" - - if root_index_exists: - root_status = check_index_embeddings(root_index_path) - if root_status["success"]: - root_data = root_status["result"] - root_total_files = int(root_data["total_files"]) - if root_data["has_embeddings"]: - root_files_with_embeddings = int(root_data["files_with_chunks"]) - root_total_chunks = int(root_data["total_chunks"]) - root_has_embeddings = True - root_storage_mode = "distributed" - - if centralized["usable"]: - root_files_with_embeddings = int(centralized["files_with_embeddings"]) - 
root_total_chunks = int(centralized["chunk_metadata_rows"]) - root_has_embeddings = True - root_storage_mode = "centralized" if root_storage_mode == "none" else "mixed" - - model_info = None - if root_has_embeddings: - if root_storage_mode in {"distributed", "mixed"} and root_index_exists: - model_info = _get_model_info_from_index(root_index_path) - if model_info is None and root_storage_mode in {"centralized", "mixed"}: - model_info = subtree_model_info - - root_coverage_percent = round( - (root_files_with_embeddings / root_total_files * 100) if root_total_files > 0 else 0, - 1, - ) - root_files_without_embeddings = max(root_total_files - root_files_with_embeddings, 0) - - root_result = { - "index_path": str(root_index_path), - "exists": root_index_exists, - "total_files": root_total_files, - "files_with_embeddings": root_files_with_embeddings, - "files_without_embeddings": root_files_without_embeddings, - "total_chunks": root_total_chunks, - "coverage_percent": root_coverage_percent, - "has_embeddings": root_has_embeddings, - "storage_mode": root_storage_mode, - } - subtree_result = { - "total_indexes": len(index_files), - "total_files": subtree_total_files, - "files_with_embeddings": subtree_files_with_embeddings, - "files_without_embeddings": subtree_total_files - subtree_files_with_embeddings, - "total_chunks": subtree_total_chunks, - "coverage_percent": round( - (subtree_files_with_embeddings / subtree_total_files * 100) if subtree_total_files > 0 else 0, - 1, - ), - "indexes_with_embeddings": subtree_indexes_with_embeddings, - "indexes_without_embeddings": len(index_files) - subtree_indexes_with_embeddings, - } - - return { - "success": True, - "result": { - "total_indexes": 1 if root_index_exists else 0, - "total_files": root_total_files, - "files_with_embeddings": root_files_with_embeddings, - "files_without_embeddings": root_files_without_embeddings, - "total_chunks": root_total_chunks, - "coverage_percent": root_coverage_percent, - 
"indexes_with_embeddings": 1 if root_has_embeddings else 0, - "indexes_without_embeddings": 1 if root_index_exists and not root_has_embeddings else 0, - "model_info": model_info, - "root": root_result, - "subtree": subtree_result, - "centralized": centralized, - }, - } - - -def get_embedding_stats_summary(index_root: Path) -> Dict[str, any]: - """Get summary statistics for all indexes in root directory. - - Args: - index_root: Root directory containing indexes - - Returns: - Summary statistics for all indexes - """ - indexes = find_all_indexes(index_root) - - if not indexes: - return { - "success": True, - "result": { - "total_indexes": 0, - "indexes_with_embeddings": 0, - "total_chunks": 0, - "indexes": [], - }, - } - - total_chunks = 0 - indexes_with_embeddings = 0 - index_stats = [] - - for index_path in indexes: - status = check_index_embeddings(index_path) - - if status["success"]: - result = status["result"] - has_emb = result["has_embeddings"] - chunks = result["total_chunks"] - - if has_emb: - indexes_with_embeddings += 1 - total_chunks += chunks - - # Extract project name from path - project_name = index_path.parent.name - - index_stats.append({ - "project": project_name, - "path": str(index_path), - "has_embeddings": has_emb, - "total_chunks": chunks, - "total_files": result["total_files"], - "coverage_percent": result.get("coverage_percent", 0), - }) - - return { - "success": True, - "result": { - "total_indexes": len(indexes), - "indexes_with_embeddings": indexes_with_embeddings, - "total_chunks": total_chunks, - "indexes": index_stats, - }, - } - - -def scan_for_model_conflicts( - index_root: Path, - target_backend: str, - target_model: str, -) -> Dict[str, any]: - """Scan for model conflicts across all indexes in a directory. - - Checks if any existing embeddings were generated with a different - backend or model than the target configuration. 
- - Args: - index_root: Root index directory to scan - target_backend: Target embedding backend (fastembed or litellm) - target_model: Target model profile/name - - Returns: - Dictionary with: - - has_conflict: True if any index has different model config - - existing_config: Config from first index with embeddings (if any) - - target_config: The requested configuration - - conflicts: List of conflicting index paths with their configs - - indexes_with_embeddings: Count of indexes that have embeddings - """ - index_files = _discover_index_dbs_internal(index_root) - - if not index_files: - return { - "has_conflict": False, - "existing_config": None, - "target_config": {"backend": target_backend, "model": target_model}, - "conflicts": [], - "indexes_with_embeddings": 0, - } - - conflicts = [] - existing_config = None - indexes_with_embeddings = 0 - - for index_path in index_files: - try: - from codexlens.semantic.vector_store import VectorStore - - with VectorStore(index_path) as vs: - config = vs.get_model_config() - if config and config.get("model_profile"): - indexes_with_embeddings += 1 - - # Store first existing config as reference - if existing_config is None: - existing_config = { - "backend": config.get("backend"), - "model": config.get("model_profile"), - "model_name": config.get("model_name"), - "embedding_dim": config.get("embedding_dim"), - } - - # Check for conflict: different backend OR different model - existing_backend = config.get("backend", "") - existing_model = config.get("model_profile", "") - - if existing_backend != target_backend or existing_model != target_model: - conflicts.append({ - "path": str(index_path), - "existing": { - "backend": existing_backend, - "model": existing_model, - "model_name": config.get("model_name"), - }, - }) - except Exception as e: - logger.debug(f"Failed to check model config for {index_path}: {e}") - continue - - return { - "has_conflict": len(conflicts) > 0, - "existing_config": existing_config, - "target_config": 
{"backend": target_backend, "model": target_model}, - "conflicts": conflicts, - "indexes_with_embeddings": indexes_with_embeddings, - } - - -def _get_global_settings_path() -> Path: - """Get the path to global embedding settings file.""" - return Path.home() / ".codexlens" / "embedding_lock.json" - - -def get_locked_model_config() -> Optional[Dict[str, Any]]: - """Get the globally locked embedding model configuration. - - Returns: - Dictionary with backend and model if locked, None otherwise. - """ - settings_path = _get_global_settings_path() - if not settings_path.exists(): - return None - - try: - with open(settings_path, "r", encoding="utf-8") as f: - data = json.load(f) - if data.get("locked"): - return { - "backend": data.get("backend"), - "model": data.get("model"), - "locked_at": data.get("locked_at"), - } - except (json.JSONDecodeError, OSError): - pass - - return None - - -def set_locked_model_config(backend: str, model: str) -> None: - """Set the globally locked embedding model configuration. - - This is called after the first successful embedding generation - to lock the model for all future operations. - - Args: - backend: Embedding backend (fastembed or litellm) - model: Model profile/name - """ - import datetime - - settings_path = _get_global_settings_path() - settings_path.parent.mkdir(parents=True, exist_ok=True) - - data = { - "locked": True, - "backend": backend, - "model": model, - "locked_at": datetime.datetime.now().isoformat(), - } - - with open(settings_path, "w", encoding="utf-8") as f: - json.dump(data, f, indent=2) - - -def clear_locked_model_config() -> bool: - """Clear the globally locked embedding model configuration. - - Returns: - True if lock was cleared, False if no lock existed. 
- """ - settings_path = _get_global_settings_path() - if settings_path.exists(): - settings_path.unlink() - return True - return False - - -def check_global_model_lock( - target_backend: str, - target_model: str, -) -> Dict[str, Any]: - """Check if the target model conflicts with the global lock. - - Args: - target_backend: Requested embedding backend - target_model: Requested model profile/name - - Returns: - Dictionary with: - - is_locked: True if a global lock exists - - has_conflict: True if target differs from locked config - - locked_config: The locked configuration (if any) - - target_config: The requested configuration - """ - locked_config = get_locked_model_config() - - if locked_config is None: - return { - "is_locked": False, - "has_conflict": False, - "locked_config": None, - "target_config": {"backend": target_backend, "model": target_model}, - } - - has_conflict = ( - locked_config["backend"] != target_backend or - locked_config["model"] != target_model - ) - - return { - "is_locked": True, - "has_conflict": has_conflict, - "locked_config": locked_config, - "target_config": {"backend": target_backend, "model": target_model}, - } diff --git a/codex-lens/src/codexlens/cli/model_manager.py b/codex-lens/src/codexlens/cli/model_manager.py deleted file mode 100644 index 15776cf1..00000000 --- a/codex-lens/src/codexlens/cli/model_manager.py +++ /dev/null @@ -1,1026 +0,0 @@ -"""Model Manager - Manage fastembed models for semantic search.""" - -import json -import os -import shutil -from pathlib import Path -from typing import Dict, List, Optional - -try: - from huggingface_hub import snapshot_download, list_repo_files - HUGGINGFACE_HUB_AVAILABLE = True -except ImportError: - HUGGINGFACE_HUB_AVAILABLE = False - -try: - from fastembed import TextEmbedding - FASTEMBED_AVAILABLE = True -except ImportError: - FASTEMBED_AVAILABLE = False - -try: - # fastembed >= 0.4.0 moved TextCrossEncoder to rerank.cross_encoder - from fastembed.rerank.cross_encoder import 
TextCrossEncoder - RERANKER_AVAILABLE = True -except ImportError: - try: - # Fallback for older versions - from fastembed import TextCrossEncoder - RERANKER_AVAILABLE = True - except ImportError: - RERANKER_AVAILABLE = False - - -# Reranker model profiles with metadata -# Note: fastembed TextCrossEncoder uses ONNX models from HuggingFace -RERANKER_MODEL_PROFILES = { - "ms-marco-mini": { - "model_name": "Xenova/ms-marco-MiniLM-L-6-v2", - "cache_name": "Xenova/ms-marco-MiniLM-L-6-v2", - "size_mb": 90, - "description": "Fast, lightweight reranker (default)", - "use_case": "Quick prototyping, resource-constrained environments", - "recommended": True, - }, - "ms-marco-12": { - "model_name": "Xenova/ms-marco-MiniLM-L-12-v2", - "cache_name": "Xenova/ms-marco-MiniLM-L-12-v2", - "size_mb": 130, - "description": "Better quality, 12-layer MiniLM", - "use_case": "General purpose reranking with better accuracy", - "recommended": True, - }, - "bge-base": { - "model_name": "BAAI/bge-reranker-base", - "cache_name": "BAAI/bge-reranker-base", - "size_mb": 280, - "description": "BGE reranker base model", - "use_case": "High-quality reranking for production", - "recommended": True, - }, - "bge-large": { - "model_name": "BAAI/bge-reranker-large", - "cache_name": "BAAI/bge-reranker-large", - "size_mb": 560, - "description": "BGE reranker large model (high resource usage)", - "use_case": "Maximum quality reranking", - "recommended": False, - }, - "jina-tiny": { - "model_name": "jinaai/jina-reranker-v1-tiny-en", - "cache_name": "jinaai/jina-reranker-v1-tiny-en", - "size_mb": 70, - "description": "Jina tiny reranker, very fast", - "use_case": "Ultra-low latency applications", - "recommended": True, - }, - "jina-turbo": { - "model_name": "jinaai/jina-reranker-v1-turbo-en", - "cache_name": "jinaai/jina-reranker-v1-turbo-en", - "size_mb": 150, - "description": "Jina turbo reranker, balanced", - "use_case": "Fast reranking with good accuracy", - "recommended": True, - }, - # Additional 
reranker models (commonly used) - "bge-reranker-v2-m3": { - "model_name": "BAAI/bge-reranker-v2-m3", - "cache_name": "BAAI/bge-reranker-v2-m3", - "size_mb": 560, - "description": "BGE v2 M3 reranker, multilingual", - "use_case": "Multilingual reranking, latest BGE version", - "recommended": True, - }, - "bge-reranker-v2-gemma": { - "model_name": "BAAI/bge-reranker-v2-gemma", - "cache_name": "BAAI/bge-reranker-v2-gemma", - "size_mb": 2000, - "description": "BGE v2 Gemma reranker, best quality", - "use_case": "Maximum quality with Gemma backbone", - "recommended": False, - }, - "cross-encoder-ms-marco": { - "model_name": "cross-encoder/ms-marco-MiniLM-L-6-v2", - "cache_name": "cross-encoder/ms-marco-MiniLM-L-6-v2", - "size_mb": 90, - "description": "Original cross-encoder MS MARCO", - "use_case": "Classic cross-encoder baseline", - "recommended": False, - }, -} - - -# Model profiles with metadata -# Note: 768d is max recommended dimension for optimal performance/quality balance -# 1024d models are available but not recommended due to higher resource usage -# cache_name: The actual Hugging Face repo name used by fastembed for ONNX caching -MODEL_PROFILES = { - "fast": { - "model_name": "BAAI/bge-small-en-v1.5", - "cache_name": "qdrant/bge-small-en-v1.5-onnx-q", # fastembed uses ONNX version - "dimensions": 384, - "size_mb": 80, - "description": "Fast, lightweight, English-optimized", - "use_case": "Quick prototyping, resource-constrained environments", - "recommended": True, - }, - "base": { - "model_name": "BAAI/bge-base-en-v1.5", - "cache_name": "qdrant/bge-base-en-v1.5-onnx-q", # fastembed uses ONNX version - "dimensions": 768, - "size_mb": 220, - "description": "General purpose, good balance of speed and quality", - "use_case": "General text search, documentation", - "recommended": True, - }, - "code": { - "model_name": "jinaai/jina-embeddings-v2-base-code", - "cache_name": "jinaai/jina-embeddings-v2-base-code", # Uses original name - "dimensions": 768, - 
"size_mb": 150, - "description": "Code-optimized, best for programming languages", - "use_case": "Open source projects, code semantic search", - "recommended": True, - }, - "minilm": { - "model_name": "sentence-transformers/all-MiniLM-L6-v2", - "cache_name": "qdrant/all-MiniLM-L6-v2-onnx", # fastembed uses ONNX version - "dimensions": 384, - "size_mb": 90, - "description": "Popular lightweight model, good quality", - "use_case": "General purpose, low resource environments", - "recommended": True, - }, - "multilingual": { - "model_name": "intfloat/multilingual-e5-large", - "cache_name": "qdrant/multilingual-e5-large-onnx", # fastembed uses ONNX version - "dimensions": 1024, - "size_mb": 1000, - "description": "Multilingual + code support (high resource usage)", - "use_case": "Enterprise multilingual projects", - "recommended": False, # 1024d not recommended - }, - "balanced": { - "model_name": "mixedbread-ai/mxbai-embed-large-v1", - "cache_name": "mixedbread-ai/mxbai-embed-large-v1", # Uses original name - "dimensions": 1024, - "size_mb": 600, - "description": "High accuracy, general purpose (high resource usage)", - "use_case": "High-quality semantic search, balanced performance", - "recommended": False, # 1024d not recommended - }, - # Additional embedding models (commonly used) - "bge-large": { - "model_name": "BAAI/bge-large-en-v1.5", - "cache_name": "qdrant/bge-large-en-v1.5-onnx-q", - "dimensions": 1024, - "size_mb": 650, - "description": "BGE large model, highest quality", - "use_case": "Maximum quality semantic search", - "recommended": False, - }, - "e5-small": { - "model_name": "intfloat/e5-small-v2", - "cache_name": "qdrant/e5-small-v2-onnx", - "dimensions": 384, - "size_mb": 80, - "description": "E5 small model, fast and lightweight", - "use_case": "Low latency applications", - "recommended": True, - }, - "e5-base": { - "model_name": "intfloat/e5-base-v2", - "cache_name": "qdrant/e5-base-v2-onnx", - "dimensions": 768, - "size_mb": 220, - "description": 
"E5 base model, balanced", - "use_case": "General purpose semantic search", - "recommended": True, - }, - "e5-large": { - "model_name": "intfloat/e5-large-v2", - "cache_name": "qdrant/e5-large-v2-onnx", - "dimensions": 1024, - "size_mb": 650, - "description": "E5 large model, high quality", - "use_case": "High quality semantic search", - "recommended": False, - }, - "jina-base-en": { - "model_name": "jinaai/jina-embeddings-v2-base-en", - "cache_name": "jinaai/jina-embeddings-v2-base-en", - "dimensions": 768, - "size_mb": 150, - "description": "Jina base English model", - "use_case": "English text semantic search", - "recommended": True, - }, - "jina-small-en": { - "model_name": "jinaai/jina-embeddings-v2-small-en", - "cache_name": "jinaai/jina-embeddings-v2-small-en", - "dimensions": 512, - "size_mb": 60, - "description": "Jina small English model, very fast", - "use_case": "Low latency English text search", - "recommended": True, - }, - "snowflake-arctic": { - "model_name": "Snowflake/snowflake-arctic-embed-m", - "cache_name": "Snowflake/snowflake-arctic-embed-m", - "dimensions": 768, - "size_mb": 220, - "description": "Snowflake Arctic embedding model", - "use_case": "Enterprise semantic search, high quality", - "recommended": True, - }, - "nomic-embed": { - "model_name": "nomic-ai/nomic-embed-text-v1.5", - "cache_name": "nomic-ai/nomic-embed-text-v1.5", - "dimensions": 768, - "size_mb": 280, - "description": "Nomic embedding model, open source", - "use_case": "Open source text embedding", - "recommended": True, - }, - "gte-small": { - "model_name": "thenlper/gte-small", - "cache_name": "thenlper/gte-small", - "dimensions": 384, - "size_mb": 70, - "description": "GTE small model, fast", - "use_case": "Fast text embedding", - "recommended": True, - }, - "gte-base": { - "model_name": "thenlper/gte-base", - "cache_name": "thenlper/gte-base", - "dimensions": 768, - "size_mb": 220, - "description": "GTE base model, balanced", - "use_case": "General purpose text 
embedding", - "recommended": True, - }, - "gte-large": { - "model_name": "thenlper/gte-large", - "cache_name": "thenlper/gte-large", - "dimensions": 1024, - "size_mb": 650, - "description": "GTE large model, high quality", - "use_case": "High quality text embedding", - "recommended": False, - }, -} - - -def get_cache_dir() -> Path: - """Get fastembed cache directory. - - Returns: - Path to cache directory (~/.cache/huggingface or custom path) - """ - # Check HF_HOME environment variable first - if "HF_HOME" in os.environ: - return Path(os.environ["HF_HOME"]) - - # fastembed 0.7.4+ uses HuggingFace cache when cache_dir is specified - # Models are stored directly under the cache directory - return Path.home() / ".cache" / "huggingface" - - -def _get_model_cache_path(cache_dir: Path, info: Dict) -> Path: - """Get the actual cache path for a model. - - fastembed 0.7.4+ uses HuggingFace Hub's naming convention: - - Prefix: 'models--' - - Replace '/' with '--' in model name - Example: jinaai/jina-embeddings-v2-base-code - -> models--jinaai--jina-embeddings-v2-base-code - - Args: - cache_dir: The fastembed cache directory (HuggingFace hub path) - info: Model profile info dictionary - - Returns: - Path to the model cache directory - """ - # HuggingFace Hub naming: models--{org}--{model} - # Use cache_name if available (for mapped ONNX models), else model_name - target_name = info.get("cache_name", info["model_name"]) - sanitized_name = f"models--{target_name.replace('/', '--')}" - return cache_dir / sanitized_name - - -def scan_discovered_models(model_type: str = "embedding") -> List[Dict]: - """Scan cache directory for manually placed models not in predefined profiles. - - This allows users to manually download models (e.g., via huggingface-cli or - by copying the model directory) and have them recognized automatically. 
- - Args: - model_type: Type of models to scan for ("embedding" or "reranker") - - Returns: - List of discovered model info dictionaries - """ - cache_dir = get_cache_dir() - if not cache_dir.exists(): - return [] - - # Get known model cache names based on type - if model_type == "reranker": - known_cache_names = { - f"models--{info.get('cache_name', info['model_name']).replace('/', '--')}" - for info in RERANKER_MODEL_PROFILES.values() - } - else: - known_cache_names = { - f"models--{info.get('cache_name', info['model_name']).replace('/', '--')}" - for info in MODEL_PROFILES.values() - } - - discovered = [] - - # Scan for model directories in cache - for item in cache_dir.iterdir(): - if not item.is_dir() or not item.name.startswith("models--"): - continue - - # Skip known predefined models - if item.name in known_cache_names: - continue - - # Parse model name from directory (models--org--model -> org/model) - parts = item.name[8:].split("--") # Remove "models--" prefix - if len(parts) >= 2: - model_name = "/".join(parts) - else: - model_name = parts[0] if parts else item.name - - # Detect model type by checking for common patterns - is_reranker = any(keyword in model_name.lower() for keyword in [ - "reranker", "cross-encoder", "ms-marco" - ]) - is_embedding = any(keyword in model_name.lower() for keyword in [ - "embed", "bge", "e5", "jina", "minilm", "gte", "nomic", "arctic" - ]) - - # Filter based on requested type - if model_type == "reranker" and not is_reranker: - continue - if model_type == "embedding" and is_reranker: - continue - - # Calculate cache size - try: - total_size = sum( - f.stat().st_size - for f in item.rglob("*") - if f.is_file() - ) - cache_size_mb = round(total_size / (1024 * 1024), 1) - except (OSError, PermissionError): - cache_size_mb = 0 - - discovered.append({ - "profile": f"discovered:{model_name.replace('/', '-')}", - "model_name": model_name, - "cache_name": model_name, - "cache_path": str(item), - "actual_size_mb": cache_size_mb, - 
"description": f"Manually discovered model", - "use_case": "User-provided model", - "installed": True, - "source": "discovered", # Mark as discovered - }) - - return discovered - - -def list_models() -> Dict[str, any]: - """List available model profiles and their installation status. - - Returns: - Dictionary with model profiles, installed status, and cache info - """ - if not FASTEMBED_AVAILABLE: - return { - "success": False, - "error": "fastembed not installed. Install with: pip install codexlens[semantic]", - } - - cache_dir = get_cache_dir() - cache_exists = cache_dir.exists() - - models = [] - for profile, info in MODEL_PROFILES.items(): - model_name = info["model_name"] - - # Check if model is cached using the actual cache name - installed = False - cache_size_mb = 0 - - if cache_exists: - # Check for model directory in cache using correct cache_name - model_cache_path = _get_model_cache_path(cache_dir, info) - if model_cache_path.exists(): - installed = True - # Calculate cache size - total_size = sum( - f.stat().st_size - for f in model_cache_path.rglob("*") - if f.is_file() - ) - cache_size_mb = round(total_size / (1024 * 1024), 1) - - models.append({ - "profile": profile, - "model_name": model_name, - "dimensions": info["dimensions"], - "estimated_size_mb": info["size_mb"], - "actual_size_mb": cache_size_mb if installed else None, - "description": info["description"], - "use_case": info["use_case"], - "installed": installed, - "source": "predefined", # Mark as predefined - "recommended": info.get("recommended", True), - }) - - # Add discovered models (manually placed by user) - discovered = scan_discovered_models(model_type="embedding") - for model in discovered: - # Try to estimate dimensions based on common model patterns - dimensions = 768 # Default - name_lower = model["model_name"].lower() - if "small" in name_lower or "mini" in name_lower: - dimensions = 384 - elif "large" in name_lower: - dimensions = 1024 - - model["dimensions"] = dimensions - 
model["estimated_size_mb"] = model.get("actual_size_mb", 0) - model["recommended"] = False # User-provided models are not recommended by default - models.append(model) - - return { - "success": True, - "result": { - "models": models, - "cache_dir": str(cache_dir), - "cache_exists": cache_exists, - "manual_install_guide": { - "steps": [ - "1. Download: huggingface-cli download /", - "2. Or copy to cache directory (see paths below)", - "3. Refresh to see discovered models" - ], - "example": "huggingface-cli download BAAI/bge-small-en-v1.5", - "paths": { - "windows": "%USERPROFILE%\\.cache\\huggingface\\models----", - "linux": "~/.cache/huggingface/models----", - "macos": "~/.cache/huggingface/models----", - }, - }, - }, - } - - -def download_model(profile: str, progress_callback: Optional[callable] = None) -> Dict[str, any]: - """Download a model by profile name. - - Args: - profile: Model profile name (fast, code, multilingual, balanced) - progress_callback: Optional callback function to report progress - - Returns: - Result dictionary with success status - """ - if not FASTEMBED_AVAILABLE: - return { - "success": False, - "error": "fastembed not installed. Install with: pip install codexlens[semantic]", - } - - if profile not in MODEL_PROFILES: - return { - "success": False, - "error": f"Unknown profile: {profile}. 
Available: {', '.join(MODEL_PROFILES.keys())}", - } - - info = MODEL_PROFILES[profile] - model_name = info["model_name"] - - try: - # Get cache directory - cache_dir = get_cache_dir() - - # Download model by instantiating TextEmbedding with explicit cache_dir - # This ensures fastembed uses the correct HuggingFace Hub cache location - if progress_callback: - progress_callback(f"Downloading {model_name}...") - - # CRITICAL: Must specify cache_dir to use HuggingFace cache - # and call embed() to trigger actual download - embedder = TextEmbedding(model_name=model_name, cache_dir=str(cache_dir)) - - # Trigger actual download by calling embed - # TextEmbedding.__init__ alone doesn't download files - if progress_callback: - progress_callback(f"Initializing {model_name}...") - - list(embedder.embed(["test"])) # Trigger download - - if progress_callback: - progress_callback(f"Model {model_name} downloaded successfully") - - # Get cache info using correct HuggingFace Hub path - model_cache_path = _get_model_cache_path(cache_dir, info) - - cache_size = 0 - if model_cache_path.exists(): - total_size = sum( - f.stat().st_size - for f in model_cache_path.rglob("*") - if f.is_file() - ) - cache_size = round(total_size / (1024 * 1024), 1) - - return { - "success": True, - "result": { - "profile": profile, - "model_name": model_name, - "cache_size_mb": cache_size, - "cache_path": str(model_cache_path), - }, - } - - except Exception as e: - return { - "success": False, - "error": f"Failed to download model: {str(e)}", - } - - -def download_custom_model(model_name: str, model_type: str = "embedding", progress_callback: Optional[callable] = None) -> Dict[str, any]: - """Download a custom model by HuggingFace model name. - - This allows users to download any HuggingFace model directly from - HuggingFace Hub. The model will be placed in the standard cache - directory where it can be discovered by scan_discovered_models(). 
- - Note: Downloaded models may not be directly usable by FastEmbed unless - they are in ONNX format. This function is primarily for downloading - models that users want to use with other frameworks or custom code. - - Args: - model_name: Full HuggingFace model name (e.g., "intfloat/e5-small-v2") - model_type: Type of model ("embedding" or "reranker") - for metadata only - progress_callback: Optional callback function to report progress - - Returns: - Result dictionary with success status - """ - if not HUGGINGFACE_HUB_AVAILABLE: - return { - "success": False, - "error": "huggingface_hub not installed. Install with: pip install huggingface_hub", - } - - # Validate model name format (org/model-name) - if not model_name or "/" not in model_name: - return { - "success": False, - "error": "Invalid model name format. Expected: 'org/model-name' (e.g., 'intfloat/e5-small-v2')", - } - - try: - cache_dir = get_cache_dir() - - if progress_callback: - progress_callback(f"Checking model format for {model_name}...") - - # Check if model contains ONNX files before downloading - try: - files = list_repo_files(repo_id=model_name) - has_onnx = any( - f.endswith('.onnx') or - f.startswith('onnx/') or - '/onnx/' in f or - f == 'model.onnx' - for f in files - ) - - if not has_onnx: - return { - "success": False, - "error": f"Model '{model_name}' does not contain ONNX files. " - f"FastEmbed requires ONNX-format models. " - f"Try Xenova/* versions or check the recommended models list.", - "files_found": len(files), - "suggestion": "Use models from the 'Recommended Models' list, or search for ONNX versions (e.g., Xenova/*).", - } - - if progress_callback: - progress_callback(f"ONNX format detected. 
Downloading {model_name}...") - - except Exception as check_err: - # If we can't check, warn but allow download - if progress_callback: - progress_callback(f"Could not verify format, proceeding with download...") - - # Use huggingface_hub to download the model - # This downloads to the standard HuggingFace cache directory - local_path = snapshot_download( - repo_id=model_name, - cache_dir=str(cache_dir), - ) - - if progress_callback: - progress_callback(f"Model {model_name} downloaded successfully") - - # Get cache info - sanitized_name = f"models--{model_name.replace('/', '--')}" - model_cache_path = cache_dir / sanitized_name - - cache_size = 0 - if model_cache_path.exists(): - total_size = sum( - f.stat().st_size - for f in model_cache_path.rglob("*") - if f.is_file() - ) - cache_size = round(total_size / (1024 * 1024), 1) - - return { - "success": True, - "result": { - "model_name": model_name, - "model_type": model_type, - "cache_size_mb": cache_size, - "cache_path": str(model_cache_path), - "local_path": local_path, - "note": "Model downloaded. Note: Only ONNX-format models are compatible with FastEmbed.", - }, - } - - except Exception as e: - return { - "success": False, - "error": f"Failed to download custom model: {str(e)}", - } - - -def delete_model(profile: str) -> Dict[str, any]: - """Delete a downloaded model from cache. - - Args: - profile: Model profile name to delete - - Returns: - Result dictionary with success status - """ - if profile not in MODEL_PROFILES: - return { - "success": False, - "error": f"Unknown profile: {profile}. 
Available: {', '.join(MODEL_PROFILES.keys())}", - } - - info = MODEL_PROFILES[profile] - model_name = info["model_name"] - cache_dir = get_cache_dir() - model_cache_path = _get_model_cache_path(cache_dir, info) - - if not model_cache_path.exists(): - return { - "success": False, - "error": f"Model {profile} ({model_name}) is not installed", - } - - try: - # Calculate size before deletion - total_size = sum( - f.stat().st_size - for f in model_cache_path.rglob("*") - if f.is_file() - ) - size_mb = round(total_size / (1024 * 1024), 1) - - # Delete model directory - shutil.rmtree(model_cache_path) - - return { - "success": True, - "result": { - "profile": profile, - "model_name": model_name, - "deleted_size_mb": size_mb, - "cache_path": str(model_cache_path), - }, - } - - except Exception as e: - return { - "success": False, - "error": f"Failed to delete model: {str(e)}", - } - - -def get_model_info(profile: str) -> Dict[str, any]: - """Get detailed information about a model profile. - - Args: - profile: Model profile name - - Returns: - Result dictionary with model information - """ - if profile not in MODEL_PROFILES: - return { - "success": False, - "error": f"Unknown profile: {profile}. 
Available: {', '.join(MODEL_PROFILES.keys())}", - } - - info = MODEL_PROFILES[profile] - model_name = info["model_name"] - - # Check installation status using correct cache_name - cache_dir = get_cache_dir() - model_cache_path = _get_model_cache_path(cache_dir, info) - installed = model_cache_path.exists() - - cache_size_mb = None - if installed: - total_size = sum( - f.stat().st_size - for f in model_cache_path.rglob("*") - if f.is_file() - ) - cache_size_mb = round(total_size / (1024 * 1024), 1) - - return { - "success": True, - "result": { - "profile": profile, - "model_name": model_name, - "dimensions": info["dimensions"], - "estimated_size_mb": info["size_mb"], - "actual_size_mb": cache_size_mb, - "description": info["description"], - "use_case": info["use_case"], - "installed": installed, - "cache_path": str(model_cache_path) if installed else None, - }, - } - - -# ============================================================================ -# Reranker Model Management Functions -# ============================================================================ - - -def list_reranker_models() -> Dict[str, any]: - """List available reranker model profiles and their installation status. - - Returns: - Dictionary with reranker model profiles, installed status, and cache info - """ - if not RERANKER_AVAILABLE: - return { - "success": False, - "error": "fastembed reranker not available. 
Install with: pip install fastembed>=0.4.0", - } - - cache_dir = get_cache_dir() - cache_exists = cache_dir.exists() - - models = [] - for profile, info in RERANKER_MODEL_PROFILES.items(): - model_name = info["model_name"] - - # Check if model is cached - installed = False - cache_size_mb = 0 - - if cache_exists: - model_cache_path = _get_model_cache_path(cache_dir, info) - if model_cache_path.exists(): - installed = True - total_size = sum( - f.stat().st_size - for f in model_cache_path.rglob("*") - if f.is_file() - ) - cache_size_mb = round(total_size / (1024 * 1024), 1) - - models.append({ - "profile": profile, - "model_name": model_name, - "estimated_size_mb": info["size_mb"], - "actual_size_mb": cache_size_mb if installed else None, - "description": info["description"], - "use_case": info["use_case"], - "installed": installed, - "recommended": info.get("recommended", True), - "source": "predefined", # Mark as predefined - }) - - # Add discovered reranker models (manually placed by user) - discovered = scan_discovered_models(model_type="reranker") - for model in discovered: - model["estimated_size_mb"] = model.get("actual_size_mb", 0) - model["recommended"] = False # User-provided models are not recommended by default - models.append(model) - - return { - "success": True, - "result": { - "models": models, - "cache_dir": str(cache_dir), - "cache_exists": cache_exists, - "manual_install_guide": { - "steps": [ - "1. Download: huggingface-cli download /", - "2. Or copy to cache directory (see paths below)", - "3. Refresh to see discovered models", - ], - "example": "huggingface-cli download BAAI/bge-reranker-base", - "paths": { - "windows": "%USERPROFILE%\\.cache\\huggingface\\models----", - "linux": "~/.cache/huggingface/models----", - "macos": "~/.cache/huggingface/models----", - }, - }, - }, - } - - -def download_reranker_model(profile: str, progress_callback: Optional[callable] = None) -> Dict[str, any]: - """Download a reranker model by profile name. 
- - Args: - profile: Reranker model profile name - progress_callback: Optional callback function to report progress - - Returns: - Result dictionary with success status - """ - if not RERANKER_AVAILABLE: - return { - "success": False, - "error": "fastembed reranker not available. Install with: pip install fastembed>=0.4.0", - } - - if profile not in RERANKER_MODEL_PROFILES: - return { - "success": False, - "error": f"Unknown reranker profile: {profile}. Available: {', '.join(RERANKER_MODEL_PROFILES.keys())}", - } - - info = RERANKER_MODEL_PROFILES[profile] - model_name = info["model_name"] - - try: - cache_dir = get_cache_dir() - - if progress_callback: - progress_callback(f"Downloading reranker {model_name}...") - - # Download model by instantiating TextCrossEncoder with explicit cache_dir - reranker = TextCrossEncoder(model_name=model_name, cache_dir=str(cache_dir)) - - # Trigger actual download by calling rerank - if progress_callback: - progress_callback(f"Initializing {model_name}...") - - list(reranker.rerank("test query", ["test document"])) - - if progress_callback: - progress_callback(f"Reranker {model_name} downloaded successfully") - - # Get cache info - model_cache_path = _get_model_cache_path(cache_dir, info) - - cache_size = 0 - if model_cache_path.exists(): - total_size = sum( - f.stat().st_size - for f in model_cache_path.rglob("*") - if f.is_file() - ) - cache_size = round(total_size / (1024 * 1024), 1) - - return { - "success": True, - "result": { - "profile": profile, - "model_name": model_name, - "cache_size_mb": cache_size, - "cache_path": str(model_cache_path), - }, - } - - except Exception as e: - return { - "success": False, - "error": f"Failed to download reranker model: {str(e)}", - } - - -def delete_reranker_model(profile: str) -> Dict[str, any]: - """Delete a downloaded reranker model from cache. 
- - Args: - profile: Reranker model profile name to delete - - Returns: - Result dictionary with success status - """ - if profile not in RERANKER_MODEL_PROFILES: - return { - "success": False, - "error": f"Unknown reranker profile: {profile}. Available: {', '.join(RERANKER_MODEL_PROFILES.keys())}", - } - - info = RERANKER_MODEL_PROFILES[profile] - model_name = info["model_name"] - cache_dir = get_cache_dir() - model_cache_path = _get_model_cache_path(cache_dir, info) - - if not model_cache_path.exists(): - return { - "success": False, - "error": f"Reranker model {profile} ({model_name}) is not installed", - } - - try: - total_size = sum( - f.stat().st_size - for f in model_cache_path.rglob("*") - if f.is_file() - ) - size_mb = round(total_size / (1024 * 1024), 1) - - shutil.rmtree(model_cache_path) - - return { - "success": True, - "result": { - "profile": profile, - "model_name": model_name, - "deleted_size_mb": size_mb, - "cache_path": str(model_cache_path), - }, - } - - except Exception as e: - return { - "success": False, - "error": f"Failed to delete reranker model: {str(e)}", - } - - -def get_reranker_model_info(profile: str) -> Dict[str, any]: - """Get detailed information about a reranker model profile. - - Args: - profile: Reranker model profile name - - Returns: - Result dictionary with model information - """ - if profile not in RERANKER_MODEL_PROFILES: - return { - "success": False, - "error": f"Unknown reranker profile: {profile}. 
Available: {', '.join(RERANKER_MODEL_PROFILES.keys())}", - } - - info = RERANKER_MODEL_PROFILES[profile] - model_name = info["model_name"] - - cache_dir = get_cache_dir() - model_cache_path = _get_model_cache_path(cache_dir, info) - installed = model_cache_path.exists() - - cache_size_mb = None - if installed: - total_size = sum( - f.stat().st_size - for f in model_cache_path.rglob("*") - if f.is_file() - ) - cache_size_mb = round(total_size / (1024 * 1024), 1) - - return { - "success": True, - "result": { - "profile": profile, - "model_name": model_name, - "estimated_size_mb": info["size_mb"], - "actual_size_mb": cache_size_mb, - "description": info["description"], - "use_case": info["use_case"], - "installed": installed, - "recommended": info.get("recommended", True), - "cache_path": str(model_cache_path) if installed else None, - }, - } diff --git a/codex-lens/src/codexlens/cli/output.py b/codex-lens/src/codexlens/cli/output.py deleted file mode 100644 index 1abfb4d2..00000000 --- a/codex-lens/src/codexlens/cli/output.py +++ /dev/null @@ -1,135 +0,0 @@ -"""Rich and JSON output helpers for CodexLens CLI.""" - -from __future__ import annotations - -import json -import sys -from dataclasses import asdict, is_dataclass -from pathlib import Path -from typing import Any, Iterable, Mapping, Sequence - -from rich.console import Console -from rich.table import Table -from rich.text import Text - -from codexlens.entities import SearchResult, Symbol - -# Force UTF-8 encoding for Windows console to properly display Chinese text -# Use force_terminal=True and legacy_windows=False to avoid GBK encoding issues -console = Console(force_terminal=True, legacy_windows=False) - - -def _to_jsonable(value: Any) -> Any: - if value is None: - return None - if hasattr(value, "model_dump"): - return value.model_dump() - if is_dataclass(value): - return asdict(value) - if isinstance(value, Path): - return str(value) - if isinstance(value, Mapping): - return {k: _to_jsonable(v) for k, v in 
value.items()} - if isinstance(value, (list, tuple, set)): - return [_to_jsonable(v) for v in value] - return value - - -def print_json(*, success: bool, result: Any = None, error: str | None = None, **kwargs: Any) -> None: - """Print JSON output with optional additional fields. - - Args: - success: Whether the operation succeeded - result: Result data (used when success=True) - error: Error message (used when success=False) - **kwargs: Additional fields to include in the payload (e.g., code, details) - """ - payload: dict[str, Any] = {"success": success} - if success: - payload["result"] = _to_jsonable(result) - else: - payload["error"] = error or "Unknown error" - # Include additional error details if provided - for key, value in kwargs.items(): - payload[key] = _to_jsonable(value) - console.print_json(json.dumps(payload, ensure_ascii=False)) - - -def render_search_results( - results: Sequence[SearchResult], *, title: str = "Search Results", verbose: bool = False -) -> None: - """Render search results with optional source tags in verbose mode. 
- - Args: - results: Search results to display - title: Table title - verbose: If True, show search source tags ([E], [F], [V]) and fusion scores - """ - table = Table(title=title, show_lines=False) - - if verbose: - # Verbose mode: show source tags - table.add_column("Source", style="dim", width=6, justify="center") - - table.add_column("Path", style="cyan", no_wrap=True) - table.add_column("Score", style="magenta", justify="right") - table.add_column("Excerpt", style="white") - - for res in results: - excerpt = res.excerpt or "" - score_str = f"{res.score:.3f}" - - if verbose: - # Extract search source tag if available - source = getattr(res, "search_source", None) - source_tag = "" - if source == "exact": - source_tag = "[E]" - elif source == "fuzzy": - source_tag = "[F]" - elif source == "vector": - source_tag = "[V]" - elif source == "fusion": - source_tag = "[RRF]" - table.add_row(source_tag, res.path, score_str, excerpt) - else: - table.add_row(res.path, score_str, excerpt) - - console.print(table) - - -def render_symbols(symbols: Sequence[Symbol], *, title: str = "Symbols") -> None: - table = Table(title=title) - table.add_column("Name", style="green") - table.add_column("Kind", style="yellow") - table.add_column("Range", style="white", justify="right") - - for sym in symbols: - start, end = sym.range - table.add_row(sym.name, sym.kind, f"{start}-{end}") - - console.print(table) - - -def render_status(stats: Mapping[str, Any]) -> None: - table = Table(title="Index Status") - table.add_column("Metric", style="cyan") - table.add_column("Value", style="white") - - for key, value in stats.items(): - if isinstance(value, Mapping): - value_text = ", ".join(f"{k}:{v}" for k, v in value.items()) - elif isinstance(value, (list, tuple)): - value_text = ", ".join(str(v) for v in value) - else: - value_text = str(value) - table.add_row(str(key), value_text) - - console.print(table) - - -def render_file_inspect(path: str, language: str, symbols: Iterable[Symbol]) -> 
None: - header = Text.assemble(("File: ", "bold"), (path, "cyan"), (" Language: ", "bold"), (language, "green")) - console.print(header) - render_symbols(list(symbols), title="Discovered Symbols") - diff --git a/codex-lens/src/codexlens/config.py b/codex-lens/src/codexlens/config.py deleted file mode 100644 index 527560f7..00000000 --- a/codex-lens/src/codexlens/config.py +++ /dev/null @@ -1,1164 +0,0 @@ -"""Configuration system for CodexLens.""" - -from __future__ import annotations - -import json -import logging -import os -from dataclasses import dataclass, field -from functools import cached_property -from pathlib import Path -from typing import Any, Dict, List, Optional - -from .errors import ConfigError - - -# Workspace-local directory name -WORKSPACE_DIR_NAME = ".codexlens" - -# Settings file name -SETTINGS_FILE_NAME = "settings.json" - -# Dense vector storage names (centralized storage) -VECTORS_HNSW_NAME = "_vectors.hnsw" -VECTORS_META_DB_NAME = "_vectors_meta.db" -BINARY_VECTORS_MMAP_NAME = "_binary_vectors.mmap" - -log = logging.getLogger(__name__) - - -def _default_global_dir() -> Path: - """Get global CodexLens data directory.""" - env_override = os.getenv("CODEXLENS_DATA_DIR") - if env_override: - return Path(env_override).expanduser().resolve() - return (Path.home() / ".codexlens").resolve() - - -def find_workspace_root(start_path: Path) -> Optional[Path]: - """Find the workspace root by looking for .codexlens directory. - - Searches from start_path upward to find an existing .codexlens directory. - Returns None if not found. 
- """ - current = start_path.resolve() - - # Search up to filesystem root - while current != current.parent: - workspace_dir = current / WORKSPACE_DIR_NAME - if workspace_dir.is_dir(): - return current - current = current.parent - - # Check root as well - workspace_dir = current / WORKSPACE_DIR_NAME - if workspace_dir.is_dir(): - return current - - return None - - -@dataclass -class Config: - """Runtime configuration for CodexLens. - - - data_dir: Base directory for all persistent CodexLens data. - - venv_path: Optional virtualenv used for language tooling. - - supported_languages: Language IDs and their associated file extensions. - - parsing_rules: Per-language parsing and chunking hints. - """ - - data_dir: Path = field(default_factory=_default_global_dir) - venv_path: Path = field(default_factory=lambda: _default_global_dir() / "venv") - supported_languages: Dict[str, Dict[str, Any]] = field( - default_factory=lambda: { - # Source code languages (category: "code") - "python": {"extensions": [".py"], "tree_sitter_language": "python", "category": "code"}, - "javascript": {"extensions": [".js", ".jsx"], "tree_sitter_language": "javascript", "category": "code"}, - "typescript": {"extensions": [".ts", ".tsx"], "tree_sitter_language": "typescript", "category": "code"}, - "java": {"extensions": [".java"], "tree_sitter_language": "java", "category": "code"}, - "go": {"extensions": [".go"], "tree_sitter_language": "go", "category": "code"}, - "zig": {"extensions": [".zig"], "tree_sitter_language": "zig", "category": "code"}, - "objective-c": {"extensions": [".m", ".mm"], "tree_sitter_language": "objc", "category": "code"}, - "swift": {"extensions": [".swift"], "tree_sitter_language": "swift", "category": "code"}, - "c": {"extensions": [".c", ".h"], "tree_sitter_language": "c", "category": "code"}, - "cpp": {"extensions": [".cc", ".cpp", ".hpp", ".cxx"], "tree_sitter_language": "cpp", "category": "code"}, - "rust": {"extensions": [".rs"], "tree_sitter_language": "rust", 
"category": "code"}, - } - ) - parsing_rules: Dict[str, Dict[str, Any]] = field( - default_factory=lambda: { - "default": { - "max_chunk_chars": 4000, - "max_chunk_lines": 200, - "overlap_lines": 20, - } - } - ) - - llm_enabled: bool = False - llm_tool: str = "gemini" - llm_timeout_ms: int = 300000 - llm_batch_size: int = 5 - - # Hybrid chunker configuration - hybrid_max_chunk_size: int = 2000 # Max characters per chunk before LLM refinement - hybrid_llm_refinement: bool = False # Enable LLM-based semantic boundary refinement - - # Embedding configuration - embedding_backend: str = "fastembed" # "fastembed" (local) or "litellm" (API) - embedding_model: str = "code" # For fastembed: profile (fast/code/multilingual/balanced) - # For litellm: model name from config (e.g., "qwen3-embedding") - embedding_use_gpu: bool = True # For fastembed: whether to use GPU acceleration - embedding_auto_embed_missing: bool = True # Auto-build embeddings in background when indexed projects are searched without vectors - - # Indexing/search optimizations - global_symbol_index_enabled: bool = True # Enable project-wide symbol index fast path - enable_merkle_detection: bool = True # Enable content-hash based incremental indexing - ignore_patterns: List[str] = field(default_factory=list) # Additional directory ignore patterns for indexing - extension_filters: List[str] = field(default_factory=list) # Reserved for file-level filtering config - - # Graph expansion (search-time, uses precomputed neighbors) - enable_graph_expansion: bool = False - graph_expansion_depth: int = 2 - - # Optional search reranking (disabled by default) - enable_reranking: bool = False - reranking_top_k: int = 50 - symbol_boost_factor: float = 1.5 - test_file_penalty: float = 0.15 # Penalty for test/fixture paths during final ranking - generated_file_penalty: float = 0.35 # Penalty for generated/build artifact paths during final ranking - - # Optional cross-encoder reranking (second stage; requires optional 
reranker deps) - enable_cross_encoder_rerank: bool = False - reranker_backend: str = "onnx" - reranker_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2" - reranker_use_gpu: bool = True # Whether reranker backends should use GPU acceleration - reranker_top_k: int = 50 - reranker_max_input_tokens: int = 8192 # Maximum tokens for reranker API batching - reranker_chunk_type_weights: Optional[Dict[str, float]] = None # Weights for chunk types: {"code": 1.0, "docstring": 0.7} - reranker_test_file_penalty: float = 0.0 # Penalty for test files (0.0-1.0, e.g., 0.2 = 20% reduction) - - # Chunk stripping configuration (for semantic embedding) - chunk_strip_comments: bool = True # Strip comments from code chunks - chunk_strip_docstrings: bool = True # Strip docstrings from code chunks - - # Cascade search configuration (two-stage retrieval) - enable_cascade_search: bool = False # Enable cascade search (coarse + fine ranking) - cascade_coarse_k: int = 100 # Number of coarse candidates from first stage - cascade_fine_k: int = 10 # Number of final results after reranking - cascade_strategy: str = "binary" # "binary", "binary_rerank" (alias: "hybrid"), "dense_rerank", or "staged" - - # Staged cascade search configuration (4-stage pipeline) - staged_coarse_k: int = 200 # Number of coarse candidates from Stage 1 binary search - staged_lsp_depth: int = 2 # LSP relationship expansion depth in Stage 2 - staged_stage2_mode: str = "precomputed" # "precomputed" (graph_neighbors) | "realtime" (LSP) | "static_global_graph" (global_relationships) - - # Static graph configuration (write relationships to global index during build) - static_graph_enabled: bool = False - static_graph_relationship_types: List[str] = field(default_factory=lambda: ["imports", "inherits"]) - - staged_realtime_lsp_timeout_s: float = 30.0 # Max time budget for realtime LSP expansion - staged_realtime_lsp_depth: int = 1 # BFS depth for realtime LSP expansion - staged_realtime_lsp_max_nodes: int = 50 # Node cap for 
realtime graph expansion - staged_realtime_lsp_max_seeds: int = 1 # Seed cap for realtime graph expansion - staged_realtime_lsp_max_concurrent: int = 2 # Max concurrent LSP requests during graph expansion - staged_realtime_lsp_warmup_s: float = 3.0 # Wait for server analysis after opening seed docs - staged_realtime_lsp_resolve_symbols: bool = False # If True, resolves symbol names via documentSymbol (slower) - staged_clustering_strategy: str = "auto" # "auto", "hdbscan", "dbscan", "frequency", "noop", "score", "dir_rr", "path" - staged_clustering_min_size: int = 3 # Minimum cluster size for Stage 3 grouping - enable_staged_rerank: bool = True # Enable optional cross-encoder reranking in Stage 4 - - # RRF fusion configuration - fusion_method: str = "rrf" # "simple" (weighted sum) or "rrf" (reciprocal rank fusion) - rrf_k: int = 60 # RRF constant (default 60) - - # Category-based filtering to separate code/doc results - enable_category_filter: bool = True # Enable code/doc result separation - - # Multi-endpoint configuration for litellm backend - embedding_endpoints: List[Dict[str, Any]] = field(default_factory=list) - # List of endpoint configs: [{"model": "...", "api_key": "...", "api_base": "...", "weight": 1.0}] - embedding_pool_enabled: bool = False # Enable high availability pool for embeddings - embedding_strategy: str = "latency_aware" # round_robin, latency_aware, weighted_random - embedding_cooldown: float = 60.0 # Default cooldown seconds for rate-limited endpoints - - # Reranker multi-endpoint configuration - reranker_pool_enabled: bool = False # Enable high availability pool for reranker - reranker_strategy: str = "latency_aware" # round_robin, latency_aware, weighted_random - reranker_cooldown: float = 60.0 # Default cooldown seconds for rate-limited endpoints - - # API concurrency settings - api_max_workers: int = 4 # Max concurrent API calls for embedding/reranking - api_batch_size: int = 8 # Batch size for API requests - api_batch_size_dynamic: bool 
= False # Enable dynamic batch size calculation - api_batch_size_utilization_factor: float = 0.8 # Use 80% of model token capacity - api_batch_size_max: int = 2048 # Absolute upper limit for batch size - chars_per_token_estimate: int = 4 # Characters per token estimation ratio - - # Parser configuration - use_astgrep: bool = False # Use ast-grep for relationship extraction (Python/JS/TS); tree-sitter is default - - def __post_init__(self) -> None: - try: - self.data_dir = self.data_dir.expanduser().resolve() - self.venv_path = self.venv_path.expanduser().resolve() - self.data_dir.mkdir(parents=True, exist_ok=True) - except PermissionError as exc: - raise ConfigError( - f"Permission denied initializing paths (data_dir={self.data_dir}, venv_path={self.venv_path}) " - f"[{type(exc).__name__}]: {exc}" - ) from exc - except OSError as exc: - raise ConfigError( - f"Filesystem error initializing paths (data_dir={self.data_dir}, venv_path={self.venv_path}) " - f"[{type(exc).__name__}]: {exc}" - ) from exc - except Exception as exc: - raise ConfigError( - f"Unexpected error initializing paths (data_dir={self.data_dir}, venv_path={self.venv_path}) " - f"[{type(exc).__name__}]: {exc}" - ) from exc - - @cached_property - def cache_dir(self) -> Path: - """Directory for transient caches.""" - return self.data_dir / "cache" - - @cached_property - def index_dir(self) -> Path: - """Directory where index artifacts are stored.""" - return self.data_dir / "index" - - @cached_property - def db_path(self) -> Path: - """Default SQLite index path.""" - return self.index_dir / "codexlens.db" - - def ensure_runtime_dirs(self) -> None: - """Create standard runtime directories if missing.""" - for directory in (self.cache_dir, self.index_dir): - try: - directory.mkdir(parents=True, exist_ok=True) - except PermissionError as exc: - raise ConfigError( - f"Permission denied creating directory {directory} [{type(exc).__name__}]: {exc}" - ) from exc - except OSError as exc: - raise ConfigError( - 
f"Filesystem error creating directory {directory} [{type(exc).__name__}]: {exc}" - ) from exc - except Exception as exc: - raise ConfigError( - f"Unexpected error creating directory {directory} [{type(exc).__name__}]: {exc}" - ) from exc - - def language_for_path(self, path: str | Path) -> str | None: - """Infer a supported language ID from a file path.""" - extension = Path(path).suffix.lower() - for language_id, spec in self.supported_languages.items(): - extensions: List[str] = spec.get("extensions", []) - if extension in extensions: - return language_id - return None - - def category_for_path(self, path: str | Path) -> str | None: - """Get file category ('code' or 'doc') from a file path.""" - language = self.language_for_path(path) - if language is None: - return None - spec = self.supported_languages.get(language, {}) - return spec.get("category") - - def rules_for_language(self, language_id: str) -> Dict[str, Any]: - """Get parsing rules for a specific language, falling back to defaults.""" - return {**self.parsing_rules.get("default", {}), **self.parsing_rules.get(language_id, {})} - - @cached_property - def settings_path(self) -> Path: - """Path to the settings file.""" - return self.data_dir / SETTINGS_FILE_NAME - - def save_settings(self) -> None: - """Save embedding and other settings to file.""" - embedding_config = { - "backend": self.embedding_backend, - "model": self.embedding_model, - "use_gpu": self.embedding_use_gpu, - "auto_embed_missing": self.embedding_auto_embed_missing, - "pool_enabled": self.embedding_pool_enabled, - "strategy": self.embedding_strategy, - "cooldown": self.embedding_cooldown, - } - # Include multi-endpoint config if present - if self.embedding_endpoints: - embedding_config["endpoints"] = self.embedding_endpoints - - settings = { - "embedding": embedding_config, - "llm": { - "enabled": self.llm_enabled, - "tool": self.llm_tool, - "timeout_ms": self.llm_timeout_ms, - "batch_size": self.llm_batch_size, - }, - "parsing": { - # 
Prefer ast-grep processors when available (experimental). - "use_astgrep": self.use_astgrep, - }, - "indexing": { - # Persist global relationship edges during index build for static graph expansion. - "static_graph_enabled": self.static_graph_enabled, - "static_graph_relationship_types": self.static_graph_relationship_types, - }, - "reranker": { - "enabled": self.enable_cross_encoder_rerank, - "backend": self.reranker_backend, - "model": self.reranker_model, - "use_gpu": self.reranker_use_gpu, - "top_k": self.reranker_top_k, - "max_input_tokens": self.reranker_max_input_tokens, - "pool_enabled": self.reranker_pool_enabled, - "strategy": self.reranker_strategy, - "cooldown": self.reranker_cooldown, - }, - "cascade": { - "strategy": self.cascade_strategy, - "coarse_k": self.cascade_coarse_k, - "fine_k": self.cascade_fine_k, - }, - "staged": { - "coarse_k": self.staged_coarse_k, - "lsp_depth": self.staged_lsp_depth, - "stage2_mode": self.staged_stage2_mode, - "realtime_lsp_timeout_s": self.staged_realtime_lsp_timeout_s, - "realtime_lsp_depth": self.staged_realtime_lsp_depth, - "realtime_lsp_max_nodes": self.staged_realtime_lsp_max_nodes, - "realtime_lsp_max_seeds": self.staged_realtime_lsp_max_seeds, - "realtime_lsp_max_concurrent": self.staged_realtime_lsp_max_concurrent, - "realtime_lsp_warmup_s": self.staged_realtime_lsp_warmup_s, - "realtime_lsp_resolve_symbols": self.staged_realtime_lsp_resolve_symbols, - "clustering_strategy": self.staged_clustering_strategy, - "clustering_min_size": self.staged_clustering_min_size, - "enable_rerank": self.enable_staged_rerank, - }, - "api": { - "max_workers": self.api_max_workers, - "batch_size": self.api_batch_size, - "batch_size_dynamic": self.api_batch_size_dynamic, - "batch_size_utilization_factor": self.api_batch_size_utilization_factor, - "batch_size_max": self.api_batch_size_max, - "chars_per_token_estimate": self.chars_per_token_estimate, - }, - "ignore_patterns": self.ignore_patterns, - "extension_filters": 
self.extension_filters, - } - with open(self.settings_path, "w", encoding="utf-8") as f: - json.dump(settings, f, indent=2) - - def load_settings(self) -> None: - """Load settings from file if exists.""" - if self.settings_path.exists(): - try: - with open(self.settings_path, "r", encoding="utf-8") as f: - settings = json.load(f) - - # Load embedding settings - embedding = settings.get("embedding", {}) - if "backend" in embedding: - backend = embedding["backend"] - # Support 'api' as alias for 'litellm' - if backend == "api": - backend = "litellm" - if backend in {"fastembed", "litellm"}: - self.embedding_backend = backend - else: - log.warning( - "Invalid embedding backend in %s: %r (expected 'fastembed' or 'litellm')", - self.settings_path, - embedding["backend"], - ) - if "model" in embedding: - self.embedding_model = embedding["model"] - if "use_gpu" in embedding: - self.embedding_use_gpu = embedding["use_gpu"] - if "auto_embed_missing" in embedding: - self.embedding_auto_embed_missing = embedding["auto_embed_missing"] - - # Load multi-endpoint configuration - if "endpoints" in embedding: - self.embedding_endpoints = embedding["endpoints"] - if "pool_enabled" in embedding: - self.embedding_pool_enabled = embedding["pool_enabled"] - if "strategy" in embedding: - self.embedding_strategy = embedding["strategy"] - if "cooldown" in embedding: - self.embedding_cooldown = embedding["cooldown"] - - # Load LLM settings - llm = settings.get("llm", {}) - if "enabled" in llm: - self.llm_enabled = llm["enabled"] - if "tool" in llm: - self.llm_tool = llm["tool"] - if "timeout_ms" in llm: - self.llm_timeout_ms = llm["timeout_ms"] - if "batch_size" in llm: - self.llm_batch_size = llm["batch_size"] - - # Load reranker settings - reranker = settings.get("reranker", {}) - if "enabled" in reranker: - self.enable_cross_encoder_rerank = reranker["enabled"] - if "backend" in reranker: - backend = reranker["backend"] - if backend in {"fastembed", "onnx", "api", "litellm", "legacy"}: - 
self.reranker_backend = backend - else: - log.warning( - "Invalid reranker backend in %s: %r (expected 'fastembed', 'onnx', 'api', 'litellm', or 'legacy')", - self.settings_path, - backend, - ) - if "model" in reranker: - self.reranker_model = reranker["model"] - if "use_gpu" in reranker: - self.reranker_use_gpu = reranker["use_gpu"] - if "top_k" in reranker: - self.reranker_top_k = reranker["top_k"] - if "max_input_tokens" in reranker: - self.reranker_max_input_tokens = reranker["max_input_tokens"] - if "pool_enabled" in reranker: - self.reranker_pool_enabled = reranker["pool_enabled"] - if "strategy" in reranker: - self.reranker_strategy = reranker["strategy"] - if "cooldown" in reranker: - self.reranker_cooldown = reranker["cooldown"] - - # Load cascade settings - cascade = settings.get("cascade", {}) - if "strategy" in cascade: - raw_strategy = cascade["strategy"] - strategy = str(raw_strategy).strip().lower() - if strategy in {"binary", "binary_rerank", "dense_rerank", "staged"}: - self.cascade_strategy = strategy - elif strategy == "hybrid": - self.cascade_strategy = "binary_rerank" - log.debug("Mapping cascade strategy 'hybrid' -> 'binary_rerank'") - else: - log.warning( - "Invalid cascade strategy in %s: %r (expected 'binary', 'binary_rerank', 'dense_rerank', or 'staged')", - self.settings_path, - raw_strategy, - ) - if "coarse_k" in cascade: - self.cascade_coarse_k = cascade["coarse_k"] - if "fine_k" in cascade: - self.cascade_fine_k = cascade["fine_k"] - - # Load staged cascade settings - staged = settings.get("staged", {}) - if isinstance(staged, dict): - if "coarse_k" in staged: - try: - self.staged_coarse_k = int(staged["coarse_k"]) - except (TypeError, ValueError): - log.warning( - "Invalid staged.coarse_k in %s: %r (expected int)", - self.settings_path, - staged["coarse_k"], - ) - if "lsp_depth" in staged: - try: - self.staged_lsp_depth = int(staged["lsp_depth"]) - except (TypeError, ValueError): - log.warning( - "Invalid staged.lsp_depth in %s: %r 
(expected int)", - self.settings_path, - staged["lsp_depth"], - ) - if "stage2_mode" in staged: - raw_mode = str(staged["stage2_mode"]).strip().lower() - if raw_mode in {"precomputed", "realtime", "static_global_graph"}: - self.staged_stage2_mode = raw_mode - elif raw_mode in {"live"}: - self.staged_stage2_mode = "realtime" - else: - log.warning( - "Invalid staged.stage2_mode in %s: %r " - "(expected 'precomputed', 'realtime', or 'static_global_graph')", - self.settings_path, - staged["stage2_mode"], - ) - - if "realtime_lsp_timeout_s" in staged: - try: - self.staged_realtime_lsp_timeout_s = float( - staged["realtime_lsp_timeout_s"] - ) - except (TypeError, ValueError): - log.warning( - "Invalid staged.realtime_lsp_timeout_s in %s: %r (expected float)", - self.settings_path, - staged["realtime_lsp_timeout_s"], - ) - if "realtime_lsp_depth" in staged: - try: - self.staged_realtime_lsp_depth = int( - staged["realtime_lsp_depth"] - ) - except (TypeError, ValueError): - log.warning( - "Invalid staged.realtime_lsp_depth in %s: %r (expected int)", - self.settings_path, - staged["realtime_lsp_depth"], - ) - if "realtime_lsp_max_nodes" in staged: - try: - self.staged_realtime_lsp_max_nodes = int( - staged["realtime_lsp_max_nodes"] - ) - except (TypeError, ValueError): - log.warning( - "Invalid staged.realtime_lsp_max_nodes in %s: %r (expected int)", - self.settings_path, - staged["realtime_lsp_max_nodes"], - ) - if "realtime_lsp_max_seeds" in staged: - try: - self.staged_realtime_lsp_max_seeds = int( - staged["realtime_lsp_max_seeds"] - ) - except (TypeError, ValueError): - log.warning( - "Invalid staged.realtime_lsp_max_seeds in %s: %r (expected int)", - self.settings_path, - staged["realtime_lsp_max_seeds"], - ) - if "realtime_lsp_max_concurrent" in staged: - try: - self.staged_realtime_lsp_max_concurrent = int( - staged["realtime_lsp_max_concurrent"] - ) - except (TypeError, ValueError): - log.warning( - "Invalid staged.realtime_lsp_max_concurrent in %s: %r (expected 
int)", - self.settings_path, - staged["realtime_lsp_max_concurrent"], - ) - if "realtime_lsp_warmup_s" in staged: - try: - self.staged_realtime_lsp_warmup_s = float( - staged["realtime_lsp_warmup_s"] - ) - except (TypeError, ValueError): - log.warning( - "Invalid staged.realtime_lsp_warmup_s in %s: %r (expected float)", - self.settings_path, - staged["realtime_lsp_warmup_s"], - ) - if "realtime_lsp_resolve_symbols" in staged: - raw = staged["realtime_lsp_resolve_symbols"] - if isinstance(raw, bool): - self.staged_realtime_lsp_resolve_symbols = raw - elif isinstance(raw, (int, float)): - self.staged_realtime_lsp_resolve_symbols = bool(raw) - elif isinstance(raw, str): - self.staged_realtime_lsp_resolve_symbols = ( - raw.strip().lower() in {"true", "1", "yes", "on"} - ) - else: - log.warning( - "Invalid staged.realtime_lsp_resolve_symbols in %s: %r (expected bool)", - self.settings_path, - raw, - ) - - if "clustering_strategy" in staged: - raw_strategy = str(staged["clustering_strategy"]).strip().lower() - allowed = { - "auto", - "hdbscan", - "dbscan", - "frequency", - "noop", - "score", - "dir_rr", - "path", - } - if raw_strategy in allowed: - self.staged_clustering_strategy = raw_strategy - elif raw_strategy in {"none", "off"}: - self.staged_clustering_strategy = "noop" - else: - log.warning( - "Invalid staged.clustering_strategy in %s: %r", - self.settings_path, - staged["clustering_strategy"], - ) - if "clustering_min_size" in staged: - try: - self.staged_clustering_min_size = int( - staged["clustering_min_size"] - ) - except (TypeError, ValueError): - log.warning( - "Invalid staged.clustering_min_size in %s: %r (expected int)", - self.settings_path, - staged["clustering_min_size"], - ) - if "enable_rerank" in staged: - raw = staged["enable_rerank"] - if isinstance(raw, bool): - self.enable_staged_rerank = raw - elif isinstance(raw, (int, float)): - self.enable_staged_rerank = bool(raw) - elif isinstance(raw, str): - self.enable_staged_rerank = ( - 
raw.strip().lower() in {"true", "1", "yes", "on"} - ) - else: - log.warning( - "Invalid staged.enable_rerank in %s: %r (expected bool)", - self.settings_path, - raw, - ) - - # Load parsing settings - parsing = settings.get("parsing", {}) - if isinstance(parsing, dict) and "use_astgrep" in parsing: - self.use_astgrep = bool(parsing["use_astgrep"]) - - # Load indexing settings - indexing = settings.get("indexing", {}) - if isinstance(indexing, dict): - if "static_graph_enabled" in indexing: - self.static_graph_enabled = bool(indexing["static_graph_enabled"]) - if "static_graph_relationship_types" in indexing: - raw_types = indexing["static_graph_relationship_types"] - if isinstance(raw_types, list): - allowed = {"imports", "inherits", "calls"} - cleaned = [] - for item in raw_types: - val = str(item).strip().lower() - if val and val in allowed: - cleaned.append(val) - if cleaned: - self.static_graph_relationship_types = cleaned - else: - log.warning( - "Invalid indexing.static_graph_relationship_types in %s: %r (expected list)", - self.settings_path, - raw_types, - ) - - raw_ignore_patterns = settings.get("ignore_patterns") - if raw_ignore_patterns is not None: - if isinstance(raw_ignore_patterns, list): - self.ignore_patterns = [ - str(item).strip() for item in raw_ignore_patterns - if str(item).strip() - ] - else: - log.warning( - "Invalid ignore_patterns in %s: %r (expected list)", - self.settings_path, - raw_ignore_patterns, - ) - - raw_extension_filters = settings.get("extension_filters") - if raw_extension_filters is not None: - if isinstance(raw_extension_filters, list): - self.extension_filters = [ - str(item).strip() for item in raw_extension_filters - if str(item).strip() - ] - else: - log.warning( - "Invalid extension_filters in %s: %r (expected list)", - self.settings_path, - raw_extension_filters, - ) - - # Load API settings - api = settings.get("api", {}) - if "max_workers" in api: - self.api_max_workers = api["max_workers"] - if "batch_size" in api: - 
self.api_batch_size = api["batch_size"] - if "batch_size_dynamic" in api: - self.api_batch_size_dynamic = api["batch_size_dynamic"] - if "batch_size_utilization_factor" in api: - self.api_batch_size_utilization_factor = api["batch_size_utilization_factor"] - if "batch_size_max" in api: - self.api_batch_size_max = api["batch_size_max"] - if "chars_per_token_estimate" in api: - self.chars_per_token_estimate = api["chars_per_token_estimate"] - except Exception as exc: - log.warning( - "Failed to load settings from %s (%s): %s", - self.settings_path, - type(exc).__name__, - exc, - ) - - # Apply .env overrides (highest priority) - self._apply_env_overrides() - - def _apply_env_overrides(self) -> None: - """Apply environment variable overrides from .env file. - - Priority: default → settings.json → .env (highest) - - Supported variables (with or without CODEXLENS_ prefix): - EMBEDDING_MODEL: Override embedding model/profile - EMBEDDING_BACKEND: Override embedding backend (fastembed/litellm) - EMBEDDING_POOL_ENABLED: Enable embedding high availability pool - EMBEDDING_STRATEGY: Load balance strategy for embedding - EMBEDDING_COOLDOWN: Rate limit cooldown for embedding - RERANKER_MODEL: Override reranker model - RERANKER_BACKEND: Override reranker backend - RERANKER_USE_GPU: Override reranker GPU usage (true/false) - RERANKER_ENABLED: Override reranker enabled state (true/false) - RERANKER_POOL_ENABLED: Enable reranker high availability pool - RERANKER_STRATEGY: Load balance strategy for reranker - RERANKER_COOLDOWN: Rate limit cooldown for reranker - """ - from .env_config import load_env_file - - env_vars = load_env_file(self.data_dir / ".env") - if not env_vars: - return - - def get_env(key: str) -> str | None: - """Get env var with or without CODEXLENS_ prefix.""" - # Check prefixed version first (Dashboard format), then unprefixed - return env_vars.get(f"CODEXLENS_{key}") or env_vars.get(key) - - def _parse_bool(value: str) -> bool: - return value.strip().lower() in 
{"true", "1", "yes", "on"} - - # Cascade overrides - cascade_enabled = get_env("ENABLE_CASCADE_SEARCH") - if cascade_enabled: - self.enable_cascade_search = _parse_bool(cascade_enabled) - log.debug( - "Overriding enable_cascade_search from .env: %s", - self.enable_cascade_search, - ) - - cascade_strategy = get_env("CASCADE_STRATEGY") - if cascade_strategy: - strategy = cascade_strategy.strip().lower() - if strategy in {"binary", "binary_rerank", "dense_rerank", "staged"}: - self.cascade_strategy = strategy - log.debug("Overriding cascade_strategy from .env: %s", self.cascade_strategy) - elif strategy == "hybrid": - self.cascade_strategy = "binary_rerank" - log.debug("Overriding cascade_strategy from .env: %s", self.cascade_strategy) - else: - log.warning("Invalid CASCADE_STRATEGY in .env: %r", cascade_strategy) - - cascade_coarse_k = get_env("CASCADE_COARSE_K") - if cascade_coarse_k: - try: - self.cascade_coarse_k = int(cascade_coarse_k) - log.debug("Overriding cascade_coarse_k from .env: %s", self.cascade_coarse_k) - except ValueError: - log.warning("Invalid CASCADE_COARSE_K in .env: %r", cascade_coarse_k) - - cascade_fine_k = get_env("CASCADE_FINE_K") - if cascade_fine_k: - try: - self.cascade_fine_k = int(cascade_fine_k) - log.debug("Overriding cascade_fine_k from .env: %s", self.cascade_fine_k) - except ValueError: - log.warning("Invalid CASCADE_FINE_K in .env: %r", cascade_fine_k) - - # Embedding overrides - embedding_model = get_env("EMBEDDING_MODEL") - if embedding_model: - self.embedding_model = embedding_model - log.debug("Overriding embedding_model from .env: %s", self.embedding_model) - - embedding_backend = get_env("EMBEDDING_BACKEND") - if embedding_backend: - backend = embedding_backend.lower() - # Support 'api' as alias for 'litellm' - if backend == "api": - backend = "litellm" - if backend in {"fastembed", "litellm"}: - self.embedding_backend = backend - log.debug("Overriding embedding_backend from .env: %s", backend) - else: - log.warning("Invalid 
EMBEDDING_BACKEND in .env: %r", embedding_backend) - - auto_embed_missing = get_env("AUTO_EMBED_MISSING") - if auto_embed_missing: - self.embedding_auto_embed_missing = _parse_bool(auto_embed_missing) - log.debug( - "Overriding embedding_auto_embed_missing from .env: %s", - self.embedding_auto_embed_missing, - ) - - embedding_pool = get_env("EMBEDDING_POOL_ENABLED") - if embedding_pool: - value = embedding_pool.lower() - self.embedding_pool_enabled = value in {"true", "1", "yes", "on"} - log.debug("Overriding embedding_pool_enabled from .env: %s", self.embedding_pool_enabled) - - embedding_strategy = get_env("EMBEDDING_STRATEGY") - if embedding_strategy: - strategy = embedding_strategy.lower() - if strategy in {"round_robin", "latency_aware", "weighted_random"}: - self.embedding_strategy = strategy - log.debug("Overriding embedding_strategy from .env: %s", strategy) - else: - log.warning("Invalid EMBEDDING_STRATEGY in .env: %r", embedding_strategy) - - embedding_cooldown = get_env("EMBEDDING_COOLDOWN") - if embedding_cooldown: - try: - self.embedding_cooldown = float(embedding_cooldown) - log.debug("Overriding embedding_cooldown from .env: %s", self.embedding_cooldown) - except ValueError: - log.warning("Invalid EMBEDDING_COOLDOWN in .env: %r", embedding_cooldown) - - # Reranker overrides - reranker_model = get_env("RERANKER_MODEL") - if reranker_model: - self.reranker_model = reranker_model - log.debug("Overriding reranker_model from .env: %s", self.reranker_model) - - reranker_backend = get_env("RERANKER_BACKEND") - if reranker_backend: - backend = reranker_backend.lower() - if backend in {"fastembed", "onnx", "api", "litellm", "legacy"}: - self.reranker_backend = backend - log.debug("Overriding reranker_backend from .env: %s", backend) - else: - log.warning("Invalid RERANKER_BACKEND in .env: %r", reranker_backend) - - reranker_use_gpu = get_env("RERANKER_USE_GPU") - if reranker_use_gpu: - self.reranker_use_gpu = _parse_bool(reranker_use_gpu) - 
log.debug("Overriding reranker_use_gpu from .env: %s", self.reranker_use_gpu) - - reranker_enabled = get_env("RERANKER_ENABLED") - if reranker_enabled: - value = reranker_enabled.lower() - self.enable_cross_encoder_rerank = value in {"true", "1", "yes", "on"} - log.debug("Overriding reranker_enabled from .env: %s", self.enable_cross_encoder_rerank) - - reranker_pool = get_env("RERANKER_POOL_ENABLED") - if reranker_pool: - value = reranker_pool.lower() - self.reranker_pool_enabled = value in {"true", "1", "yes", "on"} - log.debug("Overriding reranker_pool_enabled from .env: %s", self.reranker_pool_enabled) - - reranker_strategy = get_env("RERANKER_STRATEGY") - if reranker_strategy: - strategy = reranker_strategy.lower() - if strategy in {"round_robin", "latency_aware", "weighted_random"}: - self.reranker_strategy = strategy - log.debug("Overriding reranker_strategy from .env: %s", strategy) - else: - log.warning("Invalid RERANKER_STRATEGY in .env: %r", reranker_strategy) - - reranker_cooldown = get_env("RERANKER_COOLDOWN") - if reranker_cooldown: - try: - self.reranker_cooldown = float(reranker_cooldown) - log.debug("Overriding reranker_cooldown from .env: %s", self.reranker_cooldown) - except ValueError: - log.warning("Invalid RERANKER_COOLDOWN in .env: %r", reranker_cooldown) - - reranker_max_tokens = get_env("RERANKER_MAX_INPUT_TOKENS") - if reranker_max_tokens: - try: - self.reranker_max_input_tokens = int(reranker_max_tokens) - log.debug("Overriding reranker_max_input_tokens from .env: %s", self.reranker_max_input_tokens) - except ValueError: - log.warning("Invalid RERANKER_MAX_INPUT_TOKENS in .env: %r", reranker_max_tokens) - - # Reranker tuning from environment - test_penalty = get_env("RERANKER_TEST_FILE_PENALTY") - if test_penalty: - try: - self.reranker_test_file_penalty = float(test_penalty) - log.debug("Overriding reranker_test_file_penalty from .env: %s", self.reranker_test_file_penalty) - except ValueError: - log.warning("Invalid 
RERANKER_TEST_FILE_PENALTY in .env: %r", test_penalty) - - ranking_test_penalty = get_env("TEST_FILE_PENALTY") - if ranking_test_penalty: - try: - self.test_file_penalty = float(ranking_test_penalty) - log.debug("Overriding test_file_penalty from .env: %s", self.test_file_penalty) - except ValueError: - log.warning("Invalid TEST_FILE_PENALTY in .env: %r", ranking_test_penalty) - - generated_penalty = get_env("GENERATED_FILE_PENALTY") - if generated_penalty: - try: - self.generated_file_penalty = float(generated_penalty) - log.debug( - "Overriding generated_file_penalty from .env: %s", - self.generated_file_penalty, - ) - except ValueError: - log.warning("Invalid GENERATED_FILE_PENALTY in .env: %r", generated_penalty) - - docstring_weight = get_env("RERANKER_DOCSTRING_WEIGHT") - if docstring_weight: - try: - weight = float(docstring_weight) - self.reranker_chunk_type_weights = {"code": 1.0, "docstring": weight} - log.debug("Overriding reranker docstring weight from .env: %s", weight) - except ValueError: - log.warning("Invalid RERANKER_DOCSTRING_WEIGHT in .env: %r", docstring_weight) - - # Chunk stripping from environment - strip_comments = get_env("CHUNK_STRIP_COMMENTS") - if strip_comments: - self.chunk_strip_comments = strip_comments.lower() in ("true", "1", "yes") - log.debug("Overriding chunk_strip_comments from .env: %s", self.chunk_strip_comments) - - strip_docstrings = get_env("CHUNK_STRIP_DOCSTRINGS") - if strip_docstrings: - self.chunk_strip_docstrings = strip_docstrings.lower() in ("true", "1", "yes") - log.debug("Overriding chunk_strip_docstrings from .env: %s", self.chunk_strip_docstrings) - - # Staged cascade overrides - staged_stage2_mode = get_env("STAGED_STAGE2_MODE") - if staged_stage2_mode: - mode = staged_stage2_mode.strip().lower() - if mode in {"precomputed", "realtime", "static_global_graph"}: - self.staged_stage2_mode = mode - log.debug("Overriding staged_stage2_mode from .env: %s", self.staged_stage2_mode) - elif mode in {"live"}: - 
self.staged_stage2_mode = "realtime" - log.debug("Overriding staged_stage2_mode from .env: %s", self.staged_stage2_mode) - else: - log.warning("Invalid STAGED_STAGE2_MODE in .env: %r", staged_stage2_mode) - - staged_clustering_strategy = get_env("STAGED_CLUSTERING_STRATEGY") - if staged_clustering_strategy: - strategy = staged_clustering_strategy.strip().lower() - if strategy in {"auto", "hdbscan", "dbscan", "frequency", "noop", "score", "dir_rr", "path"}: - self.staged_clustering_strategy = strategy - log.debug( - "Overriding staged_clustering_strategy from .env: %s", - self.staged_clustering_strategy, - ) - elif strategy in {"none", "off"}: - self.staged_clustering_strategy = "noop" - log.debug( - "Overriding staged_clustering_strategy from .env: %s", - self.staged_clustering_strategy, - ) - else: - log.warning( - "Invalid STAGED_CLUSTERING_STRATEGY in .env: %r", - staged_clustering_strategy, - ) - - staged_clustering_min_size = get_env("STAGED_CLUSTERING_MIN_SIZE") - if staged_clustering_min_size: - try: - self.staged_clustering_min_size = int(staged_clustering_min_size) - log.debug( - "Overriding staged_clustering_min_size from .env: %s", - self.staged_clustering_min_size, - ) - except ValueError: - log.warning( - "Invalid STAGED_CLUSTERING_MIN_SIZE in .env: %r", - staged_clustering_min_size, - ) - - enable_staged_rerank = get_env("ENABLE_STAGED_RERANK") - if enable_staged_rerank: - self.enable_staged_rerank = _parse_bool(enable_staged_rerank) - log.debug("Overriding enable_staged_rerank from .env: %s", self.enable_staged_rerank) - - rt_timeout = get_env("STAGED_REALTIME_LSP_TIMEOUT_S") - if rt_timeout: - try: - self.staged_realtime_lsp_timeout_s = float(rt_timeout) - log.debug( - "Overriding staged_realtime_lsp_timeout_s from .env: %s", - self.staged_realtime_lsp_timeout_s, - ) - except ValueError: - log.warning("Invalid STAGED_REALTIME_LSP_TIMEOUT_S in .env: %r", rt_timeout) - - rt_depth = get_env("STAGED_REALTIME_LSP_DEPTH") - if rt_depth: - try: - 
self.staged_realtime_lsp_depth = int(rt_depth) - log.debug( - "Overriding staged_realtime_lsp_depth from .env: %s", - self.staged_realtime_lsp_depth, - ) - except ValueError: - log.warning("Invalid STAGED_REALTIME_LSP_DEPTH in .env: %r", rt_depth) - - rt_max_nodes = get_env("STAGED_REALTIME_LSP_MAX_NODES") - if rt_max_nodes: - try: - self.staged_realtime_lsp_max_nodes = int(rt_max_nodes) - log.debug( - "Overriding staged_realtime_lsp_max_nodes from .env: %s", - self.staged_realtime_lsp_max_nodes, - ) - except ValueError: - log.warning("Invalid STAGED_REALTIME_LSP_MAX_NODES in .env: %r", rt_max_nodes) - - rt_max_seeds = get_env("STAGED_REALTIME_LSP_MAX_SEEDS") - if rt_max_seeds: - try: - self.staged_realtime_lsp_max_seeds = int(rt_max_seeds) - log.debug( - "Overriding staged_realtime_lsp_max_seeds from .env: %s", - self.staged_realtime_lsp_max_seeds, - ) - except ValueError: - log.warning("Invalid STAGED_REALTIME_LSP_MAX_SEEDS in .env: %r", rt_max_seeds) - - rt_max_concurrent = get_env("STAGED_REALTIME_LSP_MAX_CONCURRENT") - if rt_max_concurrent: - try: - self.staged_realtime_lsp_max_concurrent = int(rt_max_concurrent) - log.debug( - "Overriding staged_realtime_lsp_max_concurrent from .env: %s", - self.staged_realtime_lsp_max_concurrent, - ) - except ValueError: - log.warning( - "Invalid STAGED_REALTIME_LSP_MAX_CONCURRENT in .env: %r", - rt_max_concurrent, - ) - - rt_warmup = get_env("STAGED_REALTIME_LSP_WARMUP_S") - if rt_warmup: - try: - self.staged_realtime_lsp_warmup_s = float(rt_warmup) - log.debug( - "Overriding staged_realtime_lsp_warmup_s from .env: %s", - self.staged_realtime_lsp_warmup_s, - ) - except ValueError: - log.warning("Invalid STAGED_REALTIME_LSP_WARMUP_S in .env: %r", rt_warmup) - - rt_resolve = get_env("STAGED_REALTIME_LSP_RESOLVE_SYMBOLS") - if rt_resolve: - self.staged_realtime_lsp_resolve_symbols = _parse_bool(rt_resolve) - log.debug( - "Overriding staged_realtime_lsp_resolve_symbols from .env: %s", - self.staged_realtime_lsp_resolve_symbols, 
- ) - - @classmethod - def load(cls) -> "Config": - """Load config with settings from file.""" - config = cls() - config.load_settings() - return config - - -@dataclass -class WorkspaceConfig: - """Workspace-local configuration for CodexLens. - - Stores index data in project/.codexlens/ directory. - """ - - workspace_root: Path - - def __post_init__(self) -> None: - self.workspace_root = Path(self.workspace_root).resolve() - - @property - def codexlens_dir(self) -> Path: - """The .codexlens directory in workspace root.""" - return self.workspace_root / WORKSPACE_DIR_NAME - - @property - def db_path(self) -> Path: - """SQLite index path for this workspace.""" - return self.codexlens_dir / "index.db" - - @property - def cache_dir(self) -> Path: - """Cache directory for this workspace.""" - return self.codexlens_dir / "cache" - - @property - def env_path(self) -> Path: - """Path to workspace .env file.""" - return self.codexlens_dir / ".env" - - def load_env(self, *, override: bool = False) -> int: - """Load .env file and apply to os.environ. - - Args: - override: If True, override existing environment variables - - Returns: - Number of variables applied - """ - from .env_config import apply_workspace_env - return apply_workspace_env(self.workspace_root, override=override) - - def get_api_config(self, prefix: str) -> dict: - """Get API configuration from environment. - - Args: - prefix: Environment variable prefix (e.g., "RERANKER", "EMBEDDING") - - Returns: - Dictionary with api_key, api_base, model, etc. 
- """ - from .env_config import get_api_config - return get_api_config(prefix, workspace_root=self.workspace_root) - - def initialize(self) -> None: - """Create the .codexlens directory structure.""" - try: - self.codexlens_dir.mkdir(parents=True, exist_ok=True) - self.cache_dir.mkdir(parents=True, exist_ok=True) - - # Create .gitignore to exclude cache but keep index - gitignore_path = self.codexlens_dir / ".gitignore" - if not gitignore_path.exists(): - gitignore_path.write_text( - "# CodexLens workspace data\n" - "cache/\n" - "*.log\n" - ".env\n" # Exclude .env from git - ) - except Exception as exc: - raise ConfigError(f"Failed to initialize workspace at {self.codexlens_dir}: {exc}") from exc - - def exists(self) -> bool: - """Check if workspace is already initialized.""" - return self.codexlens_dir.is_dir() and self.db_path.exists() - - @classmethod - def from_path(cls, path: Path) -> Optional["WorkspaceConfig"]: - """Create WorkspaceConfig from a path by finding workspace root. - - Returns None if no workspace found. 
- """ - root = find_workspace_root(path) - if root is None: - return None - return cls(workspace_root=root) - - @classmethod - def create_at(cls, path: Path) -> "WorkspaceConfig": - """Create a new workspace at the given path.""" - config = cls(workspace_root=path) - config.initialize() - return config diff --git a/codex-lens/src/codexlens/entities.py b/codex-lens/src/codexlens/entities.py deleted file mode 100644 index d569cc3e..00000000 --- a/codex-lens/src/codexlens/entities.py +++ /dev/null @@ -1,128 +0,0 @@ -"""Pydantic entity models for CodexLens.""" - -from __future__ import annotations - -import math -from enum import Enum -from typing import Any, Dict, List, Optional, Tuple - -from pydantic import BaseModel, Field, field_validator - - -class Symbol(BaseModel): - """A code symbol discovered in a file.""" - - name: str = Field(..., min_length=1) - kind: str = Field(..., min_length=1) - range: Tuple[int, int] = Field(..., description="(start_line, end_line), 1-based inclusive") - file: Optional[str] = Field(default=None, description="Full path to the file containing this symbol") - - @field_validator("range") - @classmethod - def validate_range(cls, value: Tuple[int, int]) -> Tuple[int, int]: - if len(value) != 2: - raise ValueError("range must be a (start_line, end_line) tuple") - start_line, end_line = value - if start_line < 1 or end_line < 1: - raise ValueError("range lines must be >= 1") - if end_line < start_line: - raise ValueError("end_line must be >= start_line") - return value - - -class SemanticChunk(BaseModel): - """A semantically meaningful chunk of content, optionally embedded.""" - - content: str = Field(..., min_length=1) - embedding: Optional[List[float]] = Field(default=None, description="Vector embedding for semantic search") - metadata: Dict[str, Any] = Field(default_factory=dict) - id: Optional[int] = Field(default=None, description="Database row ID") - file_path: Optional[str] = Field(default=None, description="Source file path") - - 
@field_validator("embedding") - @classmethod - def validate_embedding(cls, value: Optional[List[float]]) -> Optional[List[float]]: - if value is None: - return value - if not value: - raise ValueError("embedding cannot be empty when provided") - norm = math.sqrt(sum(x * x for x in value)) - epsilon = 1e-10 - if norm < epsilon: - raise ValueError("embedding cannot be a zero vector") - return value - - -class IndexedFile(BaseModel): - """An indexed source file with symbols and optional semantic chunks.""" - - path: str = Field(..., min_length=1) - language: str = Field(..., min_length=1) - symbols: List[Symbol] = Field(default_factory=list) - chunks: List[SemanticChunk] = Field(default_factory=list) - relationships: List["CodeRelationship"] = Field(default_factory=list) - - @field_validator("path", "language") - @classmethod - def strip_and_validate_nonempty(cls, value: str) -> str: - cleaned = value.strip() - if not cleaned: - raise ValueError("value cannot be blank") - return cleaned - - -class RelationshipType(str, Enum): - """Types of code relationships.""" - CALL = "calls" - INHERITS = "inherits" - IMPORTS = "imports" - - -class CodeRelationship(BaseModel): - """A relationship between code symbols (e.g., function calls, inheritance).""" - - source_symbol: str = Field(..., min_length=1, description="Name of source symbol") - target_symbol: str = Field(..., min_length=1, description="Name of target symbol") - relationship_type: RelationshipType = Field(..., description="Type of relationship (call, inherits, etc.)") - source_file: str = Field(..., min_length=1, description="File path containing source symbol") - target_file: Optional[str] = Field(default=None, description="File path containing target (None if same file)") - source_line: int = Field(..., ge=1, description="Line number where relationship occurs (1-based)") - - -class AdditionalLocation(BaseModel): - """A pointer to another location where a similar result was found. 
- - Used for grouping search results with similar scores and content, - where the primary result is stored in SearchResult and secondary - locations are stored in this model. - """ - - path: str = Field(..., min_length=1) - score: float = Field(..., ge=0.0) - start_line: Optional[int] = Field(default=None, description="Start line of the result (1-based)") - end_line: Optional[int] = Field(default=None, description="End line of the result (1-based)") - symbol_name: Optional[str] = Field(default=None, description="Name of matched symbol") - - -class SearchResult(BaseModel): - """A unified search result for lexical or semantic search.""" - - path: str = Field(..., min_length=1) - score: float = Field(..., ge=0.0) - excerpt: Optional[str] = None - content: Optional[str] = Field(default=None, description="Full content of matched code block") - symbol: Optional[Symbol] = None - chunk: Optional[SemanticChunk] = None - metadata: Dict[str, Any] = Field(default_factory=dict) - - # Additional context for complete code blocks - start_line: Optional[int] = Field(default=None, description="Start line of code block (1-based)") - end_line: Optional[int] = Field(default=None, description="End line of code block (1-based)") - symbol_name: Optional[str] = Field(default=None, description="Name of matched symbol/function/class") - symbol_kind: Optional[str] = Field(default=None, description="Kind of symbol (function/class/method)") - - # Field for grouping similar results - additional_locations: List["AdditionalLocation"] = Field( - default_factory=list, - description="Other locations for grouped results with similar scores and content." - ) diff --git a/codex-lens/src/codexlens/env_config.py b/codex-lens/src/codexlens/env_config.py deleted file mode 100644 index 8f1b1b0f..00000000 --- a/codex-lens/src/codexlens/env_config.py +++ /dev/null @@ -1,329 +0,0 @@ -"""Environment configuration loader for CodexLens. 
- -Loads .env files from workspace .codexlens directory with fallback to project root. -Provides unified access to API configurations. - -Priority order: -1. Environment variables (already set) -2. .codexlens/.env (workspace-local) -3. .env (project root) -""" - -from __future__ import annotations - -import logging -import os -from pathlib import Path -from typing import Any, Dict, Optional - -log = logging.getLogger(__name__) - -# Supported environment variables with descriptions -ENV_VARS = { - # Reranker configuration (overrides settings.json) - "RERANKER_MODEL": "Reranker model name (overrides settings.json)", - "RERANKER_BACKEND": "Reranker backend: fastembed, onnx, api, litellm, legacy", - "RERANKER_USE_GPU": "Use GPU for local reranker backends: true/false", - "RERANKER_ENABLED": "Enable reranker: true/false", - "RERANKER_API_KEY": "API key for reranker service (SiliconFlow/Cohere/Jina)", - "RERANKER_API_BASE": "Base URL for reranker API (overrides provider default)", - "RERANKER_PROVIDER": "Reranker provider: siliconflow, cohere, jina", - "RERANKER_POOL_ENABLED": "Enable reranker high availability pool: true/false", - "RERANKER_STRATEGY": "Reranker load balance strategy: round_robin, latency_aware, weighted_random", - "RERANKER_COOLDOWN": "Reranker rate limit cooldown in seconds", - # Embedding configuration (overrides settings.json) - "EMBEDDING_MODEL": "Embedding model/profile name (overrides settings.json)", - "EMBEDDING_BACKEND": "Embedding backend: fastembed, litellm", - "AUTO_EMBED_MISSING": "Auto-build embeddings in background when indexed projects are searched without vectors: true/false", - "EMBEDDING_API_KEY": "API key for embedding service", - "EMBEDDING_API_BASE": "Base URL for embedding API", - "EMBEDDING_POOL_ENABLED": "Enable embedding high availability pool: true/false", - "EMBEDDING_STRATEGY": "Embedding load balance strategy: round_robin, latency_aware, weighted_random", - "EMBEDDING_COOLDOWN": "Embedding rate limit cooldown in seconds", - 
# LiteLLM configuration - "LITELLM_API_KEY": "API key for LiteLLM", - "LITELLM_API_BASE": "Base URL for LiteLLM", - "LITELLM_MODEL": "LiteLLM model name", - # General configuration - "CODEXLENS_DATA_DIR": "Custom data directory path", - "CODEXLENS_DEBUG": "Enable debug mode (true/false)", - # Cascade / staged pipeline configuration - "ENABLE_CASCADE_SEARCH": "Enable cascade search (true/false)", - "CASCADE_STRATEGY": "Cascade strategy: binary, binary_rerank (alias: hybrid), dense_rerank, staged", - "CASCADE_COARSE_K": "Cascade coarse_k candidate count (int)", - "CASCADE_FINE_K": "Cascade fine_k result count (int)", - "STAGED_STAGE2_MODE": "Staged Stage 2 mode: precomputed, realtime, static_global_graph", - "STAGED_CLUSTERING_STRATEGY": "Staged clustering strategy: auto, score, path, dir_rr, noop, ...", - "STAGED_CLUSTERING_MIN_SIZE": "Staged clustering min cluster size (int)", - "ENABLE_STAGED_RERANK": "Enable staged reranking in Stage 4 (true/false)", - "STAGED_REALTIME_LSP_TIMEOUT_S": "Realtime LSP expansion timeout budget (float seconds)", - "STAGED_REALTIME_LSP_DEPTH": "Realtime LSP BFS depth (int)", - "STAGED_REALTIME_LSP_MAX_NODES": "Realtime LSP max nodes (int)", - "STAGED_REALTIME_LSP_MAX_SEEDS": "Realtime LSP max seeds (int)", - "STAGED_REALTIME_LSP_MAX_CONCURRENT": "Realtime LSP max concurrent requests (int)", - "STAGED_REALTIME_LSP_WARMUP_S": "Realtime LSP warmup wait after didOpen (float seconds)", - "STAGED_REALTIME_LSP_RESOLVE_SYMBOLS": "Resolve symbols via documentSymbol in realtime expansion (true/false)", - # Chunking configuration - "CHUNK_STRIP_COMMENTS": "Strip comments from code chunks for embedding: true/false (default: true)", - "CHUNK_STRIP_DOCSTRINGS": "Strip docstrings from code chunks for embedding: true/false (default: true)", - # Search ranking tuning - "TEST_FILE_PENALTY": "Penalty for test/fixture paths in final search ranking: 0.0-1.0 (default: 0.15)", - "GENERATED_FILE_PENALTY": "Penalty for generated/build artifact paths in final 
search ranking: 0.0-1.0 (default: 0.35)", - # Reranker tuning - "RERANKER_TEST_FILE_PENALTY": "Penalty for test files in reranking: 0.0-1.0 (default: 0.0)", - "RERANKER_DOCSTRING_WEIGHT": "Weight for docstring chunks in reranking: 0.0-1.0 (default: 1.0)", -} - - -def _parse_env_line(line: str) -> tuple[str, str] | None: - """Parse a single .env line, returning (key, value) or None.""" - line = line.strip() - - # Skip empty lines and comments - if not line or line.startswith("#"): - return None - - # Handle export prefix - if line.startswith("export "): - line = line[7:].strip() - - # Split on first = - if "=" not in line: - return None - - key, _, value = line.partition("=") - key = key.strip() - value = value.strip() - - # Remove surrounding quotes - if len(value) >= 2: - if (value.startswith('"') and value.endswith('"')) or \ - (value.startswith("'") and value.endswith("'")): - value = value[1:-1] - - return key, value - - -def load_env_file(env_path: Path) -> Dict[str, str]: - """Load environment variables from a .env file. 
- - Args: - env_path: Path to .env file - - Returns: - Dictionary of environment variables - """ - if not env_path.is_file(): - return {} - - env_vars: Dict[str, str] = {} - - try: - content = env_path.read_text(encoding="utf-8") - for line in content.splitlines(): - result = _parse_env_line(line) - if result: - key, value = result - env_vars[key] = value - except (OSError, UnicodeDecodeError) as exc: - # File access errors or encoding issues are expected and logged - log.warning("Failed to load .env file %s: %s", env_path, exc) - except Exception as exc: - # Other unexpected errors are also logged but indicate a code issue - log.warning("Unexpected error loading .env file %s: %s", env_path, exc) - - return env_vars - - -def _get_global_data_dir() -> Path: - """Get global CodexLens data directory.""" - env_override = os.environ.get("CODEXLENS_DATA_DIR") - if env_override: - return Path(env_override).expanduser().resolve() - return (Path.home() / ".codexlens").resolve() - - -def load_global_env() -> Dict[str, str]: - """Load environment variables from global ~/.codexlens/.env file. - - Returns: - Dictionary of environment variables from global config - """ - global_env_path = _get_global_data_dir() / ".env" - if global_env_path.is_file(): - env_vars = load_env_file(global_env_path) - log.debug("Loaded %d vars from global %s", len(env_vars), global_env_path) - return env_vars - return {} - - -def load_workspace_env(workspace_root: Path | None = None) -> Dict[str, str]: - """Load environment variables from workspace .env files. - - Priority (later overrides earlier): - 1. Global ~/.codexlens/.env (lowest priority) - 2. Project root .env - 3. .codexlens/.env (highest priority) - - Args: - workspace_root: Workspace root directory. If None, uses current directory. 
- - Returns: - Merged dictionary of environment variables - """ - if workspace_root is None: - workspace_root = Path.cwd() - - workspace_root = Path(workspace_root).resolve() - - env_vars: Dict[str, str] = {} - - # Load from global ~/.codexlens/.env (lowest priority) - global_vars = load_global_env() - if global_vars: - env_vars.update(global_vars) - - # Load from project root .env (medium priority) - root_env = workspace_root / ".env" - if root_env.is_file(): - loaded = load_env_file(root_env) - env_vars.update(loaded) - log.debug("Loaded %d vars from %s", len(loaded), root_env) - - # Load from .codexlens/.env (highest priority) - codexlens_env = workspace_root / ".codexlens" / ".env" - if codexlens_env.is_file(): - loaded = load_env_file(codexlens_env) - env_vars.update(loaded) - log.debug("Loaded %d vars from %s", len(loaded), codexlens_env) - - return env_vars - - -def apply_workspace_env(workspace_root: Path | None = None, *, override: bool = False) -> int: - """Load .env files and apply to os.environ. - - Args: - workspace_root: Workspace root directory - override: If True, override existing environment variables - - Returns: - Number of variables applied - """ - env_vars = load_workspace_env(workspace_root) - applied = 0 - - for key, value in env_vars.items(): - if override or key not in os.environ: - os.environ[key] = value - applied += 1 - log.debug("Applied env var: %s", key) - - return applied - - -def get_env(key: str, default: str | None = None, *, workspace_root: Path | None = None) -> str | None: - """Get environment variable with .env file fallback. - - Priority: - 1. os.environ (already set) - 2. .codexlens/.env - 3. .env - 4. 
default value - - Args: - key: Environment variable name - default: Default value if not found - workspace_root: Workspace root for .env file lookup - - Returns: - Value or default - """ - # Check os.environ first - if key in os.environ: - return os.environ[key] - - # Load from .env files - env_vars = load_workspace_env(workspace_root) - if key in env_vars: - return env_vars[key] - - return default - - -def get_api_config( - prefix: str, - *, - workspace_root: Path | None = None, - defaults: Dict[str, Any] | None = None, -) -> Dict[str, Any]: - """Get API configuration from environment. - - Loads {PREFIX}_API_KEY, {PREFIX}_API_BASE, {PREFIX}_MODEL, etc. - - Args: - prefix: Environment variable prefix (e.g., "RERANKER", "EMBEDDING") - workspace_root: Workspace root for .env file lookup - defaults: Default values - - Returns: - Dictionary with api_key, api_base, model, etc. - """ - defaults = defaults or {} - - config: Dict[str, Any] = {} - - # Standard API config fields - field_mapping = { - "api_key": f"{prefix}_API_KEY", - "api_base": f"{prefix}_API_BASE", - "model": f"{prefix}_MODEL", - "provider": f"{prefix}_PROVIDER", - "timeout": f"{prefix}_TIMEOUT", - } - - for field, env_key in field_mapping.items(): - value = get_env(env_key, workspace_root=workspace_root) - if value is not None: - # Type conversion for specific fields - if field == "timeout": - try: - config[field] = float(value) - except ValueError: - pass - else: - config[field] = value - elif field in defaults: - config[field] = defaults[field] - - return config - - -def generate_env_example() -> str: - """Generate .env.example content with all supported variables. 
- - Returns: - String content for .env.example file - """ - lines = [ - "# CodexLens Environment Configuration", - "# Copy this file to .codexlens/.env and fill in your values", - "", - ] - - # Group by prefix - groups: Dict[str, list] = {} - for key, desc in ENV_VARS.items(): - prefix = key.split("_")[0] - if prefix not in groups: - groups[prefix] = [] - groups[prefix].append((key, desc)) - - for prefix, items in groups.items(): - lines.append(f"# {prefix} Configuration") - for key, desc in items: - lines.append(f"# {desc}") - lines.append(f"# {key}=") - lines.append("") - - return "\n".join(lines) diff --git a/codex-lens/src/codexlens/errors.py b/codex-lens/src/codexlens/errors.py deleted file mode 100644 index cdaafa74..00000000 --- a/codex-lens/src/codexlens/errors.py +++ /dev/null @@ -1,59 +0,0 @@ -"""CodexLens exception hierarchy.""" - -from __future__ import annotations - - -class CodexLensError(Exception): - """Base class for all CodexLens errors.""" - - -class ConfigError(CodexLensError): - """Raised when configuration is invalid or cannot be loaded.""" - - -class ParseError(CodexLensError): - """Raised when parsing or indexing a file fails.""" - - -class StorageError(CodexLensError): - """Raised when reading/writing index storage fails. 
- - Attributes: - message: Human-readable error description - db_path: Path to the database file (if applicable) - operation: The operation that failed (e.g., 'query', 'initialize', 'migrate') - details: Additional context for debugging - """ - - def __init__( - self, - message: str, - db_path: str | None = None, - operation: str | None = None, - details: dict | None = None - ) -> None: - super().__init__(message) - self.message = message - self.db_path = db_path - self.operation = operation - self.details = details or {} - - def __str__(self) -> str: - parts = [self.message] - if self.db_path: - parts.append(f"[db: {self.db_path}]") - if self.operation: - parts.append(f"[op: {self.operation}]") - if self.details: - detail_str = ", ".join(f"{k}={v}" for k, v in self.details.items()) - parts.append(f"[{detail_str}]") - return " ".join(parts) - - -class SearchError(CodexLensError): - """Raised when a search operation fails.""" - - -class IndexNotFoundError(CodexLensError): - """Raised when a project's index cannot be found.""" - diff --git a/codex-lens/src/codexlens/hybrid_search/__init__.py b/codex-lens/src/codexlens/hybrid_search/__init__.py deleted file mode 100644 index 03dd31b3..00000000 --- a/codex-lens/src/codexlens/hybrid_search/__init__.py +++ /dev/null @@ -1,28 +0,0 @@ -"""Hybrid Search data structures for CodexLens. 
- -This module provides core data structures for hybrid search: -- CodeSymbolNode: Graph node representing a code symbol -- CodeAssociationGraph: Graph of code relationships -- SearchResultCluster: Clustered search results -- Range: Position range in source files -- CallHierarchyItem: LSP call hierarchy item - -Note: The search engine is in codexlens.search.hybrid_search - LSP-based expansion is in codexlens.lsp module -""" - -from codexlens.hybrid_search.data_structures import ( - CallHierarchyItem, - CodeAssociationGraph, - CodeSymbolNode, - Range, - SearchResultCluster, -) - -__all__ = [ - "CallHierarchyItem", - "CodeAssociationGraph", - "CodeSymbolNode", - "Range", - "SearchResultCluster", -] diff --git a/codex-lens/src/codexlens/hybrid_search/data_structures.py b/codex-lens/src/codexlens/hybrid_search/data_structures.py deleted file mode 100644 index 898971d0..00000000 --- a/codex-lens/src/codexlens/hybrid_search/data_structures.py +++ /dev/null @@ -1,602 +0,0 @@ -"""Core data structures for the hybrid search system. - -This module defines the fundamental data structures used throughout the -hybrid search pipeline, including code symbol representations, association -graphs, and clustered search results. -""" - -from __future__ import annotations - -from dataclasses import dataclass, field -from typing import Any, Dict, List, Optional, Tuple, TYPE_CHECKING - -if TYPE_CHECKING: - import networkx as nx - - -@dataclass -class Range: - """Position range within a source file. - - Attributes: - start_line: Starting line number (0-based). - start_character: Starting character offset within the line. - end_line: Ending line number (0-based). - end_character: Ending character offset within the line. 
- """ - - start_line: int - start_character: int - end_line: int - end_character: int - - def __post_init__(self) -> None: - """Validate range values.""" - if self.start_line < 0: - raise ValueError("start_line must be >= 0") - if self.start_character < 0: - raise ValueError("start_character must be >= 0") - if self.end_line < 0: - raise ValueError("end_line must be >= 0") - if self.end_character < 0: - raise ValueError("end_character must be >= 0") - if self.end_line < self.start_line: - raise ValueError("end_line must be >= start_line") - if self.end_line == self.start_line and self.end_character < self.start_character: - raise ValueError("end_character must be >= start_character on the same line") - - def to_dict(self) -> Dict[str, Any]: - """Convert to dictionary for JSON serialization.""" - return { - "start": {"line": self.start_line, "character": self.start_character}, - "end": {"line": self.end_line, "character": self.end_character}, - } - - @classmethod - def from_dict(cls, data: Dict[str, Any]) -> Range: - """Create Range from dictionary representation.""" - return cls( - start_line=data["start"]["line"], - start_character=data["start"]["character"], - end_line=data["end"]["line"], - end_character=data["end"]["character"], - ) - - @classmethod - def from_lsp_range(cls, lsp_range: Dict[str, Any]) -> Range: - """Create Range from LSP Range object. - - LSP Range format: - {"start": {"line": int, "character": int}, - "end": {"line": int, "character": int}} - """ - return cls( - start_line=lsp_range["start"]["line"], - start_character=lsp_range["start"]["character"], - end_line=lsp_range["end"]["line"], - end_character=lsp_range["end"]["character"], - ) - - -@dataclass -class CallHierarchyItem: - """LSP CallHierarchyItem for representing callers/callees. - - Attributes: - name: Symbol name (function, method, class name). - kind: Symbol kind (function, method, class, etc.). - file_path: Absolute file path where the symbol is defined. 
- range: Position range in the source file. - detail: Optional additional detail about the symbol. - """ - - name: str - kind: str - file_path: str - range: Range - detail: Optional[str] = None - - def to_dict(self) -> Dict[str, Any]: - """Convert to dictionary for JSON serialization.""" - result: Dict[str, Any] = { - "name": self.name, - "kind": self.kind, - "file_path": self.file_path, - "range": self.range.to_dict(), - } - if self.detail: - result["detail"] = self.detail - return result - - @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "CallHierarchyItem": - """Create CallHierarchyItem from dictionary representation.""" - return cls( - name=data["name"], - kind=data["kind"], - file_path=data["file_path"], - range=Range.from_dict(data["range"]), - detail=data.get("detail"), - ) - - -@dataclass -class CodeSymbolNode: - """Graph node representing a code symbol. - - Attributes: - id: Unique identifier in format 'file_path:name:line'. - name: Symbol name (function, class, variable name). - kind: Symbol kind (function, class, method, variable, etc.). - file_path: Absolute file path where symbol is defined. - range: Start/end position in the source file. - embedding: Optional vector embedding for semantic search. - raw_code: Raw source code of the symbol. - docstring: Documentation string (if available). - score: Ranking score (used during reranking). 
- """ - - id: str - name: str - kind: str - file_path: str - range: Range - embedding: Optional[List[float]] = None - raw_code: str = "" - docstring: str = "" - score: float = 0.0 - - def __post_init__(self) -> None: - """Validate required fields.""" - if not self.id: - raise ValueError("id cannot be empty") - if not self.name: - raise ValueError("name cannot be empty") - if not self.kind: - raise ValueError("kind cannot be empty") - if not self.file_path: - raise ValueError("file_path cannot be empty") - - def __hash__(self) -> int: - """Hash based on unique ID.""" - return hash(self.id) - - def __eq__(self, other: object) -> bool: - """Equality based on unique ID.""" - if not isinstance(other, CodeSymbolNode): - return False - return self.id == other.id - - def to_dict(self) -> Dict[str, Any]: - """Convert to dictionary for JSON serialization.""" - result: Dict[str, Any] = { - "id": self.id, - "name": self.name, - "kind": self.kind, - "file_path": self.file_path, - "range": self.range.to_dict(), - "score": self.score, - } - if self.raw_code: - result["raw_code"] = self.raw_code - if self.docstring: - result["docstring"] = self.docstring - # Exclude embedding from serialization (too large for JSON responses) - return result - - @classmethod - def from_dict(cls, data: Dict[str, Any]) -> CodeSymbolNode: - """Create CodeSymbolNode from dictionary representation.""" - return cls( - id=data["id"], - name=data["name"], - kind=data["kind"], - file_path=data["file_path"], - range=Range.from_dict(data["range"]), - embedding=data.get("embedding"), - raw_code=data.get("raw_code", ""), - docstring=data.get("docstring", ""), - score=data.get("score", 0.0), - ) - - @classmethod - def from_lsp_location( - cls, - uri: str, - name: str, - kind: str, - lsp_range: Dict[str, Any], - raw_code: str = "", - docstring: str = "", - ) -> CodeSymbolNode: - """Create CodeSymbolNode from LSP location data. - - Args: - uri: File URI (file:// prefix will be stripped). - name: Symbol name. 
- kind: Symbol kind. - lsp_range: LSP Range object. - raw_code: Optional raw source code. - docstring: Optional documentation string. - - Returns: - New CodeSymbolNode instance. - """ - # Strip file:// prefix if present - file_path = uri - if file_path.startswith("file://"): - file_path = file_path[7:] - # Handle Windows paths (file:///C:/...) - if len(file_path) > 2 and file_path[0] == "/" and file_path[2] == ":": - file_path = file_path[1:] - - range_obj = Range.from_lsp_range(lsp_range) - symbol_id = f"{file_path}:{name}:{range_obj.start_line}" - - return cls( - id=symbol_id, - name=name, - kind=kind, - file_path=file_path, - range=range_obj, - raw_code=raw_code, - docstring=docstring, - ) - - @classmethod - def create_id(cls, file_path: str, name: str, line: int) -> str: - """Generate a unique symbol ID. - - Args: - file_path: Absolute file path. - name: Symbol name. - line: Start line number. - - Returns: - Unique ID string in format 'file_path:name:line'. - """ - return f"{file_path}:{name}:{line}" - - -@dataclass -class CodeAssociationGraph: - """Graph of code relationships between symbols. - - This graph represents the association between code symbols discovered - through LSP queries (references, call hierarchy, etc.). - - Attributes: - nodes: Dictionary mapping symbol IDs to CodeSymbolNode objects. - edges: List of (from_id, to_id, relationship_type) tuples. - relationship_type: 'calls', 'references', 'inherits', 'imports'. - """ - - nodes: Dict[str, CodeSymbolNode] = field(default_factory=dict) - edges: List[Tuple[str, str, str]] = field(default_factory=list) - - def add_node(self, node: CodeSymbolNode) -> None: - """Add a node to the graph. - - Args: - node: CodeSymbolNode to add. If a node with the same ID exists, - it will be replaced. - """ - self.nodes[node.id] = node - - def add_edge(self, from_id: str, to_id: str, rel_type: str) -> None: - """Add an edge to the graph. - - Args: - from_id: Source node ID. - to_id: Target node ID. 
- rel_type: Relationship type ('calls', 'references', 'inherits', 'imports'). - - Raises: - ValueError: If from_id or to_id not in graph nodes. - """ - if from_id not in self.nodes: - raise ValueError(f"Source node '{from_id}' not found in graph") - if to_id not in self.nodes: - raise ValueError(f"Target node '{to_id}' not found in graph") - - edge = (from_id, to_id, rel_type) - if edge not in self.edges: - self.edges.append(edge) - - def add_edge_unchecked(self, from_id: str, to_id: str, rel_type: str) -> None: - """Add an edge without validating node existence. - - Use this method during bulk graph construction where nodes may be - added after edges, or when performance is critical. - - Args: - from_id: Source node ID. - to_id: Target node ID. - rel_type: Relationship type. - """ - edge = (from_id, to_id, rel_type) - if edge not in self.edges: - self.edges.append(edge) - - def get_node(self, node_id: str) -> Optional[CodeSymbolNode]: - """Get a node by ID. - - Args: - node_id: Node ID to look up. - - Returns: - CodeSymbolNode if found, None otherwise. - """ - return self.nodes.get(node_id) - - def get_neighbors(self, node_id: str, rel_type: Optional[str] = None) -> List[CodeSymbolNode]: - """Get neighboring nodes connected by outgoing edges. - - Args: - node_id: Node ID to find neighbors for. - rel_type: Optional filter by relationship type. - - Returns: - List of neighboring CodeSymbolNode objects. - """ - neighbors = [] - for from_id, to_id, edge_rel in self.edges: - if from_id == node_id: - if rel_type is None or edge_rel == rel_type: - node = self.nodes.get(to_id) - if node: - neighbors.append(node) - return neighbors - - def get_incoming(self, node_id: str, rel_type: Optional[str] = None) -> List[CodeSymbolNode]: - """Get nodes connected by incoming edges. - - Args: - node_id: Node ID to find incoming connections for. - rel_type: Optional filter by relationship type. - - Returns: - List of CodeSymbolNode objects with edges pointing to node_id. 
- """ - incoming = [] - for from_id, to_id, edge_rel in self.edges: - if to_id == node_id: - if rel_type is None or edge_rel == rel_type: - node = self.nodes.get(from_id) - if node: - incoming.append(node) - return incoming - - def to_networkx(self) -> "nx.DiGraph": - """Convert to NetworkX DiGraph for graph algorithms. - - Returns: - NetworkX directed graph with nodes and edges. - - Raises: - ImportError: If networkx is not installed. - """ - try: - import networkx as nx - except ImportError: - raise ImportError( - "networkx is required for graph algorithms. " - "Install with: pip install networkx" - ) - - graph = nx.DiGraph() - - # Add nodes with attributes - for node_id, node in self.nodes.items(): - graph.add_node( - node_id, - name=node.name, - kind=node.kind, - file_path=node.file_path, - score=node.score, - ) - - # Add edges with relationship type - for from_id, to_id, rel_type in self.edges: - graph.add_edge(from_id, to_id, relationship=rel_type) - - return graph - - def to_dict(self) -> Dict[str, Any]: - """Convert to dictionary for JSON serialization. - - Returns: - Dictionary with 'nodes' and 'edges' keys. - """ - return { - "nodes": {node_id: node.to_dict() for node_id, node in self.nodes.items()}, - "edges": [ - {"from": from_id, "to": to_id, "relationship": rel_type} - for from_id, to_id, rel_type in self.edges - ], - } - - @classmethod - def from_dict(cls, data: Dict[str, Any]) -> CodeAssociationGraph: - """Create CodeAssociationGraph from dictionary representation. - - Args: - data: Dictionary with 'nodes' and 'edges' keys. - - Returns: - New CodeAssociationGraph instance. 
- """ - graph = cls() - - # Load nodes - for node_id, node_data in data.get("nodes", {}).items(): - graph.nodes[node_id] = CodeSymbolNode.from_dict(node_data) - - # Load edges - for edge_data in data.get("edges", []): - graph.edges.append(( - edge_data["from"], - edge_data["to"], - edge_data["relationship"], - )) - - return graph - - def __len__(self) -> int: - """Return the number of nodes in the graph.""" - return len(self.nodes) - - -@dataclass -class SearchResultCluster: - """Clustered search result containing related code symbols. - - Search results are grouped into clusters based on graph community - detection or embedding similarity. Each cluster represents a - conceptually related group of code symbols. - - Attributes: - cluster_id: Unique cluster identifier. - score: Cluster relevance score (max of symbol scores). - title: Human-readable cluster title/summary. - symbols: List of CodeSymbolNode in this cluster. - metadata: Additional cluster metadata. - """ - - cluster_id: str - score: float - title: str - symbols: List[CodeSymbolNode] = field(default_factory=list) - metadata: Dict[str, Any] = field(default_factory=dict) - - def __post_init__(self) -> None: - """Validate cluster fields.""" - if not self.cluster_id: - raise ValueError("cluster_id cannot be empty") - if self.score < 0: - raise ValueError("score must be >= 0") - - def add_symbol(self, symbol: CodeSymbolNode) -> None: - """Add a symbol to the cluster. - - Args: - symbol: CodeSymbolNode to add. - """ - self.symbols.append(symbol) - - def get_top_symbols(self, n: int = 5) -> List[CodeSymbolNode]: - """Get top N symbols by score. - - Args: - n: Number of symbols to return. - - Returns: - List of top N CodeSymbolNode objects sorted by score descending. 
- """ - sorted_symbols = sorted(self.symbols, key=lambda s: s.score, reverse=True) - return sorted_symbols[:n] - - def update_score(self) -> None: - """Update cluster score to max of symbol scores.""" - if self.symbols: - self.score = max(s.score for s in self.symbols) - - def to_dict(self) -> Dict[str, Any]: - """Convert to dictionary for JSON serialization. - - Returns: - Dictionary representation of the cluster. - """ - return { - "cluster_id": self.cluster_id, - "score": self.score, - "title": self.title, - "symbols": [s.to_dict() for s in self.symbols], - "metadata": self.metadata, - } - - @classmethod - def from_dict(cls, data: Dict[str, Any]) -> SearchResultCluster: - """Create SearchResultCluster from dictionary representation. - - Args: - data: Dictionary with cluster data. - - Returns: - New SearchResultCluster instance. - """ - return cls( - cluster_id=data["cluster_id"], - score=data["score"], - title=data["title"], - symbols=[CodeSymbolNode.from_dict(s) for s in data.get("symbols", [])], - metadata=data.get("metadata", {}), - ) - - def __len__(self) -> int: - """Return the number of symbols in the cluster.""" - return len(self.symbols) - - -@dataclass -class CallHierarchyItem: - """LSP CallHierarchyItem for representing callers/callees. - - Attributes: - name: Symbol name (function, method, etc.). - kind: Symbol kind (function, method, etc.). - file_path: Absolute file path. - range: Position range in the file. - detail: Optional additional detail (e.g., signature). 
- """ - - name: str - kind: str - file_path: str - range: Range - detail: Optional[str] = None - - def to_dict(self) -> Dict[str, Any]: - """Convert to dictionary for JSON serialization.""" - result: Dict[str, Any] = { - "name": self.name, - "kind": self.kind, - "file_path": self.file_path, - "range": self.range.to_dict(), - } - if self.detail: - result["detail"] = self.detail - return result - - @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "CallHierarchyItem": - """Create CallHierarchyItem from dictionary representation.""" - return cls( - name=data.get("name", "unknown"), - kind=data.get("kind", "unknown"), - file_path=data.get("file_path", data.get("uri", "")), - range=Range.from_dict(data.get("range", {"start": {"line": 0, "character": 0}, "end": {"line": 0, "character": 0}})), - detail=data.get("detail"), - ) - - @classmethod - def from_lsp(cls, data: Dict[str, Any]) -> "CallHierarchyItem": - """Create CallHierarchyItem from LSP response format. - - LSP uses 0-based line numbers and 'character' instead of 'char'. - """ - uri = data.get("uri", data.get("file_path", "")) - # Strip file:// prefix - file_path = uri - if file_path.startswith("file://"): - file_path = file_path[7:] - if len(file_path) > 2 and file_path[0] == "/" and file_path[2] == ":": - file_path = file_path[1:] - - return cls( - name=data.get("name", "unknown"), - kind=str(data.get("kind", "unknown")), - file_path=file_path, - range=Range.from_lsp_range(data.get("range", {"start": {"line": 0, "character": 0}, "end": {"line": 0, "character": 0}})), - detail=data.get("detail"), - ) diff --git a/codex-lens/src/codexlens/indexing/README.md b/codex-lens/src/codexlens/indexing/README.md deleted file mode 100644 index 7377874d..00000000 --- a/codex-lens/src/codexlens/indexing/README.md +++ /dev/null @@ -1,77 +0,0 @@ -# Symbol Extraction and Indexing - -This module provides symbol extraction and relationship tracking for code graph enrichment. 
- -## Overview - -The `SymbolExtractor` class extracts code symbols (functions, classes) and their relationships (calls, imports) from source files using regex-based pattern matching. - -## Supported Languages - -- Python (.py) -- TypeScript (.ts, .tsx) -- JavaScript (.js, .jsx) - -## Database Schema - -### Symbols Table -Stores code symbols with their location information: -- `id`: Primary key -- `qualified_name`: Fully qualified name (e.g., "module.ClassName") -- `name`: Symbol name -- `kind`: Symbol type (function, class) -- `file_path`: Path to source file -- `start_line`: Starting line number -- `end_line`: Ending line number - -### Symbol Relationships Table -Stores relationships between symbols: -- `id`: Primary key -- `source_symbol_id`: Foreign key to symbols table -- `target_symbol_fqn`: Fully qualified name of target symbol -- `relationship_type`: Type of relationship (calls, imports) -- `file_path`: Path to source file -- `line`: Line number where relationship occurs - -## Usage Example - -```python -from pathlib import Path -from codexlens.indexing.symbol_extractor import SymbolExtractor - -# Initialize extractor -db_path = Path("./code_index.db") -extractor = SymbolExtractor(db_path) -extractor.connect() - -# Extract from file -file_path = Path("src/my_module.py") -with open(file_path) as f: - content = f.read() - -symbols, relationships = extractor.extract_from_file(file_path, content) - -# Save to database -name_to_id = extractor.save_symbols(symbols) -extractor.save_relationships(relationships, name_to_id) - -# Clean up -extractor.close() -``` - -## Pattern Matching - -The extractor uses regex patterns to identify: - -- **Functions**: Function definitions (including async, export keywords) -- **Classes**: Class definitions (including export keyword) -- **Imports**: Import/require statements -- **Calls**: Function/method invocations - -## Future Enhancements - -- Tree-sitter integration for more accurate parsing -- Support for additional languages 
-- Method and variable extraction -- Enhanced scope tracking -- Relationship type expansion (inherits, implements, etc.) diff --git a/codex-lens/src/codexlens/indexing/__init__.py b/codex-lens/src/codexlens/indexing/__init__.py deleted file mode 100644 index 1136099f..00000000 --- a/codex-lens/src/codexlens/indexing/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -"""Code indexing and symbol extraction.""" -from codexlens.indexing.symbol_extractor import SymbolExtractor -from codexlens.indexing.embedding import ( - BinaryEmbeddingBackend, - DenseEmbeddingBackend, - CascadeEmbeddingBackend, - get_cascade_embedder, - binarize_embedding, - pack_binary_embedding, - unpack_binary_embedding, - hamming_distance, -) - -__all__ = [ - "SymbolExtractor", - # Cascade embedding backends - "BinaryEmbeddingBackend", - "DenseEmbeddingBackend", - "CascadeEmbeddingBackend", - "get_cascade_embedder", - # Utility functions - "binarize_embedding", - "pack_binary_embedding", - "unpack_binary_embedding", - "hamming_distance", -] diff --git a/codex-lens/src/codexlens/indexing/embedding.py b/codex-lens/src/codexlens/indexing/embedding.py deleted file mode 100644 index 4175f3e5..00000000 --- a/codex-lens/src/codexlens/indexing/embedding.py +++ /dev/null @@ -1,582 +0,0 @@ -"""Multi-type embedding backends for cascade retrieval. - -This module provides embedding backends optimized for cascade retrieval: -1. BinaryEmbeddingBackend - Fast coarse filtering with binary vectors -2. DenseEmbeddingBackend - High-precision dense vectors for reranking -3. CascadeEmbeddingBackend - Combined binary + dense for two-stage retrieval - -Cascade retrieval workflow: -1. Binary search (fast, ~32 bytes/vector) -> top-K candidates -2. 
Dense rerank (precise, ~8KB/vector) -> final results -""" - -from __future__ import annotations - -import logging -from typing import Iterable, List, Optional, Tuple - -import numpy as np - -from codexlens.semantic.base import BaseEmbedder - -logger = logging.getLogger(__name__) - - -# ============================================================================= -# Utility Functions -# ============================================================================= - - -def binarize_embedding(embedding: np.ndarray) -> np.ndarray: - """Convert float embedding to binary vector. - - Applies sign-based quantization: values > 0 become 1, values <= 0 become 0. - - Args: - embedding: Float32 embedding of any dimension - - Returns: - Binary vector (uint8 with values 0 or 1) of same dimension - """ - return (embedding > 0).astype(np.uint8) - - -def pack_binary_embedding(binary_vector: np.ndarray) -> bytes: - """Pack binary vector into compact bytes format. - - Packs 8 binary values into each byte for storage efficiency. - For a 256-dim binary vector, output is 32 bytes. - - Args: - binary_vector: Binary vector (uint8 with values 0 or 1) - - Returns: - Packed bytes (length = ceil(dim / 8)) - """ - # Ensure vector length is multiple of 8 by padding if needed - dim = len(binary_vector) - padded_dim = ((dim + 7) // 8) * 8 - if padded_dim > dim: - padded = np.zeros(padded_dim, dtype=np.uint8) - padded[:dim] = binary_vector - binary_vector = padded - - # Pack 8 bits per byte - packed = np.packbits(binary_vector) - return packed.tobytes() - - -def unpack_binary_embedding(packed_bytes: bytes, dim: int = 256) -> np.ndarray: - """Unpack bytes back to binary vector. 
- - Args: - packed_bytes: Packed binary data - dim: Original vector dimension (default: 256) - - Returns: - Binary vector (uint8 with values 0 or 1) - """ - unpacked = np.unpackbits(np.frombuffer(packed_bytes, dtype=np.uint8)) - return unpacked[:dim] - - -def hamming_distance(a: bytes, b: bytes) -> int: - """Compute Hamming distance between two packed binary vectors. - - Uses XOR and popcount for efficient distance computation. - - Args: - a: First packed binary vector - b: Second packed binary vector - - Returns: - Hamming distance (number of differing bits) - """ - a_arr = np.frombuffer(a, dtype=np.uint8) - b_arr = np.frombuffer(b, dtype=np.uint8) - xor = np.bitwise_xor(a_arr, b_arr) - return int(np.unpackbits(xor).sum()) - - -# ============================================================================= -# Binary Embedding Backend -# ============================================================================= - - -class BinaryEmbeddingBackend(BaseEmbedder): - """Generate 256-dimensional binary embeddings for fast coarse retrieval. - - Uses a lightweight embedding model and applies sign-based quantization - to produce compact binary vectors (32 bytes per embedding). - - Suitable for: - - First-stage candidate retrieval - - Hamming distance-based similarity search - - Memory-constrained environments - - Model: sentence-transformers/all-MiniLM-L6-v2 (384 dim) -> quantized to 256 bits - """ - - DEFAULT_MODEL = "BAAI/bge-small-en-v1.5" # 384 dim, fast - BINARY_DIM = 256 - - def __init__( - self, - model_name: Optional[str] = None, - use_gpu: bool = True, - ) -> None: - """Initialize binary embedding backend. - - Args: - model_name: Base embedding model name. Defaults to BAAI/bge-small-en-v1.5 - use_gpu: Whether to use GPU acceleration - """ - from codexlens.semantic import SEMANTIC_AVAILABLE - - if not SEMANTIC_AVAILABLE: - raise ImportError( - "Semantic search dependencies not available. 
" - "Install with: pip install codexlens[semantic]" - ) - - self._model_name = model_name or self.DEFAULT_MODEL - self._use_gpu = use_gpu - self._model = None - - # Projection matrix for dimension reduction (lazily initialized) - self._projection_matrix: Optional[np.ndarray] = None - - @property - def model_name(self) -> str: - """Return model name.""" - return self._model_name - - @property - def embedding_dim(self) -> int: - """Return binary embedding dimension (256).""" - return self.BINARY_DIM - - @property - def packed_bytes(self) -> int: - """Return packed bytes size (32 bytes for 256 bits).""" - return self.BINARY_DIM // 8 - - def _load_model(self) -> None: - """Lazy load the embedding model.""" - if self._model is not None: - return - - from fastembed import TextEmbedding - from codexlens.semantic.gpu_support import get_optimal_providers - - providers = get_optimal_providers(use_gpu=self._use_gpu, with_device_options=True) - try: - self._model = TextEmbedding( - model_name=self._model_name, - providers=providers, - ) - except TypeError: - # Fallback for older fastembed versions - self._model = TextEmbedding(model_name=self._model_name) - - logger.debug(f"BinaryEmbeddingBackend loaded model: {self._model_name}") - - def _get_projection_matrix(self, input_dim: int) -> np.ndarray: - """Get or create projection matrix for dimension reduction. - - Uses random projection with fixed seed for reproducibility. 
- - Args: - input_dim: Input embedding dimension from base model - - Returns: - Projection matrix of shape (input_dim, BINARY_DIM) - """ - if self._projection_matrix is not None: - return self._projection_matrix - - # Fixed seed for reproducibility across sessions - rng = np.random.RandomState(42) - # Gaussian random projection - self._projection_matrix = rng.randn(input_dim, self.BINARY_DIM).astype(np.float32) - # Normalize columns for consistent scale - norms = np.linalg.norm(self._projection_matrix, axis=0, keepdims=True) - self._projection_matrix /= (norms + 1e-8) - - return self._projection_matrix - - def embed_to_numpy(self, texts: str | Iterable[str]) -> np.ndarray: - """Generate binary embeddings as numpy array. - - Args: - texts: Single text or iterable of texts - - Returns: - Binary embeddings of shape (n_texts, 256) with values 0 or 1 - """ - self._load_model() - - if isinstance(texts, str): - texts = [texts] - else: - texts = list(texts) - - # Get base float embeddings - float_embeddings = np.array(list(self._model.embed(texts))) - input_dim = float_embeddings.shape[1] - - # Project to target dimension if needed - if input_dim != self.BINARY_DIM: - projection = self._get_projection_matrix(input_dim) - float_embeddings = float_embeddings @ projection - - # Binarize - return binarize_embedding(float_embeddings) - - def embed_packed(self, texts: str | Iterable[str]) -> List[bytes]: - """Generate packed binary embeddings. - - Args: - texts: Single text or iterable of texts - - Returns: - List of packed bytes (32 bytes each for 256-dim) - """ - binary = self.embed_to_numpy(texts) - return [pack_binary_embedding(vec) for vec in binary] - - -# ============================================================================= -# Dense Embedding Backend -# ============================================================================= - - -class DenseEmbeddingBackend(BaseEmbedder): - """Generate high-dimensional dense embeddings for precise reranking. 
- - Uses large embedding models to produce 2048-dimensional float32 vectors - for maximum retrieval quality. - - Suitable for: - - Second-stage reranking - - High-precision similarity search - - Quality-critical applications - - Model: BAAI/bge-large-en-v1.5 (1024 dim) with optional expansion - """ - - DEFAULT_MODEL = "BAAI/bge-small-en-v1.5" # 384 dim, use small for testing - TARGET_DIM = 768 # Reduced target for faster testing - - def __init__( - self, - model_name: Optional[str] = None, - use_gpu: bool = True, - expand_dim: bool = True, - ) -> None: - """Initialize dense embedding backend. - - Args: - model_name: Dense embedding model name. Defaults to BAAI/bge-large-en-v1.5 - use_gpu: Whether to use GPU acceleration - expand_dim: If True, expand embeddings to TARGET_DIM using learned expansion - """ - from codexlens.semantic import SEMANTIC_AVAILABLE - - if not SEMANTIC_AVAILABLE: - raise ImportError( - "Semantic search dependencies not available. " - "Install with: pip install codexlens[semantic]" - ) - - self._model_name = model_name or self.DEFAULT_MODEL - self._use_gpu = use_gpu - self._expand_dim = expand_dim - self._model = None - self._native_dim: Optional[int] = None - - # Expansion matrix for dimension expansion (lazily initialized) - self._expansion_matrix: Optional[np.ndarray] = None - - @property - def model_name(self) -> str: - """Return model name.""" - return self._model_name - - @property - def embedding_dim(self) -> int: - """Return embedding dimension. - - Returns TARGET_DIM if expand_dim is True, otherwise native model dimension. 
- """ - if self._expand_dim: - return self.TARGET_DIM - # Return cached native dim or estimate based on model - if self._native_dim is not None: - return self._native_dim - # Model dimension estimates - model_dims = { - "BAAI/bge-large-en-v1.5": 1024, - "BAAI/bge-base-en-v1.5": 768, - "BAAI/bge-small-en-v1.5": 384, - "intfloat/multilingual-e5-large": 1024, - } - return model_dims.get(self._model_name, 1024) - - @property - def max_tokens(self) -> int: - """Return maximum token limit.""" - return 512 # Conservative default for large models - - def _load_model(self) -> None: - """Lazy load the embedding model.""" - if self._model is not None: - return - - from fastembed import TextEmbedding - from codexlens.semantic.gpu_support import get_optimal_providers - - providers = get_optimal_providers(use_gpu=self._use_gpu, with_device_options=True) - try: - self._model = TextEmbedding( - model_name=self._model_name, - providers=providers, - ) - except TypeError: - self._model = TextEmbedding(model_name=self._model_name) - - logger.debug(f"DenseEmbeddingBackend loaded model: {self._model_name}") - - def _get_expansion_matrix(self, input_dim: int) -> np.ndarray: - """Get or create expansion matrix for dimension expansion. - - Uses random orthogonal projection for information-preserving expansion. 
- - Args: - input_dim: Input embedding dimension from base model - - Returns: - Expansion matrix of shape (input_dim, TARGET_DIM) - """ - if self._expansion_matrix is not None: - return self._expansion_matrix - - # Fixed seed for reproducibility - rng = np.random.RandomState(123) - - # Create semi-orthogonal expansion matrix - # First input_dim columns form identity-like structure - self._expansion_matrix = np.zeros((input_dim, self.TARGET_DIM), dtype=np.float32) - - # Copy original dimensions - copy_dim = min(input_dim, self.TARGET_DIM) - self._expansion_matrix[:copy_dim, :copy_dim] = np.eye(copy_dim, dtype=np.float32) - - # Fill remaining with random projections - if self.TARGET_DIM > input_dim: - random_part = rng.randn(input_dim, self.TARGET_DIM - input_dim).astype(np.float32) - # Normalize - norms = np.linalg.norm(random_part, axis=0, keepdims=True) - random_part /= (norms + 1e-8) - self._expansion_matrix[:, input_dim:] = random_part - - return self._expansion_matrix - - def embed_to_numpy(self, texts: str | Iterable[str]) -> np.ndarray: - """Generate dense embeddings as numpy array. 
- - Args: - texts: Single text or iterable of texts - - Returns: - Dense embeddings of shape (n_texts, TARGET_DIM) as float32 - """ - self._load_model() - - if isinstance(texts, str): - texts = [texts] - else: - texts = list(texts) - - # Get base float embeddings - float_embeddings = np.array(list(self._model.embed(texts)), dtype=np.float32) - self._native_dim = float_embeddings.shape[1] - - # Expand to target dimension if needed - if self._expand_dim and self._native_dim < self.TARGET_DIM: - expansion = self._get_expansion_matrix(self._native_dim) - float_embeddings = float_embeddings @ expansion - - return float_embeddings - - -# ============================================================================= -# Cascade Embedding Backend -# ============================================================================= - - -class CascadeEmbeddingBackend(BaseEmbedder): - """Combined binary + dense embedding backend for cascade retrieval. - - Generates both binary (for fast coarse filtering) and dense (for precise - reranking) embeddings in a single pass, optimized for two-stage retrieval. - - Cascade workflow: - 1. encode_cascade() returns (binary_embeddings, dense_embeddings) - 2. Binary search: Use Hamming distance on binary vectors -> top-K candidates - 3. Dense rerank: Use cosine similarity on dense vectors -> final results - - Memory efficiency: - - Binary: 32 bytes per vector (256 bits) - - Dense: 8192 bytes per vector (2048 x float32) - - Total: ~8KB per document for full cascade support - """ - - def __init__( - self, - binary_model: Optional[str] = None, - dense_model: Optional[str] = None, - use_gpu: bool = True, - ) -> None: - """Initialize cascade embedding backend. - - Args: - binary_model: Model for binary embeddings. Defaults to BAAI/bge-small-en-v1.5 - dense_model: Model for dense embeddings. 
Defaults to BAAI/bge-large-en-v1.5 - use_gpu: Whether to use GPU acceleration - """ - self._binary_backend = BinaryEmbeddingBackend( - model_name=binary_model, - use_gpu=use_gpu, - ) - self._dense_backend = DenseEmbeddingBackend( - model_name=dense_model, - use_gpu=use_gpu, - expand_dim=True, - ) - self._use_gpu = use_gpu - - @property - def model_name(self) -> str: - """Return model names for both backends.""" - return f"cascade({self._binary_backend.model_name}, {self._dense_backend.model_name})" - - @property - def embedding_dim(self) -> int: - """Return dense embedding dimension (for compatibility).""" - return self._dense_backend.embedding_dim - - @property - def binary_dim(self) -> int: - """Return binary embedding dimension.""" - return self._binary_backend.embedding_dim - - @property - def dense_dim(self) -> int: - """Return dense embedding dimension.""" - return self._dense_backend.embedding_dim - - def embed_to_numpy(self, texts: str | Iterable[str]) -> np.ndarray: - """Generate dense embeddings (for BaseEmbedder compatibility). - - For cascade embeddings, use encode_cascade() instead. - - Args: - texts: Single text or iterable of texts - - Returns: - Dense embeddings of shape (n_texts, dense_dim) - """ - return self._dense_backend.embed_to_numpy(texts) - - def encode_cascade( - self, - texts: str | Iterable[str], - batch_size: int = 32, - ) -> Tuple[np.ndarray, np.ndarray]: - """Generate both binary and dense embeddings. 
- - Args: - texts: Single text or iterable of texts - batch_size: Batch size for processing - - Returns: - Tuple of: - - binary_embeddings: Shape (n_texts, 256), uint8 values 0/1 - - dense_embeddings: Shape (n_texts, 2048), float32 - """ - if isinstance(texts, str): - texts = [texts] - else: - texts = list(texts) - - binary_embeddings = self._binary_backend.embed_to_numpy(texts) - dense_embeddings = self._dense_backend.embed_to_numpy(texts) - - return binary_embeddings, dense_embeddings - - def encode_binary(self, texts: str | Iterable[str]) -> np.ndarray: - """Generate only binary embeddings. - - Args: - texts: Single text or iterable of texts - - Returns: - Binary embeddings of shape (n_texts, 256) - """ - return self._binary_backend.embed_to_numpy(texts) - - def encode_dense(self, texts: str | Iterable[str]) -> np.ndarray: - """Generate only dense embeddings. - - Args: - texts: Single text or iterable of texts - - Returns: - Dense embeddings of shape (n_texts, 2048) - """ - return self._dense_backend.embed_to_numpy(texts) - - def encode_binary_packed(self, texts: str | Iterable[str]) -> List[bytes]: - """Generate packed binary embeddings. - - Args: - texts: Single text or iterable of texts - - Returns: - List of packed bytes (32 bytes each) - """ - return self._binary_backend.embed_packed(texts) - - -# ============================================================================= -# Factory Function -# ============================================================================= - - -def get_cascade_embedder( - binary_model: Optional[str] = None, - dense_model: Optional[str] = None, - use_gpu: bool = True, -) -> CascadeEmbeddingBackend: - """Factory function to create a cascade embedder. 
- - Args: - binary_model: Model for binary embeddings (default: BAAI/bge-small-en-v1.5) - dense_model: Model for dense embeddings (default: BAAI/bge-large-en-v1.5) - use_gpu: Whether to use GPU acceleration - - Returns: - Configured CascadeEmbeddingBackend instance - - Example: - >>> embedder = get_cascade_embedder() - >>> binary, dense = embedder.encode_cascade(["hello world"]) - >>> binary.shape # (1, 256) - >>> dense.shape # (1, 2048) - """ - return CascadeEmbeddingBackend( - binary_model=binary_model, - dense_model=dense_model, - use_gpu=use_gpu, - ) diff --git a/codex-lens/src/codexlens/indexing/symbol_extractor.py b/codex-lens/src/codexlens/indexing/symbol_extractor.py deleted file mode 100644 index 45439e7b..00000000 --- a/codex-lens/src/codexlens/indexing/symbol_extractor.py +++ /dev/null @@ -1,277 +0,0 @@ -"""Symbol and relationship extraction from source code.""" -import re -import sqlite3 -from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple - -try: - from codexlens.parsers.treesitter_parser import TreeSitterSymbolParser -except Exception: # pragma: no cover - optional dependency / platform variance - TreeSitterSymbolParser = None # type: ignore[assignment] - - -class SymbolExtractor: - """Extract symbols and relationships from source code using regex patterns.""" - - # Pattern definitions for different languages - PATTERNS = { - 'python': { - 'function': r'^(?:async\s+)?def\s+(\w+)\s*\(', - 'class': r'^class\s+(\w+)\s*[:\(]', - 'import': r'^(?:from\s+([\w.]+)\s+)?import\s+([\w.,\s]+)', - 'call': r'(? 
None: - """Connect to database and ensure schema exists.""" - self.db_conn = sqlite3.connect(str(self.db_path)) - self._ensure_tables() - - def __enter__(self) -> "SymbolExtractor": - """Context manager entry: connect to database.""" - self.connect() - return self - - def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: - """Context manager exit: close database connection.""" - self.close() - - def _ensure_tables(self) -> None: - """Create symbols and relationships tables if they don't exist.""" - if not self.db_conn: - return - cursor = self.db_conn.cursor() - - # Create symbols table with qualified_name - cursor.execute(''' - CREATE TABLE IF NOT EXISTS symbols ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - qualified_name TEXT NOT NULL, - name TEXT NOT NULL, - kind TEXT NOT NULL, - file_path TEXT NOT NULL, - start_line INTEGER NOT NULL, - end_line INTEGER NOT NULL, - UNIQUE(file_path, name, start_line) - ) - ''') - - # Create relationships table with target_symbol_fqn - cursor.execute(''' - CREATE TABLE IF NOT EXISTS symbol_relationships ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - source_symbol_id INTEGER NOT NULL, - target_symbol_fqn TEXT NOT NULL, - relationship_type TEXT NOT NULL, - file_path TEXT NOT NULL, - line INTEGER, - FOREIGN KEY (source_symbol_id) REFERENCES symbols(id) ON DELETE CASCADE - ) - ''') - - # Create performance indexes - cursor.execute('CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name)') - cursor.execute('CREATE INDEX IF NOT EXISTS idx_symbols_file ON symbols(file_path)') - cursor.execute('CREATE INDEX IF NOT EXISTS idx_rel_source ON symbol_relationships(source_symbol_id)') - cursor.execute('CREATE INDEX IF NOT EXISTS idx_rel_target ON symbol_relationships(target_symbol_fqn)') - cursor.execute('CREATE INDEX IF NOT EXISTS idx_rel_type ON symbol_relationships(relationship_type)') - - self.db_conn.commit() - - def extract_from_file(self, file_path: Path, content: str) -> Tuple[List[Dict], List[Dict]]: - """Extract 
symbols and relationships from file content. - - Args: - file_path: Path to the source file - content: File content as string - - Returns: - Tuple of (symbols, relationships) where: - - symbols: List of symbol dicts with qualified_name, name, kind, file_path, start_line, end_line - - relationships: List of relationship dicts with source_scope, target, type, file_path, line - """ - ext = file_path.suffix.lower() - lang = self.LANGUAGE_MAP.get(ext) - - if not lang or lang not in self.PATTERNS: - return [], [] - - patterns = self.PATTERNS[lang] - symbols = [] - relationships: List[Dict] = [] - lines = content.split('\n') - - current_scope = None - - for line_num, line in enumerate(lines, 1): - # Extract function/class definitions - for kind in ['function', 'class']: - if kind in patterns: - match = re.search(patterns[kind], line) - if match: - name = match.group(1) - qualified_name = f"{file_path.stem}.{name}" - symbols.append({ - 'qualified_name': qualified_name, - 'name': name, - 'kind': kind, - 'file_path': str(file_path), - 'start_line': line_num, - 'end_line': line_num, # Simplified - would need proper parsing for actual end - }) - current_scope = name - - if TreeSitterSymbolParser is not None: - try: - ts_parser = TreeSitterSymbolParser(lang, file_path) - if ts_parser.is_available(): - indexed = ts_parser.parse(content, file_path) - if indexed is not None and indexed.relationships: - relationships = [ - { - "source_scope": r.source_symbol, - "target": r.target_symbol, - "type": r.relationship_type.value, - "file_path": str(file_path), - "line": r.source_line, - } - for r in indexed.relationships - ] - except Exception: - relationships = [] - - # Regex fallback for relationships (when tree-sitter is unavailable) - if not relationships: - current_scope = None - for line_num, line in enumerate(lines, 1): - for kind in ['function', 'class']: - if kind in patterns: - match = re.search(patterns[kind], line) - if match: - current_scope = match.group(1) - - # Extract 
imports - if 'import' in patterns: - match = re.search(patterns['import'], line) - if match: - import_target = match.group(1) or match.group(2) if match.lastindex >= 2 else match.group(1) - if import_target and current_scope: - relationships.append({ - 'source_scope': current_scope, - 'target': import_target.strip(), - 'type': 'imports', - 'file_path': str(file_path), - 'line': line_num, - }) - - # Extract function calls (simplified) - if 'call' in patterns and current_scope: - for match in re.finditer(patterns['call'], line): - call_name = match.group(1) - # Skip common keywords and the current function - if call_name not in ['if', 'for', 'while', 'return', 'print', 'len', 'str', 'int', 'float', 'list', 'dict', 'set', 'tuple', current_scope]: - relationships.append({ - 'source_scope': current_scope, - 'target': call_name, - 'type': 'calls', - 'file_path': str(file_path), - 'line': line_num, - }) - - return symbols, relationships - - def save_symbols(self, symbols: List[Dict]) -> Dict[str, int]: - """Save symbols to database and return name->id mapping. - - Args: - symbols: List of symbol dicts with qualified_name, name, kind, file_path, start_line, end_line - - Returns: - Dictionary mapping symbol name to database id - """ - if not self.db_conn or not symbols: - return {} - - cursor = self.db_conn.cursor() - name_to_id = {} - - for sym in symbols: - try: - cursor.execute(''' - INSERT OR IGNORE INTO symbols - (qualified_name, name, kind, file_path, start_line, end_line) - VALUES (?, ?, ?, ?, ?, ?) - ''', (sym['qualified_name'], sym['name'], sym['kind'], - sym['file_path'], sym['start_line'], sym['end_line'])) - - # Get the id - cursor.execute(''' - SELECT id FROM symbols - WHERE file_path = ? AND name = ? AND start_line = ? 
- ''', (sym['file_path'], sym['name'], sym['start_line'])) - - row = cursor.fetchone() - if row: - name_to_id[sym['name']] = row[0] - except sqlite3.Error: - continue - - self.db_conn.commit() - return name_to_id - - def save_relationships(self, relationships: List[Dict], name_to_id: Dict[str, int]) -> None: - """Save relationships to database. - - Args: - relationships: List of relationship dicts with source_scope, target, type, file_path, line - name_to_id: Dictionary mapping symbol names to database ids - """ - if not self.db_conn or not relationships: - return - - cursor = self.db_conn.cursor() - - for rel in relationships: - source_id = name_to_id.get(rel['source_scope']) - if source_id: - try: - cursor.execute(''' - INSERT INTO symbol_relationships - (source_symbol_id, target_symbol_fqn, relationship_type, file_path, line) - VALUES (?, ?, ?, ?, ?) - ''', (source_id, rel['target'], rel['type'], rel['file_path'], rel['line'])) - except sqlite3.Error: - continue - - self.db_conn.commit() - - def close(self) -> None: - """Close database connection.""" - if self.db_conn: - self.db_conn.close() - self.db_conn = None diff --git a/codex-lens/src/codexlens/lsp/__init__.py b/codex-lens/src/codexlens/lsp/__init__.py deleted file mode 100644 index e2c851e2..00000000 --- a/codex-lens/src/codexlens/lsp/__init__.py +++ /dev/null @@ -1,34 +0,0 @@ -"""LSP module for real-time language server integration. - -This module provides: -- LspBridge: HTTP bridge to VSCode language servers -- LspGraphBuilder: Build code association graphs via LSP -- Location: Position in a source file - -Example: - >>> from codexlens.lsp import LspBridge, LspGraphBuilder - >>> - >>> async with LspBridge() as bridge: - ... refs = await bridge.get_references(symbol) - ... 
graph = await LspGraphBuilder().build_from_seeds(seeds, bridge) -""" - -from codexlens.lsp.lsp_bridge import ( - CacheEntry, - Location, - LspBridge, -) -from codexlens.lsp.lsp_graph_builder import ( - LspGraphBuilder, -) - -# Alias for backward compatibility -GraphBuilder = LspGraphBuilder - -__all__ = [ - "CacheEntry", - "GraphBuilder", - "Location", - "LspBridge", - "LspGraphBuilder", -] diff --git a/codex-lens/src/codexlens/lsp/handlers.py b/codex-lens/src/codexlens/lsp/handlers.py deleted file mode 100644 index 3fb17e40..00000000 --- a/codex-lens/src/codexlens/lsp/handlers.py +++ /dev/null @@ -1,551 +0,0 @@ -"""LSP request handlers for codex-lens. - -This module contains handlers for LSP requests: -- textDocument/definition -- textDocument/completion -- workspace/symbol -- textDocument/didSave -- textDocument/hover -""" - -from __future__ import annotations - -import logging -import re -from pathlib import Path -from typing import List, Optional, Union -from urllib.parse import quote, unquote - -try: - from lsprotocol import types as lsp -except ImportError as exc: - raise ImportError( - "LSP dependencies not installed. 
Install with: pip install codex-lens[lsp]" - ) from exc - -from codexlens.entities import Symbol -from codexlens.lsp.server import server - -logger = logging.getLogger(__name__) - -# Symbol kind mapping from codex-lens to LSP -SYMBOL_KIND_MAP = { - "class": lsp.SymbolKind.Class, - "function": lsp.SymbolKind.Function, - "method": lsp.SymbolKind.Method, - "variable": lsp.SymbolKind.Variable, - "constant": lsp.SymbolKind.Constant, - "property": lsp.SymbolKind.Property, - "field": lsp.SymbolKind.Field, - "interface": lsp.SymbolKind.Interface, - "module": lsp.SymbolKind.Module, - "namespace": lsp.SymbolKind.Namespace, - "package": lsp.SymbolKind.Package, - "enum": lsp.SymbolKind.Enum, - "enum_member": lsp.SymbolKind.EnumMember, - "struct": lsp.SymbolKind.Struct, - "type": lsp.SymbolKind.TypeParameter, - "type_alias": lsp.SymbolKind.TypeParameter, -} - -# Completion kind mapping from codex-lens to LSP -COMPLETION_KIND_MAP = { - "class": lsp.CompletionItemKind.Class, - "function": lsp.CompletionItemKind.Function, - "method": lsp.CompletionItemKind.Method, - "variable": lsp.CompletionItemKind.Variable, - "constant": lsp.CompletionItemKind.Constant, - "property": lsp.CompletionItemKind.Property, - "field": lsp.CompletionItemKind.Field, - "interface": lsp.CompletionItemKind.Interface, - "module": lsp.CompletionItemKind.Module, - "enum": lsp.CompletionItemKind.Enum, - "enum_member": lsp.CompletionItemKind.EnumMember, - "struct": lsp.CompletionItemKind.Struct, - "type": lsp.CompletionItemKind.TypeParameter, - "type_alias": lsp.CompletionItemKind.TypeParameter, -} - - -def _path_to_uri(path: Union[str, Path]) -> str: - """Convert a file path to a URI. 
- - Args: - path: File path (string or Path object) - - Returns: - File URI string - """ - path_str = str(Path(path).resolve()) - # Handle Windows paths - if path_str.startswith("/"): - return f"file://{quote(path_str)}" - else: - return f"file:///{quote(path_str.replace(chr(92), '/'))}" - - -def _uri_to_path(uri: str) -> Path: - """Convert a URI to a file path. - - Args: - uri: File URI string - - Returns: - Path object - """ - path = uri.replace("file:///", "").replace("file://", "") - return Path(unquote(path)) - - -def _get_word_at_position(document_text: str, line: int, character: int) -> Optional[str]: - """Extract the word at the given position in the document. - - Args: - document_text: Full document text - line: 0-based line number - character: 0-based character position - - Returns: - Word at position, or None if no word found - """ - lines = document_text.splitlines() - if line >= len(lines): - return None - - line_text = lines[line] - if character > len(line_text): - return None - - # Find word boundaries - word_pattern = re.compile(r"[a-zA-Z_][a-zA-Z0-9_]*") - for match in word_pattern.finditer(line_text): - if match.start() <= character <= match.end(): - return match.group() - - return None - - -def _get_prefix_at_position(document_text: str, line: int, character: int) -> str: - """Extract the incomplete word prefix at the given position. 
- - Args: - document_text: Full document text - line: 0-based line number - character: 0-based character position - - Returns: - Prefix string (may be empty) - """ - lines = document_text.splitlines() - if line >= len(lines): - return "" - - line_text = lines[line] - if character > len(line_text): - character = len(line_text) - - # Extract text before cursor - before_cursor = line_text[:character] - - # Find the start of the current word - match = re.search(r"[a-zA-Z_][a-zA-Z0-9_]*$", before_cursor) - if match: - return match.group() - - return "" - - -def symbol_to_location(symbol: Symbol) -> Optional[lsp.Location]: - """Convert a codex-lens Symbol to an LSP Location. - - Args: - symbol: codex-lens Symbol object - - Returns: - LSP Location, or None if symbol has no file - """ - if not symbol.file: - return None - - # LSP uses 0-based lines, codex-lens uses 1-based - start_line = max(0, symbol.range[0] - 1) - end_line = max(0, symbol.range[1] - 1) - - return lsp.Location( - uri=_path_to_uri(symbol.file), - range=lsp.Range( - start=lsp.Position(line=start_line, character=0), - end=lsp.Position(line=end_line, character=0), - ), - ) - - -def _symbol_kind_to_lsp(kind: str) -> lsp.SymbolKind: - """Map codex-lens symbol kind to LSP SymbolKind. - - Args: - kind: codex-lens symbol kind string - - Returns: - LSP SymbolKind - """ - return SYMBOL_KIND_MAP.get(kind.lower(), lsp.SymbolKind.Variable) - - -def _symbol_kind_to_completion_kind(kind: str) -> lsp.CompletionItemKind: - """Map codex-lens symbol kind to LSP CompletionItemKind. 
- - Args: - kind: codex-lens symbol kind string - - Returns: - LSP CompletionItemKind - """ - return COMPLETION_KIND_MAP.get(kind.lower(), lsp.CompletionItemKind.Text) - - -# ----------------------------------------------------------------------------- -# LSP Request Handlers -# ----------------------------------------------------------------------------- - - -@server.feature(lsp.TEXT_DOCUMENT_DEFINITION) -def lsp_definition( - params: lsp.DefinitionParams, -) -> Optional[Union[lsp.Location, List[lsp.Location]]]: - """Handle textDocument/definition request. - - Finds the definition of the symbol at the cursor position. - """ - if not server.global_index: - logger.debug("No global index available for definition lookup") - return None - - # Get document - document = server.workspace.get_text_document(params.text_document.uri) - if not document: - return None - - # Get word at position - word = _get_word_at_position( - document.source, - params.position.line, - params.position.character, - ) - - if not word: - logger.debug("No word found at position") - return None - - logger.debug("Looking up definition for: %s", word) - - # Search for exact symbol match - try: - symbols = server.global_index.search( - name=word, - limit=10, - prefix_mode=False, # Exact match preferred - ) - - # Filter for exact name match - exact_matches = [s for s in symbols if s.name == word] - if not exact_matches: - # Fall back to prefix search - symbols = server.global_index.search( - name=word, - limit=10, - prefix_mode=True, - ) - exact_matches = [s for s in symbols if s.name == word] - - if not exact_matches: - logger.debug("No definition found for: %s", word) - return None - - # Convert to LSP locations - locations = [] - for sym in exact_matches: - loc = symbol_to_location(sym) - if loc: - locations.append(loc) - - if len(locations) == 1: - return locations[0] - elif locations: - return locations - else: - return None - - except Exception as exc: - logger.error("Error looking up 
definition: %s", exc) - return None - - -@server.feature(lsp.TEXT_DOCUMENT_REFERENCES) -def lsp_references(params: lsp.ReferenceParams) -> Optional[List[lsp.Location]]: - """Handle textDocument/references request. - - Finds all references to the symbol at the cursor position using - the code_relationships table for accurate call-site tracking. - Falls back to same-name symbol search if search_engine is unavailable. - """ - document = server.workspace.get_text_document(params.text_document.uri) - if not document: - return None - - word = _get_word_at_position( - document.source, - params.position.line, - params.position.character, - ) - - if not word: - return None - - logger.debug("Finding references for: %s", word) - - try: - # Try using search_engine.search_references() for accurate reference tracking - if server.search_engine and server.workspace_root: - references = server.search_engine.search_references( - symbol_name=word, - source_path=server.workspace_root, - limit=200, - ) - - if references: - locations = [] - for ref in references: - locations.append( - lsp.Location( - uri=_path_to_uri(ref.file_path), - range=lsp.Range( - start=lsp.Position( - line=max(0, ref.line - 1), - character=ref.column, - ), - end=lsp.Position( - line=max(0, ref.line - 1), - character=ref.column + len(word), - ), - ), - ) - ) - return locations if locations else None - - # Fallback: search for symbols with same name using global_index - if server.global_index: - symbols = server.global_index.search( - name=word, - limit=100, - prefix_mode=False, - ) - - # Filter for exact matches - exact_matches = [s for s in symbols if s.name == word] - - locations = [] - for sym in exact_matches: - loc = symbol_to_location(sym) - if loc: - locations.append(loc) - - return locations if locations else None - - return None - - except Exception as exc: - logger.error("Error finding references: %s", exc) - return None - - -@server.feature(lsp.TEXT_DOCUMENT_COMPLETION) -def lsp_completion(params: 
lsp.CompletionParams) -> Optional[lsp.CompletionList]: - """Handle textDocument/completion request. - - Provides code completion suggestions based on indexed symbols. - """ - if not server.global_index: - return None - - document = server.workspace.get_text_document(params.text_document.uri) - if not document: - return None - - prefix = _get_prefix_at_position( - document.source, - params.position.line, - params.position.character, - ) - - if not prefix or len(prefix) < 2: - # Require at least 2 characters for completion - return None - - logger.debug("Completing prefix: %s", prefix) - - try: - symbols = server.global_index.search( - name=prefix, - limit=50, - prefix_mode=True, - ) - - if not symbols: - return None - - # Convert to completion items - items = [] - seen_names = set() - - for sym in symbols: - if sym.name in seen_names: - continue - seen_names.add(sym.name) - - items.append( - lsp.CompletionItem( - label=sym.name, - kind=_symbol_kind_to_completion_kind(sym.kind), - detail=f"{sym.kind} - {Path(sym.file).name if sym.file else 'unknown'}", - sort_text=sym.name.lower(), - ) - ) - - return lsp.CompletionList( - is_incomplete=len(symbols) >= 50, - items=items, - ) - - except Exception as exc: - logger.error("Error getting completions: %s", exc) - return None - - -@server.feature(lsp.TEXT_DOCUMENT_HOVER) -def lsp_hover(params: lsp.HoverParams) -> Optional[lsp.Hover]: - """Handle textDocument/hover request. - - Provides hover information for the symbol at the cursor position - using HoverProvider for rich symbol information including - signature, documentation, and location. 
- """ - if not server.global_index: - return None - - document = server.workspace.get_text_document(params.text_document.uri) - if not document: - return None - - word = _get_word_at_position( - document.source, - params.position.line, - params.position.character, - ) - - if not word: - return None - - logger.debug("Hover for: %s", word) - - try: - # Use HoverProvider for rich symbol information - from codexlens.lsp.providers import HoverProvider - - provider = HoverProvider(server.global_index, server.registry) - info = provider.get_hover_info(word) - - if not info: - return None - - # Format as markdown with signature and location - content = provider.format_hover_markdown(info) - - return lsp.Hover( - contents=lsp.MarkupContent( - kind=lsp.MarkupKind.Markdown, - value=content, - ), - ) - - except Exception as exc: - logger.error("Error getting hover info: %s", exc) - return None - - -@server.feature(lsp.WORKSPACE_SYMBOL) -def lsp_workspace_symbol( - params: lsp.WorkspaceSymbolParams, -) -> Optional[List[lsp.SymbolInformation]]: - """Handle workspace/symbol request. - - Searches for symbols across the workspace. 
- """ - if not server.global_index: - return None - - query = params.query - if not query or len(query) < 2: - return None - - logger.debug("Workspace symbol search: %s", query) - - try: - symbols = server.global_index.search( - name=query, - limit=100, - prefix_mode=True, - ) - - if not symbols: - return None - - result = [] - for sym in symbols: - loc = symbol_to_location(sym) - if loc: - result.append( - lsp.SymbolInformation( - name=sym.name, - kind=_symbol_kind_to_lsp(sym.kind), - location=loc, - container_name=Path(sym.file).parent.name if sym.file else None, - ) - ) - - return result if result else None - - except Exception as exc: - logger.error("Error searching workspace symbols: %s", exc) - return None - - -@server.feature(lsp.TEXT_DOCUMENT_DID_SAVE) -def lsp_did_save(params: lsp.DidSaveTextDocumentParams) -> None: - """Handle textDocument/didSave notification. - - Triggers incremental re-indexing of the saved file. - Note: Full incremental indexing requires WatcherManager integration, - which is planned for Phase 2. 
- """ - file_path = _uri_to_path(params.text_document.uri) - logger.info("File saved: %s", file_path) - - # Phase 1: Just log the save event - # Phase 2 will integrate with WatcherManager for incremental indexing - # if server.watcher_manager: - # server.watcher_manager.trigger_reindex(file_path) - - -@server.feature(lsp.TEXT_DOCUMENT_DID_OPEN) -def lsp_did_open(params: lsp.DidOpenTextDocumentParams) -> None: - """Handle textDocument/didOpen notification.""" - file_path = _uri_to_path(params.text_document.uri) - logger.debug("File opened: %s", file_path) - - -@server.feature(lsp.TEXT_DOCUMENT_DID_CLOSE) -def lsp_did_close(params: lsp.DidCloseTextDocumentParams) -> None: - """Handle textDocument/didClose notification.""" - file_path = _uri_to_path(params.text_document.uri) - logger.debug("File closed: %s", file_path) diff --git a/codex-lens/src/codexlens/lsp/keepalive_bridge.py b/codex-lens/src/codexlens/lsp/keepalive_bridge.py deleted file mode 100644 index a6d3f819..00000000 --- a/codex-lens/src/codexlens/lsp/keepalive_bridge.py +++ /dev/null @@ -1,135 +0,0 @@ -"""Keep-alive wrapper for Standalone LSP servers in synchronous workflows. - -The staged realtime pipeline calls into LSP from synchronous code paths. -Creating a fresh asyncio loop per query (via asyncio.run) forces language -servers to start/stop every time, which is slow and can trigger shutdown -timeouts on Windows. - -This module runs an asyncio event loop in a background thread and keeps a -single LspBridge (and its StandaloneLspManager + subprocesses) alive across -multiple queries. Callers submit coroutines that operate on the shared bridge. 
-""" - -from __future__ import annotations - -import atexit -import asyncio -import threading -from dataclasses import dataclass -from typing import Awaitable, Callable, Optional, TypeVar - -from codexlens.lsp.lsp_bridge import LspBridge - -T = TypeVar("T") - - -@dataclass(frozen=True) -class KeepAliveKey: - workspace_root: str - config_file: Optional[str] - timeout: float - - -class KeepAliveLspBridge: - """Runs a shared LspBridge on a dedicated event loop thread.""" - - def __init__(self, *, workspace_root: str, config_file: Optional[str], timeout: float) -> None: - self._key = KeepAliveKey(workspace_root=workspace_root, config_file=config_file, timeout=float(timeout)) - self._lock = threading.RLock() - self._call_lock = threading.RLock() - self._ready = threading.Event() - self._thread: Optional[threading.Thread] = None - self._loop: Optional[asyncio.AbstractEventLoop] = None - self._bridge: Optional[LspBridge] = None - self._stopped = False - - atexit.register(self.stop) - - @property - def key(self) -> KeepAliveKey: - return self._key - - def start(self) -> None: - with self._lock: - if self._stopped: - raise RuntimeError("KeepAliveLspBridge is stopped") - if self._thread is not None and self._thread.is_alive(): - return - - self._ready.clear() - thread = threading.Thread(target=self._run, name="codexlens-lsp-keepalive", daemon=True) - self._thread = thread - thread.start() - - if not self._ready.wait(timeout=10.0): - raise RuntimeError("Timed out starting LSP keep-alive loop") - - def stop(self) -> None: - with self._lock: - if self._stopped: - return - self._stopped = True - loop = self._loop - bridge = self._bridge - thread = self._thread - - if loop is not None and bridge is not None: - try: - fut = asyncio.run_coroutine_threadsafe(bridge.close(), loop) - fut.result(timeout=5.0) - except Exception: - pass - try: - loop.call_soon_threadsafe(loop.stop) - except Exception: - pass - - if thread is not None: - try: - thread.join(timeout=5.0) - except Exception: 
- pass - - def run(self, fn: Callable[[LspBridge], Awaitable[T]], *, timeout: Optional[float] = None) -> T: - """Run an async function against the shared LspBridge and return its result.""" - self.start() - loop = self._loop - bridge = self._bridge - if loop is None or bridge is None: - raise RuntimeError("Keep-alive loop not initialized") - - async def _call() -> T: - return await fn(bridge) - - # Serialize bridge usage to avoid overlapping LSP request storms. - with self._call_lock: - fut = asyncio.run_coroutine_threadsafe(_call(), loop) - return fut.result(timeout=float(timeout or self._key.timeout) + 1.0) - - def _run(self) -> None: - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - bridge = LspBridge( - workspace_root=self._key.workspace_root, - config_file=self._key.config_file, - timeout=self._key.timeout, - ) - - with self._lock: - self._loop = loop - self._bridge = bridge - self._ready.set() - - try: - loop.run_forever() - finally: - try: - if self._bridge is not None: - loop.run_until_complete(self._bridge.close()) - except Exception: - pass - try: - loop.close() - except Exception: - pass - diff --git a/codex-lens/src/codexlens/lsp/lsp-servers.json b/codex-lens/src/codexlens/lsp/lsp-servers.json deleted file mode 100644 index bfc21fb9..00000000 --- a/codex-lens/src/codexlens/lsp/lsp-servers.json +++ /dev/null @@ -1,88 +0,0 @@ -{ - "$schema": "https://json-schema.org/draft/2020-12/schema", - "version": "1.0.0", - "description": "Default language server configuration for codex-lens standalone LSP client", - "servers": [ - { - "languageId": "python", - "displayName": "Pyright", - "extensions": ["py", "pyi"], - "command": ["pyright-langserver", "--stdio"], - "enabled": true, - "initializationOptions": { - "pythonPath": "", - "pythonPlatform": "", - "pythonVersion": "3.13" - }, - "settings": { - "python.analysis": { - "typeCheckingMode": "standard", - "diagnosticMode": "workspace", - "exclude": ["**/node_modules", "**/__pycache__", "build", 
"dist"], - "include": ["src/**", "tests/**"], - "stubPath": "typings" - } - } - }, - { - "languageId": "typescript", - "displayName": "TypeScript Language Server", - "extensions": ["ts", "tsx"], - "command": ["typescript-language-server", "--stdio"], - "enabled": true, - "initializationOptions": {}, - "settings": {} - }, - { - "languageId": "javascript", - "displayName": "TypeScript Language Server (for JS)", - "extensions": ["js", "jsx", "mjs", "cjs"], - "command": ["typescript-language-server", "--stdio"], - "enabled": true, - "initializationOptions": {}, - "settings": {} - }, - { - "languageId": "go", - "displayName": "Gopls", - "extensions": ["go"], - "command": ["gopls", "serve"], - "enabled": true, - "initializationOptions": {}, - "settings": {} - }, - { - "languageId": "rust", - "displayName": "Rust Analyzer", - "extensions": ["rs"], - "command": ["rust-analyzer"], - "enabled": false, - "initializationOptions": {}, - "settings": {} - }, - { - "languageId": "c", - "displayName": "Clangd", - "extensions": ["c", "h"], - "command": ["clangd"], - "enabled": false, - "initializationOptions": {}, - "settings": {} - }, - { - "languageId": "cpp", - "displayName": "Clangd", - "extensions": ["cpp", "hpp", "cc", "cxx"], - "command": ["clangd"], - "enabled": false, - "initializationOptions": {}, - "settings": {} - } - ], - "defaults": { - "rootDir": ".", - "timeout": 30000, - "restartInterval": 5000, - "maxRestarts": 3 - } -} diff --git a/codex-lens/src/codexlens/lsp/lsp_bridge.py b/codex-lens/src/codexlens/lsp/lsp_bridge.py deleted file mode 100644 index 63b30830..00000000 --- a/codex-lens/src/codexlens/lsp/lsp_bridge.py +++ /dev/null @@ -1,857 +0,0 @@ -"""LspBridge service for real-time LSP communication with caching. - -This module provides a bridge to communicate with language servers either via: -1. Standalone LSP Manager (direct subprocess communication - default) -2. 
VSCode Bridge extension (HTTP-based, legacy mode) - -Features: -- Direct communication with language servers (no VSCode dependency) -- Cache with TTL and file modification time invalidation -- Graceful error handling with empty results on failure -- Support for definition, references, hover, and call hierarchy -""" - -from __future__ import annotations - -import asyncio -import logging -import os -import time -from collections import OrderedDict -from dataclasses import dataclass, field -from pathlib import Path -from typing import Any, Dict, List, Optional, TYPE_CHECKING -from urllib.parse import unquote - -logger = logging.getLogger(__name__) - -if TYPE_CHECKING: - from codexlens.lsp.standalone_manager import StandaloneLspManager - -# Check for optional dependencies -try: - import aiohttp - HAS_AIOHTTP = True -except ImportError: - HAS_AIOHTTP = False - -from codexlens.hybrid_search.data_structures import ( - CallHierarchyItem, - CodeSymbolNode, - Range, -) - - -@dataclass -class Location: - """A location in a source file (LSP response format).""" - - file_path: str - line: int - character: int - - def to_dict(self) -> Dict[str, Any]: - """Convert to dictionary format.""" - return { - "file_path": self.file_path, - "line": self.line, - "character": self.character, - } - - @classmethod - def from_lsp_response(cls, data: Dict[str, Any]) -> "Location": - """Create Location from LSP response format. - - Handles both direct format and VSCode URI format. - """ - # Handle VSCode URI format (file:///path/to/file) - uri = data.get("uri", data.get("file_path", "")) - if uri.startswith("file://"): - # Strip scheme and decode percent-encoding (e.g. file:///d%3A/...). - # Keep behavior compatible with both Windows and Unix paths. - raw = unquote(uri[7:]) # keep leading slash for Unix paths - # Windows: file:///C:/... or file:///c%3A/... -> C:/... 
- if raw.startswith("/") and len(raw) > 2 and raw[2] == ":": - raw = raw[1:] - file_path = raw - else: - file_path = uri - - # Get position from range or direct fields - if "range" in data: - range_data = data["range"] - start = range_data.get("start", {}) - line = start.get("line", 0) + 1 # LSP is 0-based, convert to 1-based - character = start.get("character", 0) + 1 - else: - line = data.get("line", 1) - character = data.get("character", 1) - - return cls(file_path=file_path, line=line, character=character) - - -@dataclass -class CacheEntry: - """A cached LSP response with expiration metadata. - - Attributes: - data: The cached response data - file_mtime: File modification time when cached (for invalidation) - cached_at: Unix timestamp when entry was cached - """ - - data: Any - file_mtime: float - cached_at: float - - -class LspBridge: - """Bridge for real-time LSP communication with language servers. - - By default, uses StandaloneLspManager to directly spawn and communicate - with language servers via JSON-RPC over stdio. No VSCode dependency required. - - For legacy mode, can use VSCode Bridge HTTP server (set use_vscode_bridge=True). 
- - Features: - - Direct language server communication (default) - - Response caching with TTL and file modification invalidation - - Timeout handling - - Graceful error handling returning empty results - - Example: - # Default: standalone mode (no VSCode needed) - async with LspBridge() as bridge: - refs = await bridge.get_references(symbol) - definition = await bridge.get_definition(symbol) - - # Legacy: VSCode Bridge mode - async with LspBridge(use_vscode_bridge=True) as bridge: - refs = await bridge.get_references(symbol) - """ - - DEFAULT_BRIDGE_URL = "http://127.0.0.1:3457" - DEFAULT_TIMEOUT = 30.0 # seconds (increased for standalone mode) - DEFAULT_CACHE_TTL = 300 # 5 minutes - DEFAULT_MAX_CACHE_SIZE = 1000 # Maximum cache entries - - def __init__( - self, - bridge_url: str = DEFAULT_BRIDGE_URL, - timeout: float = DEFAULT_TIMEOUT, - cache_ttl: int = DEFAULT_CACHE_TTL, - max_cache_size: int = DEFAULT_MAX_CACHE_SIZE, - use_vscode_bridge: bool = False, - workspace_root: Optional[str] = None, - config_file: Optional[str] = None, - ): - """Initialize LspBridge. 
- - Args: - bridge_url: URL of the VSCode Bridge HTTP server (legacy mode only) - timeout: Request timeout in seconds - cache_ttl: Cache time-to-live in seconds - max_cache_size: Maximum number of cache entries (LRU eviction) - use_vscode_bridge: If True, use VSCode Bridge HTTP mode (requires aiohttp) - workspace_root: Root directory for standalone LSP manager - config_file: Path to lsp-servers.json configuration file - """ - self.bridge_url = bridge_url - self.timeout = timeout - self.cache_ttl = cache_ttl - self.max_cache_size = max_cache_size - self.use_vscode_bridge = use_vscode_bridge - self.workspace_root = workspace_root - self.config_file = config_file - - self.cache: OrderedDict[str, CacheEntry] = OrderedDict() - - # VSCode Bridge mode (legacy) - self._session: Optional["aiohttp.ClientSession"] = None - - # Standalone mode (default) - self._manager: Optional["StandaloneLspManager"] = None - self._manager_started = False - - # Validate dependencies - if use_vscode_bridge and not HAS_AIOHTTP: - raise ImportError( - "aiohttp is required for VSCode Bridge mode: pip install aiohttp" - ) - - async def _ensure_manager(self) -> "StandaloneLspManager": - """Ensure standalone LSP manager is started.""" - if self._manager is None: - from codexlens.lsp.standalone_manager import StandaloneLspManager - self._manager = StandaloneLspManager( - workspace_root=self.workspace_root, - config_file=self.config_file, - timeout=self.timeout, - ) - - if not self._manager_started: - await self._manager.start() - self._manager_started = True - - return self._manager - - async def _get_session(self) -> "aiohttp.ClientSession": - """Get or create the aiohttp session (VSCode Bridge mode only).""" - if not HAS_AIOHTTP: - raise ImportError("aiohttp required for VSCode Bridge mode") - - if self._session is None or self._session.closed: - timeout = aiohttp.ClientTimeout(total=self.timeout) - self._session = aiohttp.ClientSession(timeout=timeout) - return self._session - - async def 
close(self) -> None: - """Close connections and cleanup resources.""" - # Close VSCode Bridge session - if self._session and not self._session.closed: - await self._session.close() - self._session = None - - # Stop standalone manager - if self._manager and self._manager_started: - await self._manager.stop() - self._manager_started = False - - def _get_file_mtime(self, file_path: str) -> float: - """Get file modification time, or 0 if file doesn't exist.""" - try: - return os.path.getmtime(file_path) - except OSError: - return 0.0 - - def _is_cached(self, cache_key: str, file_path: str) -> bool: - """Check if cache entry is valid. - - Cache is invalid if: - - Entry doesn't exist - - TTL has expired - - File has been modified since caching - - Args: - cache_key: The cache key to check - file_path: Path to source file for mtime check - - Returns: - True if cache is valid and can be used - """ - if cache_key not in self.cache: - return False - - entry = self.cache[cache_key] - now = time.time() - - # Check TTL - if now - entry.cached_at > self.cache_ttl: - del self.cache[cache_key] - return False - - # Check file modification time - current_mtime = self._get_file_mtime(file_path) - if current_mtime != entry.file_mtime: - del self.cache[cache_key] - return False - - # Move to end on access (LRU behavior) - self.cache.move_to_end(cache_key) - return True - - def _cache(self, key: str, file_path: str, data: Any) -> None: - """Store data in cache with LRU eviction. 
- - Args: - key: Cache key - file_path: Path to source file (for mtime tracking) - data: Data to cache - """ - # Remove oldest entries if at capacity - while len(self.cache) >= self.max_cache_size: - self.cache.popitem(last=False) # Remove oldest (FIFO order) - - # Move to end if key exists (update access order) - if key in self.cache: - self.cache.move_to_end(key) - - self.cache[key] = CacheEntry( - data=data, - file_mtime=self._get_file_mtime(file_path), - cached_at=time.time(), - ) - - def clear_cache(self) -> None: - """Clear all cached entries.""" - self.cache.clear() - - async def _request_vscode_bridge(self, action: str, params: Dict[str, Any]) -> Any: - """Make HTTP request to VSCode Bridge (legacy mode). - - Args: - action: The endpoint/action name (e.g., "get_definition") - params: Request parameters - - Returns: - Response data on success, None on failure - """ - url = f"{self.bridge_url}/{action}" - - try: - session = await self._get_session() - async with session.post(url, json=params) as response: - if response.status != 200: - return None - - data = await response.json() - if data.get("success") is False: - return None - - return data.get("result") - - except asyncio.TimeoutError: - return None - except Exception: - return None - - async def get_references(self, symbol: CodeSymbolNode) -> List[Location]: - """Get all references to a symbol via real-time LSP. - - Args: - symbol: The code symbol to find references for - - Returns: - List of Location objects where the symbol is referenced. - Returns empty list on error or timeout. 
- """ - cache_key = f"refs:{symbol.id}" - - if self._is_cached(cache_key, symbol.file_path): - return self.cache[cache_key].data - - locations: List[Location] = [] - - if self.use_vscode_bridge: - # Legacy: VSCode Bridge HTTP mode - result = await self._request_vscode_bridge("get_references", { - "file_path": symbol.file_path, - "line": symbol.range.start_line, - "character": symbol.range.start_character, - }) - - # Don't cache on connection error (result is None) - if result is None: - return locations - - if isinstance(result, list): - for item in result: - try: - locations.append(Location.from_lsp_response(item)) - except (KeyError, TypeError): - continue - else: - # Default: Standalone mode - manager = await self._ensure_manager() - result = await manager.get_references( - file_path=symbol.file_path, - line=symbol.range.start_line, - character=symbol.range.start_character, - ) - - for item in result: - try: - locations.append(Location.from_lsp_response(item)) - except (KeyError, TypeError): - continue - - logger.debug( - "LSP references for %s (%s:%s:%s): %d", - symbol.id, - symbol.file_path, - symbol.range.start_line, - symbol.range.start_character, - len(locations), - ) - self._cache(cache_key, symbol.file_path, locations) - return locations - - async def get_definition(self, symbol: CodeSymbolNode) -> Optional[Location]: - """Get symbol definition location. 
- - Args: - symbol: The code symbol to find definition for - - Returns: - Location of the definition, or None if not found - """ - cache_key = f"def:{symbol.id}" - - if self._is_cached(cache_key, symbol.file_path): - return self.cache[cache_key].data - - location: Optional[Location] = None - - if self.use_vscode_bridge: - # Legacy: VSCode Bridge HTTP mode - result = await self._request_vscode_bridge("get_definition", { - "file_path": symbol.file_path, - "line": symbol.range.start_line, - "character": symbol.range.start_character, - }) - - if result: - if isinstance(result, list) and len(result) > 0: - try: - location = Location.from_lsp_response(result[0]) - except (KeyError, TypeError): - pass - elif isinstance(result, dict): - try: - location = Location.from_lsp_response(result) - except (KeyError, TypeError): - pass - else: - # Default: Standalone mode - manager = await self._ensure_manager() - result = await manager.get_definition( - file_path=symbol.file_path, - line=symbol.range.start_line, - character=symbol.range.start_character, - ) - - if result: - try: - location = Location.from_lsp_response(result) - except (KeyError, TypeError): - pass - - self._cache(cache_key, symbol.file_path, location) - return location - - async def get_call_hierarchy(self, symbol: CodeSymbolNode) -> List[CallHierarchyItem]: - """Get incoming/outgoing calls for a symbol. - - If call hierarchy is not supported by the language server, - falls back to using references. - - Args: - symbol: The code symbol to get call hierarchy for - - Returns: - List of CallHierarchyItem representing callers/callees. - Returns empty list on error or if not supported. 
- """ - cache_key = f"calls:{symbol.id}" - - if self._is_cached(cache_key, symbol.file_path): - return self.cache[cache_key].data - - items: List[CallHierarchyItem] = [] - - if self.use_vscode_bridge: - # Legacy: VSCode Bridge HTTP mode - result = await self._request_vscode_bridge("get_call_hierarchy", { - "file_path": symbol.file_path, - "line": symbol.range.start_line, - "character": symbol.range.start_character, - }) - - if result is None: - # Fallback: use references - refs = await self.get_references(symbol) - for ref in refs: - items.append(CallHierarchyItem( - name=f"caller@{ref.line}", - kind="reference", - file_path=ref.file_path, - range=Range( - start_line=ref.line, - start_character=ref.character, - end_line=ref.line, - end_character=ref.character, - ), - detail="Inferred from reference", - )) - elif isinstance(result, list): - for item in result: - try: - range_data = item.get("range", {}) - start = range_data.get("start", {}) - end = range_data.get("end", {}) - - items.append(CallHierarchyItem( - name=item.get("name", "unknown"), - kind=item.get("kind", "unknown"), - file_path=item.get("file_path", item.get("uri", "")), - range=Range( - start_line=start.get("line", 0) + 1, - start_character=start.get("character", 0) + 1, - end_line=end.get("line", 0) + 1, - end_character=end.get("character", 0) + 1, - ), - detail=item.get("detail"), - )) - except (KeyError, TypeError): - continue - else: - # Default: Standalone mode - manager = await self._ensure_manager() - - # Try to get call hierarchy items - hierarchy_items = await manager.get_call_hierarchy_items( - file_path=symbol.file_path, - line=symbol.range.start_line, - character=symbol.range.start_character, - ) - - if hierarchy_items: - # Get incoming calls for each item - for h_item in hierarchy_items: - incoming = await manager.get_incoming_calls(h_item) - for call in incoming: - from_item = call.get("from", {}) - range_data = from_item.get("range", {}) - start = range_data.get("start", {}) - end = 
range_data.get("end", {}) - - # Parse URI - uri = from_item.get("uri", "") - if uri.startswith("file://"): - raw = unquote(uri[7:]) # keep leading slash for Unix paths - if raw.startswith("/") and len(raw) > 2 and raw[2] == ":": - raw = raw[1:] - fp = raw - else: - fp = uri - - items.append(CallHierarchyItem( - name=from_item.get("name", "unknown"), - kind=str(from_item.get("kind", "unknown")), - file_path=fp, - range=Range( - start_line=start.get("line", 0) + 1, - start_character=start.get("character", 0) + 1, - end_line=end.get("line", 0) + 1, - end_character=end.get("character", 0) + 1, - ), - detail=from_item.get("detail"), - )) - else: - # Fallback: use references - refs = await self.get_references(symbol) - for ref in refs: - items.append(CallHierarchyItem( - name=f"caller@{ref.line}", - kind="reference", - file_path=ref.file_path, - range=Range( - start_line=ref.line, - start_character=ref.character, - end_line=ref.line, - end_character=ref.character, - ), - detail="Inferred from reference", - )) - - logger.debug( - "LSP call hierarchy for %s (%s:%s:%s): %d", - symbol.id, - symbol.file_path, - symbol.range.start_line, - symbol.range.start_character, - len(items), - ) - self._cache(cache_key, symbol.file_path, items) - return items - - async def get_document_symbols(self, file_path: str) -> List[Dict[str, Any]]: - """Get all symbols in a document (batch operation). - - This is more efficient than individual hover queries when processing - multiple locations in the same file. - - Args: - file_path: Path to the source file - - Returns: - List of symbol dictionaries with name, kind, range, etc. - Returns empty list on error or timeout. 
- """ - cache_key = f"symbols:{file_path}" - - if self._is_cached(cache_key, file_path): - return self.cache[cache_key].data - - symbols: List[Dict[str, Any]] = [] - - if self.use_vscode_bridge: - # Legacy: VSCode Bridge HTTP mode - result = await self._request_vscode_bridge("get_document_symbols", { - "file_path": file_path, - }) - - if isinstance(result, list): - symbols = self._flatten_document_symbols(result) - else: - # Default: Standalone mode - manager = await self._ensure_manager() - result = await manager.get_document_symbols(file_path) - - if result: - symbols = self._flatten_document_symbols(result) - - self._cache(cache_key, file_path, symbols) - return symbols - - def _flatten_document_symbols( - self, symbols: List[Dict[str, Any]], parent_name: str = "" - ) -> List[Dict[str, Any]]: - """Flatten nested document symbols into a flat list. - - Document symbols can be nested (e.g., methods inside classes). - This flattens them for easier lookup by line number. - - Args: - symbols: List of symbol dictionaries (may be nested) - parent_name: Name of parent symbol for qualification - - Returns: - Flat list of all symbols with their ranges - """ - flat: List[Dict[str, Any]] = [] - - for sym in symbols: - # Add the symbol itself - symbol_entry = { - "name": sym.get("name", "unknown"), - "kind": self._symbol_kind_to_string(sym.get("kind", 0)), - "range": sym.get("range", sym.get("location", {}).get("range", {})), - "selection_range": sym.get("selectionRange", {}), - "detail": sym.get("detail", ""), - "parent": parent_name, - } - flat.append(symbol_entry) - - # Recursively process children - children = sym.get("children", []) - if children: - qualified_name = sym.get("name", "") - if parent_name: - qualified_name = f"{parent_name}.{qualified_name}" - flat.extend(self._flatten_document_symbols(children, qualified_name)) - - return flat - - def _symbol_kind_to_string(self, kind: int) -> str: - """Convert LSP SymbolKind integer to string. 
- - Args: - kind: LSP SymbolKind enum value - - Returns: - Human-readable string representation - """ - # LSP SymbolKind enum (1-indexed) - kinds = { - 1: "file", - 2: "module", - 3: "namespace", - 4: "package", - 5: "class", - 6: "method", - 7: "property", - 8: "field", - 9: "constructor", - 10: "enum", - 11: "interface", - 12: "function", - 13: "variable", - 14: "constant", - 15: "string", - 16: "number", - 17: "boolean", - 18: "array", - 19: "object", - 20: "key", - 21: "null", - 22: "enum_member", - 23: "struct", - 24: "event", - 25: "operator", - 26: "type_parameter", - } - return kinds.get(kind, "unknown") - - async def get_hover(self, symbol: CodeSymbolNode) -> Optional[str]: - """Get hover documentation for a symbol. - - Args: - symbol: The code symbol to get hover info for - - Returns: - Hover documentation as string, or None if not available - """ - cache_key = f"hover:{symbol.id}" - - if self._is_cached(cache_key, symbol.file_path): - return self.cache[cache_key].data - - hover_text: Optional[str] = None - - if self.use_vscode_bridge: - # Legacy: VSCode Bridge HTTP mode - result = await self._request_vscode_bridge("get_hover", { - "file_path": symbol.file_path, - "line": symbol.range.start_line, - "character": symbol.range.start_character, - }) - - if result: - hover_text = self._parse_hover_result(result) - else: - # Default: Standalone mode - manager = await self._ensure_manager() - hover_text = await manager.get_hover( - file_path=symbol.file_path, - line=symbol.range.start_line, - character=symbol.range.start_character, - ) - - self._cache(cache_key, symbol.file_path, hover_text) - return hover_text - - def _parse_hover_result(self, result: Any) -> Optional[str]: - """Parse hover result into string.""" - if isinstance(result, str): - return result - elif isinstance(result, list): - parts = [] - for item in result: - if isinstance(item, str): - parts.append(item) - elif isinstance(item, dict): - value = item.get("value", item.get("contents", "")) - if 
value: - parts.append(str(value)) - return "\n\n".join(parts) if parts else None - elif isinstance(result, dict): - contents = result.get("contents", result.get("value", "")) - if isinstance(contents, str): - return contents - elif isinstance(contents, list): - parts = [] - for c in contents: - if isinstance(c, str): - parts.append(c) - elif isinstance(c, dict): - parts.append(str(c.get("value", ""))) - return "\n\n".join(parts) if parts else None - return None - - async def __aenter__(self) -> "LspBridge": - """Async context manager entry.""" - return self - - async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: - """Async context manager exit - close connections.""" - await self.close() - - -# Simple test -if __name__ == "__main__": - import sys - - async def test_lsp_bridge(): - """Simple test of LspBridge functionality.""" - print("Testing LspBridge (Standalone Mode)...") - print(f"Timeout: {LspBridge.DEFAULT_TIMEOUT}s") - print(f"Cache TTL: {LspBridge.DEFAULT_CACHE_TTL}s") - print() - - # Create a test symbol pointing to this file - test_file = os.path.abspath(__file__) - test_symbol = CodeSymbolNode( - id=f"{test_file}:LspBridge:96", - name="LspBridge", - kind="class", - file_path=test_file, - range=Range( - start_line=96, - start_character=1, - end_line=200, - end_character=1, - ), - ) - - print(f"Test symbol: {test_symbol.name} in {os.path.basename(test_symbol.file_path)}") - print() - - # Use standalone mode (default) - async with LspBridge( - workspace_root=str(Path(__file__).parent.parent.parent.parent), - ) as bridge: - print("1. Testing get_document_symbols...") - try: - symbols = await bridge.get_document_symbols(test_file) - print(f" Found {len(symbols)} symbols") - for sym in symbols[:5]: - print(f" - {sym.get('name')} ({sym.get('kind')})") - except Exception as e: - print(f" Error: {e}") - - print() - print("2. 
Testing get_definition...") - try: - definition = await bridge.get_definition(test_symbol) - if definition: - print(f" Definition: {os.path.basename(definition.file_path)}:{definition.line}") - else: - print(" No definition found") - except Exception as e: - print(f" Error: {e}") - - print() - print("3. Testing get_references...") - try: - refs = await bridge.get_references(test_symbol) - print(f" Found {len(refs)} references") - for ref in refs[:3]: - print(f" - {os.path.basename(ref.file_path)}:{ref.line}") - except Exception as e: - print(f" Error: {e}") - - print() - print("4. Testing get_hover...") - try: - hover = await bridge.get_hover(test_symbol) - if hover: - print(f" Hover: {hover[:100]}...") - else: - print(" No hover info found") - except Exception as e: - print(f" Error: {e}") - - print() - print("5. Testing get_call_hierarchy...") - try: - calls = await bridge.get_call_hierarchy(test_symbol) - print(f" Found {len(calls)} call hierarchy items") - for call in calls[:3]: - print(f" - {call.name} in {os.path.basename(call.file_path)}") - except Exception as e: - print(f" Error: {e}") - - print() - print("6. 
Testing cache...") - print(f" Cache entries: {len(bridge.cache)}") - for key in list(bridge.cache.keys())[:5]: - print(f" - {key}") - - print() - print("Test complete!") - - # Run the test - # Note: On Windows, use default ProactorEventLoop (supports subprocess creation) - - asyncio.run(test_lsp_bridge()) diff --git a/codex-lens/src/codexlens/lsp/lsp_graph_builder.py b/codex-lens/src/codexlens/lsp/lsp_graph_builder.py deleted file mode 100644 index 446fa2c7..00000000 --- a/codex-lens/src/codexlens/lsp/lsp_graph_builder.py +++ /dev/null @@ -1,383 +0,0 @@ -"""Graph builder for code association graphs via LSP.""" - -from __future__ import annotations - -import asyncio -import logging -from typing import Any, Dict, List, Optional, Set, Tuple - -from codexlens.hybrid_search.data_structures import ( - CallHierarchyItem, - CodeAssociationGraph, - CodeSymbolNode, - Range, -) -from codexlens.lsp.lsp_bridge import ( - Location, - LspBridge, -) - -logger = logging.getLogger(__name__) - - -class LspGraphBuilder: - """Builds code association graph by expanding from seed symbols using LSP.""" - - def __init__( - self, - max_depth: int = 2, - max_nodes: int = 100, - max_concurrent: int = 10, - resolve_symbols: bool = True, - ): - """Initialize GraphBuilder. - - Args: - max_depth: Maximum depth for BFS expansion from seeds. - max_nodes: Maximum number of nodes in the graph. - max_concurrent: Maximum concurrent LSP requests. - resolve_symbols: If False, skip documentSymbol lookups and create lightweight nodes. - """ - self.max_depth = max_depth - self.max_nodes = max_nodes - self.max_concurrent = max_concurrent - self.resolve_symbols = resolve_symbols - # Cache for document symbols per file (avoids per-location hover queries) - self._document_symbols_cache: Dict[str, List[Dict[str, Any]]] = {} - - async def build_from_seeds( - self, - seeds: List[CodeSymbolNode], - lsp_bridge: LspBridge, - ) -> CodeAssociationGraph: - """Build association graph by BFS expansion from seeds. 
- - For each seed: - 1. Get references via LSP - 2. Get call hierarchy via LSP - 3. Add nodes and edges to graph - 4. Continue expanding until max_depth or max_nodes reached - - Args: - seeds: Initial seed symbols to expand from. - lsp_bridge: LSP bridge for querying language servers. - - Returns: - CodeAssociationGraph with expanded nodes and relationships. - """ - graph = CodeAssociationGraph() - visited: Set[str] = set() - semaphore = asyncio.Semaphore(self.max_concurrent) - - # Initialize queue with seeds at depth 0 - queue: List[Tuple[CodeSymbolNode, int]] = [(s, 0) for s in seeds] - - # Add seed nodes to graph - for seed in seeds: - graph.add_node(seed) - - # BFS expansion - while queue and len(graph.nodes) < self.max_nodes: - # Take a batch of nodes from queue - batch_size = min(self.max_concurrent, len(queue)) - batch = queue[:batch_size] - queue = queue[batch_size:] - - # Expand nodes in parallel - tasks = [ - self._expand_node( - node, depth, graph, lsp_bridge, visited, semaphore - ) - for node, depth in batch - ] - - results = await asyncio.gather(*tasks, return_exceptions=True) - - # Process results and add new nodes to queue - for result in results: - if isinstance(result, Exception): - logger.warning("Error expanding node: %s", result) - continue - if result: - # Add new nodes to queue if not at max depth - for new_node, new_depth in result: - if ( - new_depth <= self.max_depth - and len(graph.nodes) < self.max_nodes - ): - queue.append((new_node, new_depth)) - - return graph - - async def _expand_node( - self, - node: CodeSymbolNode, - depth: int, - graph: CodeAssociationGraph, - lsp_bridge: LspBridge, - visited: Set[str], - semaphore: asyncio.Semaphore, - ) -> List[Tuple[CodeSymbolNode, int]]: - """Expand a single node, return new nodes to process. - - Args: - node: Node to expand. - depth: Current depth in BFS. - graph: Graph to add nodes and edges to. - lsp_bridge: LSP bridge for queries. - visited: Set of visited node IDs. 
- semaphore: Semaphore for concurrency control. - - Returns: - List of (new_node, new_depth) tuples to add to queue. - """ - # Skip if already visited or at max depth - if node.id in visited: - return [] - # Depth is 0 for seeds. To limit expansion to N hops from seeds, - # we expand nodes with depth < max_depth. - if depth >= self.max_depth: - visited.add(node.id) - return [] - if len(graph.nodes) >= self.max_nodes: - return [] - - visited.add(node.id) - new_nodes: List[Tuple[CodeSymbolNode, int]] = [] - - async with semaphore: - # Get relationships in parallel - try: - refs_task = lsp_bridge.get_references(node) - calls_task = lsp_bridge.get_call_hierarchy(node) - - refs, calls = await asyncio.gather( - refs_task, calls_task, return_exceptions=True - ) - - # Handle reference results - if isinstance(refs, Exception): - logger.debug( - "Failed to get references for %s: %s", node.id, refs - ) - refs = [] - - # Handle call hierarchy results - if isinstance(calls, Exception): - logger.debug( - "Failed to get call hierarchy for %s: %s", - node.id, - calls, - ) - calls = [] - - # Process references - for ref in refs: - if len(graph.nodes) >= self.max_nodes: - break - - ref_node = await self._location_to_node(ref, lsp_bridge) - if ref_node and ref_node.id != node.id: - if ref_node.id not in graph.nodes: - graph.add_node(ref_node) - new_nodes.append((ref_node, depth + 1)) - # Use add_edge since both nodes should exist now - graph.add_edge(node.id, ref_node.id, "references") - - # Process call hierarchy (incoming calls) - for call in calls: - if len(graph.nodes) >= self.max_nodes: - break - - call_node = await self._call_hierarchy_to_node( - call, lsp_bridge - ) - if call_node and call_node.id != node.id: - if call_node.id not in graph.nodes: - graph.add_node(call_node) - new_nodes.append((call_node, depth + 1)) - # Incoming call: call_node calls node - graph.add_edge(call_node.id, node.id, "calls") - - except Exception as e: - logger.warning( - "Error during node 
expansion for %s: %s", node.id, e - ) - - return new_nodes - - def clear_cache(self) -> None: - """Clear the document symbols cache. - - Call this between searches to free memory and ensure fresh data. - """ - self._document_symbols_cache.clear() - - async def _get_symbol_at_location( - self, - file_path: str, - line: int, - lsp_bridge: LspBridge, - ) -> Optional[Dict[str, Any]]: - """Find symbol at location using cached document symbols. - - This is much more efficient than individual hover queries because - document symbols are fetched once per file and cached. - - Args: - file_path: Path to the source file. - line: Line number (1-based). - lsp_bridge: LSP bridge for fetching document symbols. - - Returns: - Symbol dictionary with name, kind, range, etc., or None if not found. - """ - # Get or fetch document symbols for this file - if file_path not in self._document_symbols_cache: - symbols = await lsp_bridge.get_document_symbols(file_path) - self._document_symbols_cache[file_path] = symbols - - symbols = self._document_symbols_cache[file_path] - - # Find symbol containing this line (best match = smallest range) - best_match: Optional[Dict[str, Any]] = None - best_range_size = float("inf") - - for symbol in symbols: - sym_range = symbol.get("range", {}) - start = sym_range.get("start", {}) - end = sym_range.get("end", {}) - - # LSP ranges are 0-based, our line is 1-based - start_line = start.get("line", 0) + 1 - end_line = end.get("line", 0) + 1 - - if start_line <= line <= end_line: - range_size = end_line - start_line - if range_size < best_range_size: - best_match = symbol - best_range_size = range_size - - return best_match - - async def _location_to_node( - self, - location: Location, - lsp_bridge: LspBridge, - ) -> Optional[CodeSymbolNode]: - """Convert LSP location to CodeSymbolNode. - - Uses cached document symbols instead of individual hover queries - for better performance. - - Args: - location: LSP location to convert. 
- lsp_bridge: LSP bridge for additional queries. - - Returns: - CodeSymbolNode or None if conversion fails. - """ - try: - file_path = location.file_path - start_line = location.line - - # Try to find symbol info from cached document symbols (fast) - symbol_info = None - if self.resolve_symbols: - symbol_info = await self._get_symbol_at_location( - file_path, start_line, lsp_bridge - ) - - if symbol_info: - name = symbol_info.get("name", f"symbol_L{start_line}") - kind = symbol_info.get("kind", "unknown") - - # Extract range from symbol if available - sym_range = symbol_info.get("range", {}) - start = sym_range.get("start", {}) - end = sym_range.get("end", {}) - - location_range = Range( - start_line=start.get("line", start_line - 1) + 1, - start_character=start.get("character", location.character - 1) + 1, - end_line=end.get("line", start_line - 1) + 1, - end_character=end.get("character", location.character - 1) + 1, - ) - else: - # Fallback to basic node without symbol info - name = f"symbol_L{start_line}" - kind = "unknown" - location_range = Range( - start_line=location.line, - start_character=location.character, - end_line=location.line, - end_character=location.character, - ) - - node_id = self._create_node_id(file_path, name, start_line) - - return CodeSymbolNode( - id=node_id, - name=name, - kind=kind, - file_path=file_path, - range=location_range, - docstring="", # Skip hover for performance - ) - - except Exception as e: - logger.debug("Failed to convert location to node: %s", e) - return None - - async def _call_hierarchy_to_node( - self, - call_item: CallHierarchyItem, - lsp_bridge: LspBridge, - ) -> Optional[CodeSymbolNode]: - """Convert CallHierarchyItem to CodeSymbolNode. - - Args: - call_item: Call hierarchy item to convert. - lsp_bridge: LSP bridge (unused, kept for API consistency). - - Returns: - CodeSymbolNode or None if conversion fails. 
- """ - try: - file_path = call_item.file_path - name = call_item.name - start_line = call_item.range.start_line - # CallHierarchyItem.kind is already a string - kind = call_item.kind - - node_id = self._create_node_id(file_path, name, start_line) - - return CodeSymbolNode( - id=node_id, - name=name, - kind=kind, - file_path=file_path, - range=call_item.range, - docstring=call_item.detail or "", - ) - - except Exception as e: - logger.debug( - "Failed to convert call hierarchy item to node: %s", e - ) - return None - - def _create_node_id( - self, file_path: str, name: str, line: int - ) -> str: - """Create unique node ID. - - Args: - file_path: Path to the file. - name: Symbol name. - line: Line number (0-based). - - Returns: - Unique node ID string. - """ - return f"{file_path}:{name}:{line}" diff --git a/codex-lens/src/codexlens/lsp/providers.py b/codex-lens/src/codexlens/lsp/providers.py deleted file mode 100644 index d0275437..00000000 --- a/codex-lens/src/codexlens/lsp/providers.py +++ /dev/null @@ -1,177 +0,0 @@ -"""LSP feature providers.""" - -from __future__ import annotations - -import logging -from dataclasses import dataclass -from pathlib import Path -from typing import Optional, TYPE_CHECKING - -if TYPE_CHECKING: - from codexlens.storage.global_index import GlobalSymbolIndex - from codexlens.storage.registry import RegistryStore - -logger = logging.getLogger(__name__) - - -@dataclass -class HoverInfo: - """Hover information for a symbol.""" - - name: str - kind: str - signature: str - documentation: Optional[str] - file_path: str - line_range: tuple # (start_line, end_line) - - -class HoverProvider: - """Provides hover information for symbols.""" - - def __init__( - self, - global_index: "GlobalSymbolIndex", - registry: Optional["RegistryStore"] = None, - ) -> None: - """Initialize hover provider. 
- - Args: - global_index: Global symbol index for lookups - registry: Optional registry store for index path resolution - """ - self.global_index = global_index - self.registry = registry - - def get_hover_info(self, symbol_name: str) -> Optional[HoverInfo]: - """Get hover information for a symbol. - - Args: - symbol_name: Name of the symbol to look up - - Returns: - HoverInfo or None if symbol not found - """ - # Look up symbol in global index using exact match - symbols = self.global_index.search( - name=symbol_name, - limit=1, - prefix_mode=False, - ) - - # Filter for exact name match - exact_matches = [s for s in symbols if s.name == symbol_name] - - if not exact_matches: - return None - - symbol = exact_matches[0] - - # Extract signature from source file - signature = self._extract_signature(symbol) - - # Symbol uses 'file' attribute and 'range' tuple - file_path = symbol.file or "" - start_line, end_line = symbol.range - - return HoverInfo( - name=symbol.name, - kind=symbol.kind, - signature=signature, - documentation=None, # Symbol doesn't have docstring field - file_path=file_path, - line_range=(start_line, end_line), - ) - - def _extract_signature(self, symbol) -> str: - """Extract function/class signature from source file. 
- - Args: - symbol: Symbol object with file and range information - - Returns: - Extracted signature string or fallback kind + name - """ - try: - file_path = Path(symbol.file) if symbol.file else None - if not file_path or not file_path.exists(): - return f"{symbol.kind} {symbol.name}" - - content = file_path.read_text(encoding="utf-8", errors="ignore") - lines = content.split("\n") - - # Extract signature lines (first line of definition + continuation) - start_line = symbol.range[0] - 1 # Convert 1-based to 0-based - if start_line >= len(lines) or start_line < 0: - return f"{symbol.kind} {symbol.name}" - - signature_lines = [] - first_line = lines[start_line] - signature_lines.append(first_line) - - # Continue if multiline signature (no closing paren + colon yet) - # Look for patterns like "def func(", "class Foo(", etc. - i = start_line + 1 - max_lines = min(start_line + 5, len(lines)) - while i < max_lines: - line = signature_lines[-1] - # Stop if we see closing pattern - if "):" in line or line.rstrip().endswith(":"): - break - signature_lines.append(lines[i]) - i += 1 - - return "\n".join(signature_lines) - - except Exception as e: - logger.debug(f"Failed to extract signature for {symbol.name}: {e}") - return f"{symbol.kind} {symbol.name}" - - def format_hover_markdown(self, info: HoverInfo) -> str: - """Format hover info as Markdown. 
- - Args: - info: HoverInfo object to format - - Returns: - Markdown-formatted hover content - """ - parts = [] - - # Detect language for code fence based on file extension - ext = Path(info.file_path).suffix.lower() if info.file_path else "" - lang_map = { - ".py": "python", - ".js": "javascript", - ".ts": "typescript", - ".tsx": "typescript", - ".jsx": "javascript", - ".java": "java", - ".go": "go", - ".rs": "rust", - ".c": "c", - ".cpp": "cpp", - ".h": "c", - ".hpp": "cpp", - ".cs": "csharp", - ".rb": "ruby", - ".php": "php", - } - lang = lang_map.get(ext, "") - - # Code block with signature - parts.append(f"```{lang}\n{info.signature}\n```") - - # Documentation if available - if info.documentation: - parts.append(f"\n---\n\n{info.documentation}") - - # Location info - file_name = Path(info.file_path).name if info.file_path else "unknown" - parts.append( - f"\n---\n\n*{info.kind}* defined in " - f"`{file_name}` " - f"(line {info.line_range[0]})" - ) - - return "\n".join(parts) diff --git a/codex-lens/src/codexlens/lsp/server.py b/codex-lens/src/codexlens/lsp/server.py deleted file mode 100644 index 809bba9e..00000000 --- a/codex-lens/src/codexlens/lsp/server.py +++ /dev/null @@ -1,263 +0,0 @@ -"""codex-lens LSP Server implementation using pygls. - -This module provides the main Language Server class and entry point. -""" - -from __future__ import annotations - -import argparse -import logging -import sys -from pathlib import Path -from typing import Optional - -try: - from lsprotocol import types as lsp - from pygls.lsp.server import LanguageServer -except ImportError as exc: - raise ImportError( - "LSP dependencies not installed. 
Install with: pip install codex-lens[lsp]" - ) from exc - -from codexlens.config import Config -from codexlens.search.chain_search import ChainSearchEngine -from codexlens.storage.global_index import GlobalSymbolIndex -from codexlens.storage.path_mapper import PathMapper -from codexlens.storage.registry import RegistryStore - -logger = logging.getLogger(__name__) - - -class CodexLensLanguageServer(LanguageServer): - """Language Server for codex-lens code indexing. - - Provides IDE features using codex-lens symbol index: - - Go to Definition - - Find References - - Code Completion - - Hover Information - - Workspace Symbol Search - - Attributes: - registry: Global project registry for path lookups - mapper: Path mapper for source/index conversions - global_index: Project-wide symbol index - search_engine: Chain search engine for symbol search - workspace_root: Current workspace root path - """ - - def __init__(self) -> None: - super().__init__(name="codexlens-lsp", version="0.1.0") - - self.registry: Optional[RegistryStore] = None - self.mapper: Optional[PathMapper] = None - self.global_index: Optional[GlobalSymbolIndex] = None - self.search_engine: Optional[ChainSearchEngine] = None - self.workspace_root: Optional[Path] = None - self._config: Optional[Config] = None - - def initialize_components(self, workspace_root: Path) -> bool: - """Initialize codex-lens components for the workspace. 
- - Args: - workspace_root: Root path of the workspace - - Returns: - True if initialization succeeded, False otherwise - """ - self.workspace_root = workspace_root.resolve() - logger.info("Initializing codex-lens for workspace: %s", self.workspace_root) - - try: - # Initialize registry - self.registry = RegistryStore() - self.registry.initialize() - - # Initialize path mapper - self.mapper = PathMapper() - - # Try to find project in registry - project_info = self.registry.find_by_source_path(str(self.workspace_root)) - - if project_info: - project_id = int(project_info["id"]) - index_root = Path(project_info["index_root"]) - - # Initialize global symbol index - global_db = index_root / GlobalSymbolIndex.DEFAULT_DB_NAME - self.global_index = GlobalSymbolIndex(global_db, project_id) - self.global_index.initialize() - - # Initialize search engine - self._config = Config() - self.search_engine = ChainSearchEngine( - registry=self.registry, - mapper=self.mapper, - config=self._config, - ) - - logger.info("codex-lens initialized for project: %s", project_info["source_root"]) - return True - else: - logger.warning( - "Workspace not indexed by codex-lens: %s. 
" - "Run 'codexlens index %s' to index first.", - self.workspace_root, - self.workspace_root, - ) - return False - - except Exception as exc: - logger.error("Failed to initialize codex-lens: %s", exc) - return False - - def shutdown_components(self) -> None: - """Clean up codex-lens components.""" - if self.global_index: - try: - self.global_index.close() - except Exception as exc: - logger.debug("Error closing global index: %s", exc) - self.global_index = None - - if self.search_engine: - try: - self.search_engine.close() - except Exception as exc: - logger.debug("Error closing search engine: %s", exc) - self.search_engine = None - - if self.registry: - try: - self.registry.close() - except Exception as exc: - logger.debug("Error closing registry: %s", exc) - self.registry = None - - -# Create server instance -server = CodexLensLanguageServer() - - -@server.feature(lsp.INITIALIZE) -def lsp_initialize(params: lsp.InitializeParams) -> lsp.InitializeResult: - """Handle LSP initialize request.""" - logger.info("LSP initialize request received") - - # Get workspace root - workspace_root: Optional[Path] = None - if params.root_uri: - workspace_root = Path(params.root_uri.replace("file://", "").replace("file:", "")) - elif params.root_path: - workspace_root = Path(params.root_path) - - if workspace_root: - server.initialize_components(workspace_root) - - # Declare server capabilities - return lsp.InitializeResult( - capabilities=lsp.ServerCapabilities( - text_document_sync=lsp.TextDocumentSyncOptions( - open_close=True, - change=lsp.TextDocumentSyncKind.Incremental, - save=lsp.SaveOptions(include_text=False), - ), - definition_provider=True, - references_provider=True, - completion_provider=lsp.CompletionOptions( - trigger_characters=[".", ":"], - resolve_provider=False, - ), - hover_provider=True, - workspace_symbol_provider=True, - ), - server_info=lsp.ServerInfo( - name="codexlens-lsp", - version="0.1.0", - ), - ) - - -@server.feature(lsp.SHUTDOWN) -def 
lsp_shutdown(params: None) -> None: - """Handle LSP shutdown request.""" - logger.info("LSP shutdown request received") - server.shutdown_components() - - -def main() -> int: - """Entry point for codexlens-lsp command. - - Returns: - Exit code (0 for success) - """ - # Import handlers to register them with the server - # This must be done before starting the server - import codexlens.lsp.handlers # noqa: F401 - - parser = argparse.ArgumentParser( - description="codex-lens Language Server", - prog="codexlens-lsp", - ) - parser.add_argument( - "--stdio", - action="store_true", - default=True, - help="Use stdio for communication (default)", - ) - parser.add_argument( - "--tcp", - action="store_true", - help="Use TCP for communication", - ) - parser.add_argument( - "--host", - default="127.0.0.1", - help="TCP host (default: 127.0.0.1)", - ) - parser.add_argument( - "--port", - type=int, - default=2087, - help="TCP port (default: 2087)", - ) - parser.add_argument( - "--log-level", - choices=["DEBUG", "INFO", "WARNING", "ERROR"], - default="INFO", - help="Log level (default: INFO)", - ) - parser.add_argument( - "--log-file", - help="Log file path (optional)", - ) - - args = parser.parse_args() - - # Configure logging - log_handlers = [] - if args.log_file: - log_handlers.append(logging.FileHandler(args.log_file)) - else: - log_handlers.append(logging.StreamHandler(sys.stderr)) - - logging.basicConfig( - level=getattr(logging, args.log_level), - format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", - handlers=log_handlers, - ) - - logger.info("Starting codexlens-lsp server") - - if args.tcp: - logger.info("Starting TCP server on %s:%d", args.host, args.port) - server.start_tcp(args.host, args.port) - else: - logger.info("Starting stdio server") - server.start_io() - - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/codex-lens/src/codexlens/lsp/standalone_manager.py b/codex-lens/src/codexlens/lsp/standalone_manager.py deleted file mode 
100644 index d2a57de5..00000000 --- a/codex-lens/src/codexlens/lsp/standalone_manager.py +++ /dev/null @@ -1,1307 +0,0 @@ -"""Standalone Language Server Manager for direct LSP communication. - -This module provides direct communication with language servers via JSON-RPC over stdio, -eliminating the need for VSCode Bridge. Similar to cclsp architecture. - -Features: -- Direct subprocess spawning of language servers -- JSON-RPC 2.0 communication over stdin/stdout -- Multi-language support via configuration file (lsp-servers.json) -- Process lifecycle management with auto-restart -- Compatible interface with existing LspBridge -""" - -from __future__ import annotations - -import asyncio -import importlib.resources as resources -import json -import logging -import os -import sys -from dataclasses import dataclass, field -from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple -from urllib.parse import unquote, urlparse - -logger = logging.getLogger(__name__) - - -@dataclass -class ServerConfig: - """Configuration for a language server.""" - - language_id: str - display_name: str - extensions: List[str] - command: List[str] - enabled: bool = True - initialization_options: Dict[str, Any] = field(default_factory=dict) - settings: Dict[str, Any] = field(default_factory=dict) - root_dir: str = "." 
- timeout: int = 30000 # ms - restart_interval: int = 5000 # ms - max_restarts: int = 3 - - -@dataclass -class ServerState: - """State of a running language server.""" - - config: ServerConfig - process: asyncio.subprocess.Process - reader: asyncio.StreamReader - writer: asyncio.StreamWriter - request_id: int = 0 - initialized: bool = False - capabilities: Dict[str, Any] = field(default_factory=dict) - pending_requests: Dict[int, asyncio.Future] = field(default_factory=dict) - restart_count: int = 0 - # Queue for producer-consumer pattern - continuous reading puts messages here - message_queue: asyncio.Queue = field(default_factory=asyncio.Queue) - # Track opened documents to avoid redundant didOpen spam (and unnecessary delays). - # Key: document URI -> (version, file_mtime) - opened_documents: Dict[str, Tuple[int, float]] = field(default_factory=dict) - opened_documents_lock: asyncio.Lock = field(default_factory=asyncio.Lock) - - -class StandaloneLspManager: - """Manager for direct language server communication. - - Spawns language servers as subprocesses and communicates via JSON-RPC - over stdin/stdout. No VSCode or GUI dependency required. - - Example: - manager = StandaloneLspManager(workspace_root="/path/to/project") - await manager.start() - - definition = await manager.get_definition( - file_path="src/main.py", - line=10, - character=5 - ) - - await manager.stop() - """ - - DEFAULT_CONFIG_FILE = "lsp-servers.json" - - def __init__( - self, - workspace_root: Optional[str] = None, - config_file: Optional[str] = None, - timeout: float = 30.0, - ): - """Initialize StandaloneLspManager. 
- - Args: - workspace_root: Root directory of the workspace (used for rootUri) - config_file: Path to lsp-servers.json configuration file - timeout: Default timeout for LSP requests in seconds - """ - self.workspace_root = Path(workspace_root or os.getcwd()).resolve() - self.config_file = config_file - self.timeout = timeout - - self._servers: Dict[str, ServerState] = {} # language_id -> ServerState - self._extension_map: Dict[str, str] = {} # extension -> language_id - self._configs: Dict[str, ServerConfig] = {} # language_id -> ServerConfig - self._read_tasks: Dict[str, asyncio.Task] = {} # language_id -> read task - self._stderr_tasks: Dict[str, asyncio.Task] = {} # language_id -> stderr read task - self._processor_tasks: Dict[str, asyncio.Task] = {} # language_id -> message processor task - self._lock = asyncio.Lock() - - def _find_config_file(self) -> Optional[Path]: - """Find the lsp-servers.json configuration file. - - Search order: - 1. Explicit config_file parameter - 2. {workspace_root}/lsp-servers.json - 3. 
{workspace_root}/.codexlens/lsp-servers.json - """ - search_paths = [] - - if self.config_file: - search_paths.append(Path(self.config_file)) - - search_paths.extend([ - self.workspace_root / self.DEFAULT_CONFIG_FILE, - self.workspace_root / ".codexlens" / self.DEFAULT_CONFIG_FILE, - ]) - - for path in search_paths: - if path.exists(): - return path - - return None - - def _load_builtin_config(self) -> Optional[dict[str, Any]]: - """Load the built-in default lsp-servers.json shipped with the package.""" - try: - text = ( - resources.files("codexlens.lsp") - .joinpath(self.DEFAULT_CONFIG_FILE) - .read_text(encoding="utf-8") - ) - except Exception as exc: - logger.error( - "Failed to load built-in %s template from package: %s", - self.DEFAULT_CONFIG_FILE, - exc, - ) - return None - - try: - return json.loads(text) - except Exception as exc: - logger.error( - "Built-in %s template shipped with the package is invalid JSON: %s", - self.DEFAULT_CONFIG_FILE, - exc, - ) - return None - - def _load_config(self) -> None: - """Load language server configuration from JSON file.""" - self._configs.clear() - self._extension_map.clear() - - config_path = self._find_config_file() - - if not config_path: - data = self._load_builtin_config() - if data is None: - logger.warning( - "No %s found and built-in defaults could not be loaded; using empty config", - self.DEFAULT_CONFIG_FILE, - ) - return - - root_config_path = self.workspace_root / self.DEFAULT_CONFIG_FILE - codexlens_config_path = ( - self.workspace_root / ".codexlens" / self.DEFAULT_CONFIG_FILE - ) - - logger.info( - "No %s found at %s or %s; using built-in defaults shipped with codex-lens. " - "To customize, copy the template to one of those locations and restart. " - "Language servers are spawned on-demand when first needed. 
" - "Ensure the language server commands in the config are installed and on PATH.", - self.DEFAULT_CONFIG_FILE, - root_config_path, - codexlens_config_path, - ) - config_source = "built-in defaults" - else: - try: - with open(config_path, "r", encoding="utf-8") as f: - data = json.load(f) - except Exception as e: - logger.error(f"Failed to load config from {config_path}: {e}") - return - - config_source = str(config_path) - - # Parse defaults - defaults = data.get("defaults", {}) - default_timeout = defaults.get("timeout", 30000) - default_restart_interval = defaults.get("restartInterval", 5000) - default_max_restarts = defaults.get("maxRestarts", 3) - - # Parse servers - for server_data in data.get("servers", []): - if not server_data.get("enabled", True): - continue - - language_id = server_data.get("languageId", "") - if not language_id: - continue - - config = ServerConfig( - language_id=language_id, - display_name=server_data.get("displayName", language_id), - extensions=server_data.get("extensions", []), - command=server_data.get("command", []), - enabled=server_data.get("enabled", True), - initialization_options=server_data.get("initializationOptions", {}), - settings=server_data.get("settings", {}), - root_dir=server_data.get("rootDir", defaults.get("rootDir", ".")), - timeout=server_data.get("timeout", default_timeout), - restart_interval=server_data.get("restartInterval", default_restart_interval), - max_restarts=server_data.get("maxRestarts", default_max_restarts), - ) - - self._configs[language_id] = config - - # Build extension map - for ext in config.extensions: - self._extension_map[ext.lower()] = language_id - - logger.info( - "Loaded %d language server configs from %s", - len(self._configs), - config_source, - ) - - def get_language_id(self, file_path: str) -> Optional[str]: - """Get language ID for a file based on extension. 
- - Args: - file_path: Path to the file - - Returns: - Language ID (e.g., "python", "typescript") or None if unknown - """ - ext = Path(file_path).suffix.lstrip(".").lower() - return self._extension_map.get(ext) - - async def start(self) -> None: - """Initialize the manager and load configuration. - - This does NOT start any language servers yet - they are started - on-demand when first needed for a file type. - """ - self._load_config() - logger.info(f"StandaloneLspManager started for workspace: {self.workspace_root}") - - async def stop(self) -> None: - """Stop all running language servers and cleanup.""" - async with self._lock: - for language_id in list(self._servers.keys()): - await self._stop_server(language_id) - - logger.info("StandaloneLspManager stopped") - - async def _start_server(self, language_id: str) -> Optional[ServerState]: - """Start a language server for the given language. - - Args: - language_id: The language ID (e.g., "python") - - Returns: - ServerState if successful, None on failure - """ - config = self._configs.get(language_id) - if not config: - logger.error(f"No configuration for language: {language_id}") - return None - - if not config.command: - logger.error(f"No command specified for {language_id}") - return None - - try: - logger.info(f"Starting {config.display_name}: {' '.join(config.command)}") - - # Spawn the language server process - process = await asyncio.create_subprocess_exec( - *config.command, - stdin=asyncio.subprocess.PIPE, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - cwd=str(self.workspace_root), - ) - - if process.stdin is None or process.stdout is None: - logger.error(f"Failed to get stdin/stdout for {language_id}") - process.terminate() - return None - - state = ServerState( - config=config, - process=process, - reader=process.stdout, - writer=process.stdin, - ) - - self._servers[language_id] = state - - # Start reading stderr in background (prevents pipe buffer from filling up) - if 
process.stderr: - self._stderr_tasks[language_id] = asyncio.create_task( - self._read_stderr(language_id, process.stderr) - ) - - # CRITICAL: Start the continuous reader task IMMEDIATELY before any communication - # This ensures no messages are lost during initialization handshake - self._read_tasks[language_id] = asyncio.create_task( - self._continuous_reader(language_id) - ) - - # Start the message processor task to handle queued messages - self._processor_tasks[language_id] = asyncio.create_task(self._process_messages(language_id)) - - # Initialize the server - now uses queue for reading responses - await self._initialize_server(state) - - logger.info(f"{config.display_name} started and initialized") - return state - - except FileNotFoundError: - logger.error( - f"Language server not found: {config.command[0]}. " - f"Install it with the appropriate package manager." - ) - return None - except Exception as e: - logger.error(f"Failed to start {language_id}: {e}") - return None - - async def _stop_server(self, language_id: str) -> None: - """Stop a language server.""" - state = self._servers.pop(language_id, None) - if not state: - return - - # Cancel read task - task = self._read_tasks.pop(language_id, None) - if task: - task.cancel() - try: - await task - except asyncio.CancelledError: - pass - - # Cancel stderr task - stderr_task = self._stderr_tasks.pop(language_id, None) - if stderr_task: - stderr_task.cancel() - try: - await stderr_task - except asyncio.CancelledError: - pass - - # Cancel message processor task - processor_task = self._processor_tasks.pop(language_id, None) - if processor_task: - processor_task.cancel() - try: - await processor_task - except asyncio.CancelledError: - pass - - # Send shutdown request - try: - await self._send_request(state, "shutdown", None, timeout=5.0) - except Exception: - pass - - # Send exit notification - try: - await self._send_notification(state, "exit", None) - except Exception: - pass - - # Terminate process - if 
state.process.returncode is None: - state.process.terminate() - try: - await asyncio.wait_for(state.process.wait(), timeout=5.0) - except asyncio.TimeoutError: - state.process.kill() - - logger.info(f"Stopped {state.config.display_name}") - - async def _get_server(self, file_path: str) -> Optional[ServerState]: - """Get or start the appropriate language server for a file. - - Args: - file_path: Path to the file being operated on - - Returns: - ServerState for the appropriate language server, or None - """ - file_path = self._normalize_file_path(file_path) - language_id = self.get_language_id(file_path) - if not language_id: - logger.debug(f"No language server configured for: {file_path}") - return None - - async with self._lock: - if language_id in self._servers: - state = self._servers[language_id] - # Check if process is still running - if state.process.returncode is None: - return state - # Process died, remove it - del self._servers[language_id] - - # Start new server - return await self._start_server(language_id) - - def _normalize_file_path(self, file_path_or_uri: str) -> str: - """Normalize a file path that may be an LSP file URI or URI-path. - - LSP responses often contain `file://` URIs with percent-encoding - (e.g. `file:///d%3A/...`). Some code paths may forward the parsed - URI path (`/d%3A/...`) without the scheme. On Windows, `Path(...)` - would interpret that as a root path on the current drive, producing - invalid paths like `D:\\d%3A\\...`. - """ - if not file_path_or_uri: - return file_path_or_uri - - raw = str(file_path_or_uri).strip() - - if raw.startswith("file:"): - try: - parsed = urlparse(raw) - if parsed.scheme == "file": - raw = unquote(parsed.path) - else: - raw = raw.replace("file:///", "").replace("file://", "") - except Exception: - raw = raw.replace("file:///", "").replace("file://", "") - - # Decode percent-encoded segments (e.g. 
d%3A -> d:) - if "%3a" in raw.lower(): - try: - raw = unquote(raw) - except Exception: - pass - - # Windows: file URI paths frequently look like "/C:/path"; strip the extra slash. - if raw.startswith("/") and len(raw) > 2 and raw[2] == ":": - raw = raw[1:] - - return raw - - async def _initialize_server(self, state: ServerState) -> None: - """Send initialize request and wait for response via the message queue. - - The continuous reader and message processor are already running, so we just - send the request and wait for the response via pending_requests. - """ - root_uri = self.workspace_root.as_uri() - - # Simplified params matching direct test that works - params = { - "processId": None, # Use None like direct test - "rootUri": root_uri, - "rootPath": str(self.workspace_root), - "capabilities": { - "textDocument": { - "documentSymbol": { - "hierarchicalDocumentSymbolSupport": True, - }, - }, - "workspace": { - "configuration": True, - }, - }, - "workspaceFolders": [ - { - "uri": root_uri, - "name": self.workspace_root.name, - } - ], - } - - # Send initialize request and wait for response via queue - state.request_id += 1 - init_request_id = state.request_id - - # Create future for the response - future: asyncio.Future = asyncio.get_event_loop().create_future() - state.pending_requests[init_request_id] = future - - # Send the request - init_message = { - "jsonrpc": "2.0", - "id": init_request_id, - "method": "initialize", - "params": params, - } - encoded = self._encode_message(init_message) - logger.debug(f"Sending initialize request id={init_request_id}") - state.writer.write(encoded) - await state.writer.drain() - - # Wait for response (will be routed by _process_messages) - try: - init_result = await asyncio.wait_for(future, timeout=30.0) - except asyncio.TimeoutError: - state.pending_requests.pop(init_request_id, None) - raise RuntimeError("Initialize request timed out") - - if init_result is None: - init_result = {} - - # Store capabilities - 
state.capabilities = init_result.get("capabilities", {}) - state.initialized = True - logger.debug(f"Initialize response received, capabilities: {len(state.capabilities)} keys") - - # Send initialized notification - await self._send_notification(state, "initialized", {}) - - # Give time for server to process initialized and send any requests - # The message processor will handle workspace/configuration automatically - await asyncio.sleep(0.5) - - def _encode_message(self, content: Dict[str, Any]) -> bytes: - """Encode a JSON-RPC message with LSP headers.""" - body = json.dumps(content).encode("utf-8") - header = f"Content-Length: {len(body)}\r\n\r\n" - return header.encode("ascii") + body - - async def _read_message(self, reader: asyncio.StreamReader) -> Tuple[Optional[Dict[str, Any]], bool]: - """Read a JSON-RPC message from the stream. - - Returns: - Tuple of (message, stream_closed). If stream_closed is True, the reader loop - should exit. If False and message is None, it was just a timeout. 
- """ - try: - # Read headers - content_length = 0 - while True: - try: - line = await asyncio.wait_for(reader.readline(), timeout=1.0) - except asyncio.TimeoutError: - # Timeout is not an error - just no message available yet - return None, False - - if not line: - # Empty read means stream closed - return None, True - - line_str = line.decode("ascii").strip() - if line_str: # Only log non-empty lines - logger.debug(f"Read header line: {repr(line_str[:80])}") - if not line_str: - break # Empty line = end of headers - - if line_str.lower().startswith("content-length:"): - content_length = int(line_str.split(":")[1].strip()) - - if content_length == 0: - return None, False - - # Read body - body = await reader.readexactly(content_length) - return json.loads(body.decode("utf-8")), False - - except asyncio.IncompleteReadError: - return None, True - except Exception as e: - logger.error(f"Error reading message: {e}") - return None, True - - async def _continuous_reader(self, language_id: str) -> None: - """Continuously read messages from language server and put them in the queue. - - This is the PRODUCER in the producer-consumer pattern. It starts IMMEDIATELY - after subprocess creation and runs continuously until shutdown. This ensures - no messages are ever lost, even during initialization handshake. 
- """ - state = self._servers.get(language_id) - if not state: - return - - logger.debug(f"Continuous reader started for {language_id}") - - try: - while True: - try: - # Read headers with timeout - content_length = 0 - while True: - try: - line = await asyncio.wait_for(state.reader.readline(), timeout=5.0) - except asyncio.TimeoutError: - continue # Keep waiting for data - - if not line: - logger.debug(f"Continuous reader for {language_id}: EOF") - return - - line_str = line.decode("ascii").strip() - if not line_str: - break # End of headers - - if line_str.lower().startswith("content-length:"): - content_length = int(line_str.split(":")[1].strip()) - - if content_length == 0: - continue - - # Read body - body = await state.reader.readexactly(content_length) - message = json.loads(body.decode("utf-8")) - - # Put message in queue for processing - await state.message_queue.put(message) - - msg_id = message.get("id", "none") - msg_method = message.get("method", "none") - logger.debug(f"Queued message: id={msg_id}, method={msg_method}") - - except asyncio.IncompleteReadError: - logger.debug(f"Continuous reader for {language_id}: IncompleteReadError") - return - except Exception as e: - logger.error(f"Error in continuous reader for {language_id}: {e}") - await asyncio.sleep(0.1) - - except asyncio.CancelledError: - logger.debug(f"Continuous reader cancelled for {language_id}") - except Exception as e: - logger.error(f"Fatal error in continuous reader for {language_id}: {e}") - - async def _process_messages(self, language_id: str) -> None: - """Process messages from the queue and route them appropriately. - - This is the CONSUMER in the producer-consumer pattern. It handles: - - Server requests (workspace/configuration, etc.) - responds immediately - - Notifications (window/logMessage, etc.) 
- logs them - - Responses to our requests are NOT handled here - they're consumed by _wait_for_response - """ - state = self._servers.get(language_id) - if not state: - return - - logger.debug(f"Message processor started for {language_id}") - - try: - while True: - # Get message from queue (blocks until available) - message = await state.message_queue.get() - - msg_id = message.get("id") - method = message.get("method", "") - - # Response (has id but no method) - put back for _wait_for_response to consume - if msg_id is not None and not method: - # This is a response to one of our requests - if msg_id in state.pending_requests: - future = state.pending_requests.pop(msg_id) - if "error" in message: - future.set_exception( - Exception(message["error"].get("message", "Unknown error")) - ) - else: - future.set_result(message.get("result")) - logger.debug(f"Response routed to pending request id={msg_id}") - else: - logger.debug(f"No pending request for response id={msg_id}") - - # Server request (has both id and method) - needs response - elif msg_id is not None and method: - logger.info(f"Server request: {method} (id={msg_id})") - await self._handle_server_request(state, message) - - # Notification (has method but no id) - elif method: - self._handle_server_message(language_id, message) - - state.message_queue.task_done() - - except asyncio.CancelledError: - logger.debug(f"Message processor cancelled for {language_id}") - except Exception as e: - logger.error(f"Error in message processor for {language_id}: {e}") - - async def _read_stderr(self, language_id: str, stderr: asyncio.StreamReader) -> None: - """Background task to read stderr from a language server. - - This prevents the stderr pipe buffer from filling up, which would - cause the language server process to block and stop responding. 
- """ - try: - while True: - line = await stderr.readline() - if not line: - break - text = line.decode("utf-8", errors="replace").rstrip() - if text: - # Log stderr output at warning level for visibility - logger.warning(f"[{language_id}] {text}") - except asyncio.CancelledError: - pass - except Exception as e: - logger.debug(f"Error reading stderr for {language_id}: {e}") - - def _handle_server_message(self, language_id: str, message: Dict[str, Any]) -> None: - """Handle notifications from the language server.""" - method = message.get("method", "") - params = message.get("params", {}) - - if method == "window/logMessage": - level = params.get("type", 4) # 1=error, 2=warn, 3=info, 4=log - text = params.get("message", "") - if level == 1: - logger.error(f"[{language_id}] {text}") - elif level == 2: - logger.warning(f"[{language_id}] {text}") - else: - logger.debug(f"[{language_id}] {text}") - - elif method == "window/showMessage": - text = params.get("message", "") - logger.info(f"[{language_id}] {text}") - - async def _handle_server_request(self, state: ServerState, message: Dict[str, Any]) -> None: - """Handle requests from the language server that need a response.""" - request_id = message["id"] - method = message.get("method", "") - params = message.get("params", {}) - - logger.info(f"SERVER REQUEST: {method} (id={request_id}) params={params}") - - result = None - - if method == "workspace/configuration": - # Return configuration items for each requested scope - items = params.get("items", []) - result = [] - for item in items: - section = item.get("section", "") - # Provide Python-specific settings for pyright - if section == "python": - result.append({ - "pythonPath": "python", - "analysis": { - "autoSearchPaths": True, - "useLibraryCodeForTypes": True, - "diagnosticMode": "workspace", - } - }) - elif section == "python.analysis": - result.append({ - "autoSearchPaths": True, - "useLibraryCodeForTypes": True, - "diagnosticMode": "workspace", - 
"typeCheckingMode": "basic", - }) - else: - # Return empty object for unknown sections - result.append({}) - sections = [item.get("section", "") for item in items] - logger.info(f"Responding to workspace/configuration with {len(result)} items for sections: {sections}") - - elif method == "client/registerCapability": - # Accept capability registration - result = None - - elif method == "window/workDoneProgress/create": - # Accept progress token creation - result = None - - else: - logger.debug(f"Unhandled server request: {method}") - - # Send response - response = { - "jsonrpc": "2.0", - "id": request_id, - "result": result, - } - try: - encoded = self._encode_message(response) - state.writer.write(encoded) - await state.writer.drain() - logger.debug(f"Sent response to server request {method} (id={request_id})") - except Exception as e: - logger.error(f"Failed to respond to server request {method}: {e}") - - async def _send_request( - self, - state: ServerState, - method: str, - params: Optional[Dict[str, Any]], - timeout: Optional[float] = None, - ) -> Any: - """Send a request to the language server and wait for response. 
- - Args: - state: Server state - method: LSP method name (e.g., "textDocument/definition") - params: Request parameters - timeout: Request timeout in seconds - - Returns: - Response result - """ - state.request_id += 1 - request_id = state.request_id - - message = { - "jsonrpc": "2.0", - "id": request_id, - "method": method, - "params": params or {}, - } - - future: asyncio.Future = asyncio.get_event_loop().create_future() - state.pending_requests[request_id] = future - - try: - encoded = self._encode_message(message) - logger.debug(f"Sending request id={request_id}, method={method}") - state.writer.write(encoded) - await state.writer.drain() - - return await asyncio.wait_for( - future, - timeout=timeout or self.timeout - ) - except asyncio.TimeoutError: - state.pending_requests.pop(request_id, None) - logger.warning(f"Request timed out: {method}") - return None - except Exception as e: - state.pending_requests.pop(request_id, None) - logger.error(f"Request failed: {method} - {e}") - return None - - async def _send_notification( - self, - state: ServerState, - method: str, - params: Optional[Dict[str, Any]], - ) -> None: - """Send a notification to the language server (no response expected).""" - message = { - "jsonrpc": "2.0", - "method": method, - "params": params or {}, - } - - try: - encoded = self._encode_message(message) - logger.debug(f"Sending notification: {method} ({len(encoded)} bytes)") - state.writer.write(encoded) - await state.writer.drain() - logger.debug(f"Notification sent: {method}") - except Exception as e: - logger.error(f"Failed to send notification {method}: {e}") - - def _to_text_document_identifier(self, file_path: str) -> Dict[str, str]: - """Create TextDocumentIdentifier from file path.""" - file_path = self._normalize_file_path(file_path) - uri = Path(file_path).resolve().as_uri() - return {"uri": uri} - - def _to_position(self, line: int, character: int) -> Dict[str, int]: - """Create LSP Position (0-indexed) from 1-indexed 
line/character.""" - return { - "line": max(0, line - 1), # Convert 1-indexed to 0-indexed - "character": max(0, character - 1), - } - - async def _open_document(self, state: ServerState, file_path: str) -> None: - """Send textDocument/didOpen notification.""" - file_path = self._normalize_file_path(file_path) - resolved_path = Path(file_path).resolve() - - # Fast path: already opened and unchanged (per-server cache). - try: - uri = resolved_path.as_uri() - except Exception: - uri = "" - - try: - file_mtime = float(resolved_path.stat().st_mtime) - except Exception: - file_mtime = 0.0 - - # Serialize open/change notifications per server to avoid races when - # multiple concurrent LSP requests target the same file. - async with state.opened_documents_lock: - existing = state.opened_documents.get(uri) if uri else None - if existing is not None and existing[1] == file_mtime: - return - - try: - content = resolved_path.read_text(encoding="utf-8") - except Exception as e: - logger.error(f"Failed to read file {file_path}: {e}") - return - - # Detect language ID from extension - language_id = self.get_language_id(file_path) or "plaintext" - - # Send didOpen only once per document; subsequent changes use didChange. 
- if existing is None: - version = 1 - logger.debug(f"Opening document: {resolved_path.name} ({len(content)} chars)") - await self._send_notification( - state, - "textDocument/didOpen", - { - "textDocument": { - "uri": uri or resolved_path.as_uri(), - "languageId": language_id, - "version": version, - "text": content, - } - }, - ) - else: - version = int(existing[0]) + 1 - logger.debug(f"Updating document: {resolved_path.name} ({len(content)} chars)") - await self._send_notification( - state, - "textDocument/didChange", - { - "textDocument": { - "uri": uri or resolved_path.as_uri(), - "version": version, - }, - "contentChanges": [{"text": content}], - }, - ) - - if uri: - state.opened_documents[uri] = (version, file_mtime) - - # ========== Public LSP Methods ========== - - async def get_definition( - self, - file_path: str, - line: int, - character: int, - ) -> Optional[Dict[str, Any]]: - """Get definition location for symbol at position. - - Args: - file_path: Path to the source file - line: Line number (1-indexed) - character: Character position (1-indexed) - - Returns: - Location dict with uri, line, character, or None - """ - state = await self._get_server(file_path) - if not state: - return None - - # Open document first - await self._open_document(state, file_path) - - result = await self._send_request(state, "textDocument/definition", { - "textDocument": self._to_text_document_identifier(file_path), - "position": self._to_position(line, character), - }) - - if not result: - return None - - # Handle single location or array - if isinstance(result, list): - if len(result) == 0: - return None - result = result[0] - - # Handle LocationLink vs Location - if "targetUri" in result: - # LocationLink format - return { - "uri": result["targetUri"], - "range": result.get("targetRange", result.get("targetSelectionRange", {})), - } - else: - # Location format - return result - - async def get_references( - self, - file_path: str, - line: int, - character: int, - 
include_declaration: bool = True, - ) -> List[Dict[str, Any]]: - """Get all references to symbol at position. - - Args: - file_path: Path to the source file - line: Line number (1-indexed) - character: Character position (1-indexed) - include_declaration: Whether to include the declaration - - Returns: - List of Location dicts with uri and range - """ - state = await self._get_server(file_path) - if not state: - return [] - - # Open document first - await self._open_document(state, file_path) - - result = await self._send_request(state, "textDocument/references", { - "textDocument": self._to_text_document_identifier(file_path), - "position": self._to_position(line, character), - "context": { - "includeDeclaration": include_declaration, - }, - }) - - if not result or not isinstance(result, list): - return [] - - return result - - async def get_hover( - self, - file_path: str, - line: int, - character: int, - ) -> Optional[str]: - """Get hover documentation for symbol at position. - - Args: - file_path: Path to the source file - line: Line number (1-indexed) - character: Character position (1-indexed) - - Returns: - Hover content as string, or None - """ - state = await self._get_server(file_path) - if not state: - return None - - # Open document first - await self._open_document(state, file_path) - - result = await self._send_request(state, "textDocument/hover", { - "textDocument": self._to_text_document_identifier(file_path), - "position": self._to_position(line, character), - }) - - if not result: - return None - - contents = result.get("contents") - if not contents: - return None - - # Parse contents (can be string, MarkedString, MarkupContent, or array) - return self._parse_hover_contents(contents) - - def _parse_hover_contents(self, contents: Any) -> Optional[str]: - """Parse hover contents into string.""" - if isinstance(contents, str): - return contents - - if isinstance(contents, dict): - # MarkupContent or MarkedString - return contents.get("value", 
contents.get("contents", "")) - - if isinstance(contents, list): - parts = [] - for item in contents: - if isinstance(item, str): - parts.append(item) - elif isinstance(item, dict): - parts.append(item.get("value", "")) - return "\n\n".join(p for p in parts if p) - - return None - - async def get_document_symbols( - self, - file_path: str, - ) -> List[Dict[str, Any]]: - """Get all symbols in a document. - - Args: - file_path: Path to the source file - - Returns: - List of DocumentSymbol or SymbolInformation dicts - """ - state = await self._get_server(file_path) - if not state: - return [] - - # Open document first - await self._open_document(state, file_path) - - result = await self._send_request(state, "textDocument/documentSymbol", { - "textDocument": self._to_text_document_identifier(file_path), - }) - - if not result or not isinstance(result, list): - return [] - - return result - - async def get_call_hierarchy_items( - self, - file_path: str, - line: int, - character: int, - wait_for_analysis: float = 2.0, - ) -> List[Dict[str, Any]]: - """Prepare call hierarchy items for a position. 
- - Args: - file_path: Path to the source file - line: Line number (1-indexed) - character: Character position (1-indexed) - wait_for_analysis: Time to wait for server analysis (seconds) - - Returns: - List of CallHierarchyItem dicts - """ - state = await self._get_server(file_path) - if not state: - return [] - - # Check if call hierarchy is supported - if not state.capabilities.get("callHierarchyProvider"): - return [] - - # Open document first - await self._open_document(state, file_path) - - # Wait for language server to complete analysis - # This is critical for Pyright to return valid call hierarchy items - if wait_for_analysis > 0: - await asyncio.sleep(wait_for_analysis) - - result = await self._send_request( - state, - "textDocument/prepareCallHierarchy", - { - "textDocument": self._to_text_document_identifier(file_path), - "position": self._to_position(line, character), - }, - ) - - if not result or not isinstance(result, list): - return [] - - return result - - async def get_incoming_calls( - self, - item: Dict[str, Any], - ) -> List[Dict[str, Any]]: - """Get incoming calls for a call hierarchy item. - - Args: - item: CallHierarchyItem from get_call_hierarchy_items - - Returns: - List of CallHierarchyIncomingCall dicts - """ - # Determine language from item's uri - uri = item.get("uri", "") - file_path = self._normalize_file_path(uri) - - state = await self._get_server(file_path) - if not state: - return [] - - result = await self._send_request( - state, - "callHierarchy/incomingCalls", - {"item": item}, - ) - - if not result or not isinstance(result, list): - return [] - - return result - - async def get_outgoing_calls( - self, - item: Dict[str, Any], - ) -> List[Dict[str, Any]]: - """Get outgoing calls for a call hierarchy item. 
- - Args: - item: CallHierarchyItem from get_call_hierarchy_items - - Returns: - List of CallHierarchyOutgoingCall dicts - """ - # Determine language from item's uri - uri = item.get("uri", "") - file_path = self._normalize_file_path(uri) - - state = await self._get_server(file_path) - if not state: - return [] - - result = await self._send_request( - state, - "callHierarchy/outgoingCalls", - {"item": item}, - ) - - if not result or not isinstance(result, list): - return [] - - return result - - async def __aenter__(self) -> "StandaloneLspManager": - """Async context manager entry.""" - await self.start() - return self - - async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: - """Async context manager exit - stop all servers.""" - await self.stop() - - -# Simple test -if __name__ == "__main__": - async def test_standalone_manager(): - """Test StandaloneLspManager functionality.""" - print("Testing StandaloneLspManager...") - print() - - # Find a Python file to test with - test_file = Path(__file__).resolve() - print(f"Test file: {test_file}") - print() - - async with StandaloneLspManager( - workspace_root=str(test_file.parent.parent.parent.parent), # codex-lens root - timeout=30.0, - ) as manager: - print("1. Testing get_document_symbols...") - symbols = await manager.get_document_symbols(str(test_file)) - print(f" Found {len(symbols)} symbols") - for sym in symbols[:5]: - name = sym.get("name", "?") - kind = sym.get("kind", "?") - print(f" - {name} (kind={kind})") - print() - - print("2. Testing get_definition...") - # Test definition for 'asyncio' import (line 11) - definition = await manager.get_definition(str(test_file), 11, 8) - if definition: - print(f" Definition: {definition}") - else: - print(" No definition found") - print() - - print("3. Testing get_hover...") - hover = await manager.get_hover(str(test_file), 11, 8) - if hover: - print(f" Hover: {hover[:200]}...") - else: - print(" No hover info") - print() - - print("4. 
Testing get_references...") - refs = await manager.get_references(str(test_file), 50, 10) - print(f" Found {len(refs)} references") - for ref in refs[:3]: - print(f" - {ref}") - - print() - print("Test complete!") - - # Run the test - # Note: On Windows, use default ProactorEventLoop (supports subprocess creation) - - asyncio.run(test_standalone_manager()) diff --git a/codex-lens/src/codexlens/mcp/__init__.py b/codex-lens/src/codexlens/mcp/__init__.py deleted file mode 100644 index 5bb171c3..00000000 --- a/codex-lens/src/codexlens/mcp/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -"""Model Context Protocol implementation for Claude Code integration.""" - -from codexlens.mcp.schema import ( - MCPContext, - SymbolInfo, - ReferenceInfo, - RelatedSymbol, -) -from codexlens.mcp.provider import MCPProvider -from codexlens.mcp.hooks import HookManager, create_context_for_prompt - -__all__ = [ - "MCPContext", - "SymbolInfo", - "ReferenceInfo", - "RelatedSymbol", - "MCPProvider", - "HookManager", - "create_context_for_prompt", -] diff --git a/codex-lens/src/codexlens/mcp/hooks.py b/codex-lens/src/codexlens/mcp/hooks.py deleted file mode 100644 index ad6a2021..00000000 --- a/codex-lens/src/codexlens/mcp/hooks.py +++ /dev/null @@ -1,170 +0,0 @@ -"""Hook interfaces for Claude Code integration.""" - -from __future__ import annotations - -import logging -from pathlib import Path -from typing import Any, Dict, Optional, Callable, TYPE_CHECKING - -from codexlens.mcp.schema import MCPContext - -if TYPE_CHECKING: - from codexlens.mcp.provider import MCPProvider - -logger = logging.getLogger(__name__) - - -class HookManager: - """Manages hook registration and execution.""" - - def __init__(self, mcp_provider: "MCPProvider") -> None: - self.mcp_provider = mcp_provider - self._pre_hooks: Dict[str, Callable] = {} - self._post_hooks: Dict[str, Callable] = {} - - # Register default hooks - self._register_default_hooks() - - def _register_default_hooks(self) -> None: - """Register built-in 
hooks.""" - self._pre_hooks["explain"] = self._pre_explain_hook - self._pre_hooks["refactor"] = self._pre_refactor_hook - self._pre_hooks["document"] = self._pre_document_hook - - def execute_pre_hook( - self, - action: str, - params: Dict[str, Any], - ) -> Optional[MCPContext]: - """Execute pre-tool hook to gather context. - - Args: - action: The action being performed (e.g., "explain", "refactor") - params: Parameters for the action - - Returns: - MCPContext to inject into prompt, or None - """ - hook = self._pre_hooks.get(action) - - if not hook: - logger.debug(f"No pre-hook for action: {action}") - return None - - try: - return hook(params) - except Exception as e: - logger.error(f"Pre-hook failed for {action}: {e}") - return None - - def execute_post_hook( - self, - action: str, - result: Any, - ) -> None: - """Execute post-tool hook for proactive caching. - - Args: - action: The action that was performed - result: Result of the action - """ - hook = self._post_hooks.get(action) - - if not hook: - return - - try: - hook(result) - except Exception as e: - logger.error(f"Post-hook failed for {action}: {e}") - - def _pre_explain_hook(self, params: Dict[str, Any]) -> Optional[MCPContext]: - """Pre-hook for 'explain' action.""" - symbol_name = params.get("symbol") - - if not symbol_name: - return None - - return self.mcp_provider.build_context( - symbol_name=symbol_name, - context_type="symbol_explanation", - include_references=True, - include_related=True, - ) - - def _pre_refactor_hook(self, params: Dict[str, Any]) -> Optional[MCPContext]: - """Pre-hook for 'refactor' action.""" - symbol_name = params.get("symbol") - - if not symbol_name: - return None - - return self.mcp_provider.build_context( - symbol_name=symbol_name, - context_type="refactor_context", - include_references=True, - include_related=True, - max_references=20, - ) - - def _pre_document_hook(self, params: Dict[str, Any]) -> Optional[MCPContext]: - """Pre-hook for 'document' action.""" - 
symbol_name = params.get("symbol") - file_path = params.get("file_path") - - if symbol_name: - return self.mcp_provider.build_context( - symbol_name=symbol_name, - context_type="documentation_context", - include_references=False, - include_related=True, - ) - elif file_path: - return self.mcp_provider.build_context_for_file( - Path(file_path), - context_type="file_documentation", - ) - - return None - - def register_pre_hook( - self, - action: str, - hook: Callable[[Dict[str, Any]], Optional[MCPContext]], - ) -> None: - """Register a custom pre-tool hook.""" - self._pre_hooks[action] = hook - - def register_post_hook( - self, - action: str, - hook: Callable[[Any], None], - ) -> None: - """Register a custom post-tool hook.""" - self._post_hooks[action] = hook - - -def create_context_for_prompt( - mcp_provider: "MCPProvider", - action: str, - params: Dict[str, Any], -) -> str: - """Create context string for prompt injection. - - This is the main entry point for Claude Code hook integration. 
- - Args: - mcp_provider: The MCP provider instance - action: Action being performed - params: Action parameters - - Returns: - Formatted context string for prompt injection - """ - manager = HookManager(mcp_provider) - context = manager.execute_pre_hook(action, params) - - if context: - return context.to_prompt_injection() - - return "" diff --git a/codex-lens/src/codexlens/mcp/provider.py b/codex-lens/src/codexlens/mcp/provider.py deleted file mode 100644 index 97ebc055..00000000 --- a/codex-lens/src/codexlens/mcp/provider.py +++ /dev/null @@ -1,202 +0,0 @@ -"""MCP context provider.""" - -from __future__ import annotations - -import logging -from pathlib import Path -from typing import Optional, List, TYPE_CHECKING - -from codexlens.mcp.schema import ( - MCPContext, - SymbolInfo, - ReferenceInfo, - RelatedSymbol, -) - -if TYPE_CHECKING: - from codexlens.storage.global_index import GlobalSymbolIndex - from codexlens.storage.registry import RegistryStore - from codexlens.search.chain_search import ChainSearchEngine - -logger = logging.getLogger(__name__) - - -class MCPProvider: - """Builds MCP context objects from codex-lens data.""" - - def __init__( - self, - global_index: "GlobalSymbolIndex", - search_engine: "ChainSearchEngine", - registry: "RegistryStore", - ) -> None: - self.global_index = global_index - self.search_engine = search_engine - self.registry = registry - - def build_context( - self, - symbol_name: str, - context_type: str = "symbol_explanation", - include_references: bool = True, - include_related: bool = True, - max_references: int = 10, - ) -> Optional[MCPContext]: - """Build comprehensive context for a symbol. 
- - Args: - symbol_name: Name of the symbol to contextualize - context_type: Type of context being requested - include_references: Whether to include reference locations - include_related: Whether to include related symbols - max_references: Maximum number of references to include - - Returns: - MCPContext object or None if symbol not found - """ - # Look up symbol - symbols = self.global_index.search(symbol_name, prefix_mode=False, limit=1) - - if not symbols: - logger.debug(f"Symbol not found for MCP context: {symbol_name}") - return None - - symbol = symbols[0] - - # Build SymbolInfo - symbol_info = SymbolInfo( - name=symbol.name, - kind=symbol.kind, - file_path=symbol.file or "", - line_start=symbol.range[0], - line_end=symbol.range[1], - signature=None, # Symbol entity doesn't have signature - documentation=None, # Symbol entity doesn't have docstring - ) - - # Extract definition source code - definition = self._extract_definition(symbol) - - # Get references - references = [] - if include_references: - refs = self.search_engine.search_references( - symbol_name, - limit=max_references, - ) - references = [ - ReferenceInfo( - file_path=r.file_path, - line=r.line, - column=r.column, - context=r.context, - relationship_type=r.relationship_type, - ) - for r in refs - ] - - # Get related symbols - related_symbols = [] - if include_related: - related_symbols = self._get_related_symbols(symbol) - - return MCPContext( - context_type=context_type, - symbol=symbol_info, - definition=definition, - references=references, - related_symbols=related_symbols, - metadata={ - "source": "codex-lens", - }, - ) - - def _extract_definition(self, symbol) -> Optional[str]: - """Extract source code for symbol definition.""" - try: - file_path = Path(symbol.file) if symbol.file else None - if not file_path or not file_path.exists(): - return None - - content = file_path.read_text(encoding='utf-8', errors='ignore') - lines = content.split("\n") - - start = symbol.range[0] - 1 - end = 
symbol.range[1] - - if start >= len(lines): - return None - - return "\n".join(lines[start:end]) - except Exception as e: - logger.debug(f"Failed to extract definition: {e}") - return None - - def _get_related_symbols(self, symbol) -> List[RelatedSymbol]: - """Get symbols related to the given symbol.""" - related = [] - - try: - # Search for symbols that might be related by name patterns - # This is a simplified implementation - could be enhanced with relationship data - - # Look for imports/callers via reference search - refs = self.search_engine.search_references(symbol.name, limit=20) - - seen_names = set() - for ref in refs: - # Extract potential symbol name from context - if ref.relationship_type and ref.relationship_type not in seen_names: - related.append(RelatedSymbol( - name=f"{Path(ref.file_path).stem}", - kind="module", - relationship=ref.relationship_type, - file_path=ref.file_path, - )) - seen_names.add(ref.relationship_type) - if len(related) >= 10: - break - - except Exception as e: - logger.debug(f"Failed to get related symbols: {e}") - - return related - - def build_context_for_file( - self, - file_path: Path, - context_type: str = "file_overview", - ) -> MCPContext: - """Build context for an entire file.""" - # Try to get symbols by searching with file path - # Note: GlobalSymbolIndex doesn't have search_by_file, so we use a different approach - symbols = [] - - # Search for common symbols that might be in this file - # This is a simplified approach - a full implementation would query by file path - try: - # Use the global index to search for symbols from this file - file_str = str(file_path.resolve()) - # Get all symbols and filter by file path (not efficient but works) - all_symbols = self.global_index.search("", prefix_mode=True, limit=1000) - symbols = [s for s in all_symbols if s.file and str(Path(s.file).resolve()) == file_str] - except Exception as e: - logger.debug(f"Failed to get file symbols: {e}") - - related = [ - RelatedSymbol( - 
name=s.name, - kind=s.kind, - relationship="defines", - ) - for s in symbols - ] - - return MCPContext( - context_type=context_type, - related_symbols=related, - metadata={ - "file_path": str(file_path), - "symbol_count": len(symbols), - }, - ) diff --git a/codex-lens/src/codexlens/mcp/schema.py b/codex-lens/src/codexlens/mcp/schema.py deleted file mode 100644 index 1062e626..00000000 --- a/codex-lens/src/codexlens/mcp/schema.py +++ /dev/null @@ -1,113 +0,0 @@ -"""MCP data models.""" - -from __future__ import annotations - -import json -from dataclasses import dataclass, field, asdict -from typing import List, Optional - - -@dataclass -class SymbolInfo: - """Information about a code symbol.""" - name: str - kind: str - file_path: str - line_start: int - line_end: int - signature: Optional[str] = None - documentation: Optional[str] = None - - def to_dict(self) -> dict: - return {k: v for k, v in asdict(self).items() if v is not None} - - -@dataclass -class ReferenceInfo: - """Information about a symbol reference.""" - file_path: str - line: int - column: int - context: str - relationship_type: str - - def to_dict(self) -> dict: - return asdict(self) - - -@dataclass -class RelatedSymbol: - """Related symbol (import, call target, etc.).""" - name: str - kind: str - relationship: str # "imports", "calls", "inherits", "uses" - file_path: Optional[str] = None - - def to_dict(self) -> dict: - return {k: v for k, v in asdict(self).items() if v is not None} - - -@dataclass -class MCPContext: - """Model Context Protocol context object. - - This is the structured context that gets injected into - LLM prompts to provide code understanding. 
- """ - version: str = "1.0" - context_type: str = "code_context" - symbol: Optional[SymbolInfo] = None - definition: Optional[str] = None - references: List[ReferenceInfo] = field(default_factory=list) - related_symbols: List[RelatedSymbol] = field(default_factory=list) - metadata: dict = field(default_factory=dict) - - def to_dict(self) -> dict: - """Convert to dictionary for JSON serialization.""" - result = { - "version": self.version, - "context_type": self.context_type, - "metadata": self.metadata, - } - - if self.symbol: - result["symbol"] = self.symbol.to_dict() - if self.definition: - result["definition"] = self.definition - if self.references: - result["references"] = [r.to_dict() for r in self.references] - if self.related_symbols: - result["related_symbols"] = [s.to_dict() for s in self.related_symbols] - - return result - - def to_json(self, indent: int = 2) -> str: - """Serialize to JSON string.""" - return json.dumps(self.to_dict(), indent=indent) - - def to_prompt_injection(self) -> str: - """Format for injection into LLM prompt.""" - parts = [""] - - if self.symbol: - parts.append(f"## Symbol: {self.symbol.name}") - parts.append(f"Type: {self.symbol.kind}") - parts.append(f"Location: {self.symbol.file_path}:{self.symbol.line_start}") - - if self.definition: - parts.append("\n## Definition") - parts.append(f"```\n{self.definition}\n```") - - if self.references: - parts.append(f"\n## References ({len(self.references)} found)") - for ref in self.references[:5]: # Limit to 5 - parts.append(f"- {ref.file_path}:{ref.line} ({ref.relationship_type})") - parts.append(f" ```\n {ref.context}\n ```") - - if self.related_symbols: - parts.append("\n## Related Symbols") - for sym in self.related_symbols[:10]: # Limit to 10 - parts.append(f"- {sym.name} ({sym.relationship})") - - parts.append("") - return "\n".join(parts) diff --git a/codex-lens/src/codexlens/parsers/__init__.py b/codex-lens/src/codexlens/parsers/__init__.py deleted file mode 100644 index 
a96ed9a7..00000000 --- a/codex-lens/src/codexlens/parsers/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -"""Parsers for CodexLens.""" - -from __future__ import annotations - -from .factory import ParserFactory -from .astgrep_binding import AstGrepBinding, is_astgrep_available, get_supported_languages - -__all__ = [ - "ParserFactory", - "AstGrepBinding", - "is_astgrep_available", - "get_supported_languages", -] - diff --git a/codex-lens/src/codexlens/parsers/astgrep_binding.py b/codex-lens/src/codexlens/parsers/astgrep_binding.py deleted file mode 100644 index 16985961..00000000 --- a/codex-lens/src/codexlens/parsers/astgrep_binding.py +++ /dev/null @@ -1,320 +0,0 @@ -"""ast-grep based parser binding for CodexLens. - -Provides AST-level pattern matching via ast-grep-py (PyO3 bindings). - -Note: This module wraps the official ast-grep Python bindings for pattern-based -code analysis. If ast-grep-py is unavailable, the parser returns None gracefully. -Callers should use tree-sitter or regex-based fallbacks. -""" - -from __future__ import annotations - -import logging -from pathlib import Path -from typing import Dict, List, Optional, Tuple - -# Import patterns from centralized definition (avoid duplication) -from codexlens.parsers.patterns.python import get_pattern, PYTHON_PATTERNS - -# Graceful import pattern following treesitter_parser.py convention -try: - from ast_grep_py import SgNode, SgRoot - ASTGREP_AVAILABLE = True -except ImportError: - SgNode = None # type: ignore[assignment,misc] - SgRoot = None # type: ignore[assignment,misc] - ASTGREP_AVAILABLE = False - -log = logging.getLogger(__name__) - - -class AstGrepBinding: - """Wrapper for ast-grep-py bindings with CodexLens integration. - - Provides pattern-based AST matching for code relationship extraction. - Uses declarative patterns with metavariables ($A, $$ARGS) for matching. 
- """ - - # Language ID mapping to ast-grep language names - LANGUAGE_MAP = { - "python": "python", - "javascript": "javascript", - "typescript": "typescript", - "tsx": "tsx", - } - - def __init__(self, language_id: str, path: Optional[Path] = None) -> None: - """Initialize ast-grep binding for a language. - - Args: - language_id: Language identifier (python, javascript, typescript, tsx) - path: Optional file path for language variant detection - """ - self.language_id = language_id - self.path = path - self._language: Optional[str] = None - self._root: Optional[SgRoot] = None # type: ignore[valid-type] - - if ASTGREP_AVAILABLE: - self._initialize_language() - - def _initialize_language(self) -> None: - """Initialize ast-grep language setting.""" - # Detect TSX from file extension - if self.language_id == "typescript" and self.path is not None: - if self.path.suffix.lower() == ".tsx": - self._language = "tsx" - return - - self._language = self.LANGUAGE_MAP.get(self.language_id) - - def is_available(self) -> bool: - """Check if ast-grep binding is available and ready. - - Returns: - True if ast-grep-py is installed and language is supported - """ - return ASTGREP_AVAILABLE and self._language is not None - - def parse(self, source_code: str) -> bool: - """Parse source code into ast-grep syntax tree. - - Args: - source_code: Source code text to parse - - Returns: - True if parsing succeeds, False otherwise - """ - if not self.is_available() or SgRoot is None: - return False - - try: - self._root = SgRoot(source_code, self._language) # type: ignore[misc] - return True - except (ValueError, TypeError, RuntimeError) as e: - log.debug(f"ast-grep parse error: {e}") - self._root = None - return False - - def find_all(self, pattern: str) -> List[SgNode]: # type: ignore[valid-type] - """Find all matches for a pattern in the parsed source. 
- - Args: - pattern: ast-grep pattern string (e.g., "class $NAME($$$BASES) $$$BODY") - - Returns: - List of matching SgNode objects, empty if no matches or not parsed - """ - if not self.is_available() or self._root is None: - return [] - - try: - root_node = self._root.root() - # ast-grep-py 0.40+ requires dict config format - config = {"rule": {"pattern": pattern}} - return list(root_node.find_all(config)) - except (ValueError, TypeError, AttributeError) as e: - log.debug(f"ast-grep find_all error: {e}") - return [] - - def find_inheritance(self) -> List[Dict[str, str]]: - """Find all class inheritance declarations. - - Returns: - List of dicts with 'class_name' and 'bases' keys - """ - if self.language_id != "python": - return [] - - matches = self.find_all(get_pattern("class_with_bases")) - results: List[Dict[str, str]] = [] - - for node in matches: - class_name = self._get_match(node, "NAME") - if class_name: - results.append({ - "class_name": class_name, - "bases": self._get_match(node, "BASES"), # Base classes text - }) - - return results - - def find_calls(self) -> List[Dict[str, str]]: - """Find all function/method calls. - - Returns: - List of dicts with 'function' and 'line' keys - """ - if self.language_id != "python": - return [] - - matches = self.find_all(get_pattern("call")) - results: List[Dict[str, str]] = [] - - for node in matches: - func_name = self._get_match(node, "FUNC") - if func_name: - # Skip self. and cls. prefixed calls - base = func_name.split(".", 1)[0] - if base not in {"self", "cls"}: - results.append({ - "function": func_name, - "line": str(self._get_line_number(node)), - }) - - return results - - def find_imports(self) -> List[Dict[str, str]]: - """Find all import statements. 
- - Returns: - List of dicts with 'module' and 'type' keys - """ - if self.language_id != "python": - return [] - - results: List[Dict[str, str]] = [] - - # Find 'import X' statements - import_matches = self.find_all(get_pattern("import_stmt")) - for node in import_matches: - module = self._get_match(node, "MODULE") - if module: - results.append({ - "module": module, - "type": "import", - "line": str(self._get_line_number(node)), - }) - - # Find 'from X import Y' statements - from_matches = self.find_all(get_pattern("import_from")) - for node in from_matches: - module = self._get_match(node, "MODULE") - names = self._get_match(node, "NAMES") - if module: - results.append({ - "module": module, - "names": names or "", - "type": "from_import", - "line": str(self._get_line_number(node)), - }) - - return results - - def _get_match(self, node: SgNode, metavar: str) -> str: # type: ignore[valid-type] - """Extract matched metavariable value from node. - - Args: - node: SgNode with match - metavar: Metavariable name (without $ prefix) - - Returns: - Matched text or empty string - """ - if node is None: - return "" - try: - match = node.get_match(metavar) - if match is not None: - return match.text() - except (ValueError, AttributeError, KeyError) as e: - log.debug(f"ast-grep get_match error for {metavar}: {e}") - return "" - - def _get_node_text(self, node: SgNode) -> str: # type: ignore[valid-type] - """Get full text of a node. - - Args: - node: SgNode to extract text from - - Returns: - Node's text content - """ - if node is None: - return "" - try: - return node.text() - except (ValueError, AttributeError) as e: - log.debug(f"ast-grep get_node_text error: {e}") - return "" - - def _get_line_number(self, node: SgNode) -> int: # type: ignore[valid-type] - """Get starting line number of a node. 
- - Args: - node: SgNode to get line number for - - Returns: - 1-based line number - """ - if node is None: - return 0 - try: - range_info = node.range() - # ast-grep-py 0.40+ returns Range object with .start.line attribute - if hasattr(range_info, 'start') and hasattr(range_info.start, 'line'): - return range_info.start.line + 1 # Convert to 1-based - # Fallback for string format "(0,0)-(1,8)" - if isinstance(range_info, str) and range_info: - start_part = range_info.split('-')[0].strip('()') - start_line = int(start_part.split(',')[0]) - return start_line + 1 - except (ValueError, AttributeError, TypeError, IndexError) as e: - log.debug(f"ast-grep get_line_number error: {e}") - return 0 - - def _get_line_range(self, node: SgNode) -> Tuple[int, int]: # type: ignore[valid-type] - """Get line range (start, end) of a node. - - Args: - node: SgNode to get line range for - - Returns: - Tuple of (start_line, end_line), both 1-based inclusive - """ - if node is None: - return (0, 0) - try: - range_info = node.range() - # ast-grep-py 0.40+ returns Range object with .start.line and .end.line - if hasattr(range_info, 'start') and hasattr(range_info, 'end'): - start_line = getattr(range_info.start, 'line', 0) - end_line = getattr(range_info.end, 'line', 0) - return (start_line + 1, end_line + 1) # Convert to 1-based - # Fallback for string format "(0,0)-(1,8)" - if isinstance(range_info, str) and range_info: - parts = range_info.split('-') - start_part = parts[0].strip('()') - end_part = parts[1].strip('()') - start_line = int(start_part.split(',')[0]) - end_line = int(end_part.split(',')[0]) - return (start_line + 1, end_line + 1) - except (ValueError, AttributeError, TypeError, IndexError) as e: - log.debug(f"ast-grep get_line_range error: {e}") - return (0, 0) - - def get_language(self) -> Optional[str]: - """Get the configured ast-grep language. 
- - Returns: - Language string or None if not configured - """ - return self._language - - -def is_astgrep_available() -> bool: - """Check if ast-grep-py is installed and available. - - Returns: - True if ast-grep bindings can be imported - """ - return ASTGREP_AVAILABLE - - -def get_supported_languages() -> List[str]: - """Get list of supported languages for ast-grep. - - Returns: - List of language identifiers - """ - return list(AstGrepBinding.LANGUAGE_MAP.keys()) diff --git a/codex-lens/src/codexlens/parsers/astgrep_js_ts_processor.py b/codex-lens/src/codexlens/parsers/astgrep_js_ts_processor.py deleted file mode 100644 index beff64d9..00000000 --- a/codex-lens/src/codexlens/parsers/astgrep_js_ts_processor.py +++ /dev/null @@ -1,306 +0,0 @@ -"""Ast-grep processors for JavaScript/TypeScript relationship extraction. - -These processors are intentionally narrower than the tree-sitter relationship -extractor: they focus on stable, high-signal edges for static graph usage: -- IMPORTS: ES module imports + CommonJS require() (string literal only) -- INHERITS: class/interface extends - -They are used when Config.use_astgrep is True. 
-""" - -from __future__ import annotations - -import re -from pathlib import Path -from typing import Callable, List, Optional, Sequence, Set, Tuple - -from codexlens.entities import CodeRelationship, IndexedFile, RelationshipType -from codexlens.parsers.astgrep_processor import BaseAstGrepProcessor - - -_IDENT_RE = re.compile(r"^[A-Za-z_$][A-Za-z0-9_$]*$") -_BRACE_IMPORT_RE = re.compile( - r"\bimport\s+(?:type\s+)?(?:[A-Za-z_$][A-Za-z0-9_$]*\s*,\s*)?\{\s*(?P[^}]*)\}\s*from\b", - re.MULTILINE, -) - - -def _strip_quotes(value: str) -> str: - value = (value or "").strip() - if len(value) >= 2 and value[0] == value[-1] and value[0] in {"'", '"', "`"}: - return value[1:-1] - return value - - -def _module_from_literal(raw: str) -> str: - raw = (raw or "").strip() - if not raw: - return "" - return _strip_quotes(raw).strip() - - -def _extract_named_imports(raw: str) -> List[str]: - raw = (raw or "").strip() - if not raw: - return [] - - # Normalize any surrounding braces the match might include. - if raw.startswith("{") and raw.endswith("}"): - raw = raw[1:-1].strip() - - # Split by commas at top-level; named imports do not nest in JS/TS syntax. - parts = [p.strip() for p in raw.split(",") if p.strip()] - names: List[str] = [] - for part in parts: - # TS: "type Foo" inside braces - if part.startswith("type "): - part = part[5:].strip() - # Handle `foo as bar` (TS) / `foo as bar` (proposed) / `foo as bar`-style text. 
- if " as " in part: - part = part.split(" as ", 1)[0].strip() - if _IDENT_RE.match(part): - names.append(part) - return names - - -def _extract_brace_import_names(statement: str) -> str: - statement = (statement or "").strip() - if not statement: - return "" - match = _BRACE_IMPORT_RE.search(statement) - if not match: - return "" - return (match.group("names") or "").strip() - - -def _dedupe_relationships(rels: Sequence[CodeRelationship]) -> List[CodeRelationship]: - seen: Set[Tuple[str, str, str]] = set() - out: List[CodeRelationship] = [] - for r in rels: - key = (r.source_symbol, r.target_symbol, r.relationship_type.value) - if key in seen: - continue - seen.add(key) - out.append(r) - return out - - -class _AstGrepJsTsProcessor(BaseAstGrepProcessor): - def __init__( - self, - language_id: str, - *, - path: Optional[Path] = None, - get_pattern: Callable[[str], str], - ) -> None: - super().__init__(language_id, path) - self._get_pattern = get_pattern - - def parse(self, text: str, path: Path) -> Optional[IndexedFile]: - if not self.is_available(): - return None - - try: - relationships = self._extract_relationships(text, path) - return IndexedFile( - path=str(path.resolve()), - language=self.language_id, - symbols=[], - chunks=[], - relationships=relationships, - ) - except Exception: - return None - - def process_matches( # type: ignore[override] - self, - matches, # SgNode list (runtime-only type) - source_code: str, - path: Path, - ) -> List[CodeRelationship]: - # Not used by the current JS/TS processors; keep the interface for parity. 
- _ = (matches, source_code, path) - return [] - - def _extract_relationships(self, source_code: str, path: Path) -> List[CodeRelationship]: - source_file = str(path.resolve()) - rels: List[CodeRelationship] = [] - - rels.extend(self._extract_imports(source_code, source_file=source_file)) - rels.extend(self._extract_inherits(source_code, source_file=source_file)) - - return _dedupe_relationships(rels) - - def _extract_imports(self, source_code: str, *, source_file: str) -> List[CodeRelationship]: - rels: List[CodeRelationship] = [] - - def record(module_name: str, line: int) -> None: - if not module_name: - return - rels.append( - CodeRelationship( - source_symbol="", - target_symbol=module_name, - relationship_type=RelationshipType.IMPORTS, - source_file=source_file, - target_file=None, - source_line=line, - ) - ) - - # Any `import ... from "mod"` form - for pat_name in ("import_from_dq", "import_from_sq"): - for node in self.run_ast_grep(source_code, self._get_pattern(pat_name)): - mod = _module_from_literal(self._get_match(node, "MODULE")) - if mod: - record(mod, self._get_line_number(node)) - - # Side-effect import: import "mod" - for pat_name in ("import_side_effect_dq", "import_side_effect_sq"): - for node in self.run_ast_grep(source_code, self._get_pattern(pat_name)): - mod = _module_from_literal(self._get_match(node, "MODULE")) - if mod: - record(mod, self._get_line_number(node)) - - # Named imports (named-only): import { a, b as c } from "mod" - for pat_name in ("import_named_only_dq", "import_named_only_sq"): - for node in self.run_ast_grep(source_code, self._get_pattern(pat_name)): - mod = _module_from_literal(self._get_match(node, "MODULE")) - if not mod: - continue - raw_names = _extract_brace_import_names(self._get_node_text(node)) - for name in _extract_named_imports(raw_names): - record(f"{mod}.{name}", self._get_line_number(node)) - - # Named imports (default + named): import X, { a, b as c } from "mod" - for pat_name in ("import_default_named_dq", 
"import_default_named_sq"): - for node in self.run_ast_grep(source_code, self._get_pattern(pat_name)): - mod = _module_from_literal(self._get_match(node, "MODULE")) - if not mod: - continue - raw_names = _extract_brace_import_names(self._get_node_text(node)) - for name in _extract_named_imports(raw_names): - record(f"{mod}.{name}", self._get_line_number(node)) - - # CommonJS require("mod") (string literal only) - for pat_name in ("require_call_dq", "require_call_sq"): - for node in self.run_ast_grep(source_code, self._get_pattern(pat_name)): - mod = _module_from_literal(self._get_match(node, "MODULE")) - if mod: - record(mod, self._get_line_number(node)) - - return rels - - def _extract_inherits(self, source_code: str, *, source_file: str) -> List[CodeRelationship]: - rels: List[CodeRelationship] = [] - - for node in self.run_ast_grep(source_code, self._get_pattern("class_extends")): - class_name = (self._get_match(node, "NAME") or "").strip() - base_raw = (self._get_match(node, "BASE") or "").strip() - if not class_name or not base_raw: - continue - base = base_raw.split("<", 1)[0].strip() - if not base: - continue - rels.append( - CodeRelationship( - source_symbol=class_name, - target_symbol=base, - relationship_type=RelationshipType.INHERITS, - source_file=source_file, - target_file=None, - source_line=self._get_line_number(node), - ) - ) - - return rels - - -class AstGrepJavaScriptProcessor(_AstGrepJsTsProcessor): - def __init__(self, path: Optional[Path] = None) -> None: - from codexlens.parsers.patterns.javascript import get_pattern as get_js_pattern - - super().__init__("javascript", path=path, get_pattern=get_js_pattern) - - -class AstGrepTypeScriptProcessor(_AstGrepJsTsProcessor): - def __init__(self, path: Optional[Path] = None) -> None: - from codexlens.parsers.patterns.typescript import get_pattern as get_ts_pattern - - super().__init__("typescript", path=path, get_pattern=get_ts_pattern) - - def _extract_inherits(self, source_code: str, *, source_file: 
str) -> List[CodeRelationship]: - rels = super()._extract_inherits(source_code, source_file=source_file) - - # Interface extends: interface Foo extends Bar {} - for node in self.run_ast_grep(source_code, self._get_pattern("interface_extends")): - name = (self._get_match(node, "NAME") or "").strip() - base_raw = (self._get_match(node, "BASE") or "").strip() - if not name or not base_raw: - continue - base = base_raw.split("<", 1)[0].strip() - if not base: - continue - rels.append( - CodeRelationship( - source_symbol=name, - target_symbol=base, - relationship_type=RelationshipType.INHERITS, - source_file=source_file, - target_file=None, - source_line=self._get_line_number(node), - ) - ) - - return _dedupe_relationships(rels) - - def _extract_imports(self, source_code: str, *, source_file: str) -> List[CodeRelationship]: - # Reuse JS logic for standard imports - rels = super()._extract_imports(source_code, source_file=source_file) - - def record(module_name: str, line: int) -> None: - if not module_name: - return - rels.append( - CodeRelationship( - source_symbol="", - target_symbol=module_name, - relationship_type=RelationshipType.IMPORTS, - source_file=source_file, - target_file=None, - source_line=line, - ) - ) - - # Type-only imports: import type ... 
from "mod" - for pat_name in ("import_type_from_dq", "import_type_from_sq"): - for node in self.run_ast_grep(source_code, self._get_pattern(pat_name)): - mod = _module_from_literal(self._get_match(node, "MODULE")) - if mod: - record(mod, self._get_line_number(node)) - - for pat_name in ("import_type_named_only_dq", "import_type_named_only_sq"): - for node in self.run_ast_grep(source_code, self._get_pattern(pat_name)): - mod = _module_from_literal(self._get_match(node, "MODULE")) - if not mod: - continue - raw_names = _extract_brace_import_names(self._get_node_text(node)) - for name in _extract_named_imports(raw_names): - record(f"{mod}.{name}", self._get_line_number(node)) - - for pat_name in ("import_type_default_named_dq", "import_type_default_named_sq"): - for node in self.run_ast_grep(source_code, self._get_pattern(pat_name)): - mod = _module_from_literal(self._get_match(node, "MODULE")) - if not mod: - continue - raw_names = _extract_brace_import_names(self._get_node_text(node)) - for name in _extract_named_imports(raw_names): - record(f"{mod}.{name}", self._get_line_number(node)) - - return _dedupe_relationships(rels) - - -__all__ = [ - "AstGrepJavaScriptProcessor", - "AstGrepTypeScriptProcessor", -] diff --git a/codex-lens/src/codexlens/parsers/astgrep_processor.py b/codex-lens/src/codexlens/parsers/astgrep_processor.py deleted file mode 100644 index 9e2546d0..00000000 --- a/codex-lens/src/codexlens/parsers/astgrep_processor.py +++ /dev/null @@ -1,1033 +0,0 @@ -"""Ast-grep based processor for Python relationship extraction. - -Provides pattern-based AST matching for extracting code relationships -(inheritance, calls, imports) from Python source code. - -This processor wraps the ast-grep-py bindings and provides a higher-level -interface for relationship extraction, similar to TreeSitterSymbolParser. 
- -Design Pattern: - - Follows TreeSitterSymbolParser class structure for consistency - - Uses declarative patterns defined in patterns/python/__init__.py - - Provides scope-aware relationship extraction with alias resolution -""" - -from __future__ import annotations - -from abc import ABC, abstractmethod -from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple - -from codexlens.entities import CodeRelationship, IndexedFile, RelationshipType, Symbol - -# Import patterns module -from codexlens.parsers.patterns.python import ( - PYTHON_PATTERNS, - get_pattern, - get_metavar, -) - -# Graceful import pattern following existing convention -try: - from ast_grep_py import SgNode, SgRoot - from codexlens.parsers.astgrep_binding import AstGrepBinding, ASTGREP_AVAILABLE -except ImportError: - SgNode = None # type: ignore[assignment,misc] - SgRoot = None # type: ignore[assignment,misc] - AstGrepBinding = None # type: ignore[assignment,misc] - ASTGREP_AVAILABLE = False - - -class BaseAstGrepProcessor(ABC): - """Abstract base class for ast-grep based processors. - - Provides common infrastructure for pattern-based AST processing. - Subclasses implement language-specific pattern processing logic. - """ - - def __init__(self, language_id: str, path: Optional[Path] = None) -> None: - """Initialize processor for a language. - - Args: - language_id: Language identifier (python, javascript, typescript) - path: Optional file path for language variant detection - """ - self.language_id = language_id - self.path = path - self._binding: Optional[AstGrepBinding] = None - - if ASTGREP_AVAILABLE and AstGrepBinding is not None: - self._binding = AstGrepBinding(language_id, path) - - def is_available(self) -> bool: - """Check if ast-grep processor is available. 
- - Returns: - True if ast-grep binding is ready - """ - return self._binding is not None and self._binding.is_available() - - def run_ast_grep(self, source_code: str, pattern: str) -> List[SgNode]: # type: ignore[valid-type] - """Execute ast-grep pattern matching on source code. - - Args: - source_code: Source code text to analyze - pattern: ast-grep pattern string - - Returns: - List of matching SgNode objects, empty if no matches or unavailable - """ - if not self.is_available() or self._binding is None: - return [] - - if not self._binding.parse(source_code): - return [] - - return self._binding.find_all(pattern) - - def _get_match(self, node: SgNode, metavar: str) -> str: # type: ignore[valid-type] - """Extract matched metavariable value from node (best-effort).""" - if self._binding is None or node is None: - return "" - return self._binding._get_match(node, metavar) - - def _get_line_number(self, node: SgNode) -> int: # type: ignore[valid-type] - """Get 1-based starting line number of a node (best-effort).""" - if self._binding is None or node is None: - return 0 - return self._binding._get_line_number(node) - - def _get_line_range(self, node: SgNode) -> Tuple[int, int]: # type: ignore[valid-type] - """Get (start_line, end_line) range of a node (best-effort).""" - if self._binding is None or node is None: - return (0, 0) - return self._binding._get_line_range(node) - - def _get_node_text(self, node: SgNode) -> str: # type: ignore[valid-type] - """Get the full text of a node (best-effort).""" - if self._binding is None or node is None: - return "" - return self._binding._get_node_text(node) - - @abstractmethod - def process_matches( - self, - matches: List[SgNode], # type: ignore[valid-type] - source_code: str, - path: Path, - ) -> List[CodeRelationship]: - """Process ast-grep matches into code relationships. 
- - Args: - matches: List of matched SgNode objects - source_code: Original source code - path: File path being processed - - Returns: - List of extracted code relationships - """ - pass - - @abstractmethod - def parse(self, text: str, path: Path) -> Optional[IndexedFile]: - """Parse source code and extract relationships. - - Args: - text: Source code text - path: File path - - Returns: - IndexedFile with symbols and relationships, None if unavailable - """ - pass - - -class AstGrepPythonProcessor(BaseAstGrepProcessor): - """Python-specific ast-grep processor for relationship extraction. - - Extracts INHERITS, CALLS, and IMPORTS relationships from Python code - using declarative ast-grep patterns with scope-aware processing. - """ - - def __init__(self, path: Optional[Path] = None) -> None: - """Initialize Python processor. - - Args: - path: Optional file path (for consistency with base class) - """ - super().__init__("python", path) - - def parse(self, text: str, path: Path) -> Optional[IndexedFile]: - """Parse Python source code and extract relationships. - - Args: - text: Python source code text - path: File path - - Returns: - IndexedFile with symbols and relationships, None if unavailable - """ - if not self.is_available(): - return None - - try: - symbols = self._extract_symbols(text) - relationships = self._extract_relationships(text, path) - - return IndexedFile( - path=str(path.resolve()), - language="python", - symbols=symbols, - chunks=[], - relationships=relationships, - ) - except (ValueError, TypeError, AttributeError) as e: - # Log specific parsing errors for debugging - import logging - logging.getLogger(__name__).debug(f"ast-grep parsing error: {e}") - return None - - def _extract_symbols(self, source_code: str) -> List[Symbol]: - """Extract Python symbols (classes, functions, methods). 
- - Args: - source_code: Python source code - - Returns: - List of Symbol objects - """ - symbols: List[Symbol] = [] - - # Collect all scope definitions with line ranges for proper method detection - # Format: (start_line, end_line, kind, name) - scope_defs: List[Tuple[int, int, str, str]] = [] - - # Track async function positions to avoid duplicates - async_positions: set = set() - - # Extract class definitions - class_matches = self.run_ast_grep(source_code, get_pattern("class_def")) - for node in class_matches: - name = self._get_match(node, "NAME") - if name: - start_line, end_line = self._get_line_range(node) - scope_defs.append((start_line, end_line, "class", name)) - - # Extract async function definitions FIRST (before regular functions) - async_matches = self.run_ast_grep(source_code, get_pattern("async_func_def")) - for node in async_matches: - name = self._get_match(node, "NAME") - if name: - start_line, end_line = self._get_line_range(node) - scope_defs.append((start_line, end_line, "function", name)) - async_positions.add(start_line) # Mark this position as async - - # Extract function definitions (skip those already captured as async) - func_matches = self.run_ast_grep(source_code, get_pattern("func_def")) - for node in func_matches: - name = self._get_match(node, "NAME") - if name: - start_line, end_line = self._get_line_range(node) - # Skip if already captured as async function (same position) - if start_line not in async_positions: - scope_defs.append((start_line, end_line, "function", name)) - - # Sort by start line for scope-aware processing - scope_defs.sort(key=lambda x: x[0]) - - # Process with scope tracking to determine method vs function - scope_stack: List[Tuple[str, int, str]] = [] # (name, end_line, kind) - - for start_line, end_line, kind, name in scope_defs: - # Pop scopes that have ended - while scope_stack and scope_stack[-1][1] < start_line: - scope_stack.pop() - - if kind == "class": - symbols.append(Symbol( - name=name, - 
kind="class", - range=(start_line, end_line), - )) - scope_stack.append((name, end_line, "class")) - else: # function - # Determine if it's a method (inside a class) or function - is_method = bool(scope_stack) and scope_stack[-1][2] == "class" - symbols.append(Symbol( - name=name, - kind="method" if is_method else "function", - range=(start_line, end_line), - )) - scope_stack.append((name, end_line, "function")) - - return symbols - - def _extract_relationships(self, source_code: str, path: Path) -> List[CodeRelationship]: - """Extract code relationships with scope and alias resolution. - - Args: - source_code: Python source code - path: File path - - Returns: - List of CodeRelationship objects - """ - if not self.is_available() or self._binding is None: - return [] - - source_file = str(path.resolve()) - - # Collect all matches with line numbers and end lines for scope processing - # Format: (start_line, end_line, match_type, symbol, node) - all_matches: List[Tuple[int, int, str, str, Any]] = [] - - # Get class definitions (with and without bases) for scope tracking - class_with_bases = self.run_ast_grep(source_code, get_pattern("class_with_bases")) - for node in class_with_bases: - class_name = self._get_match(node, "NAME") - start_line, end_line = self._get_line_range(node) - if class_name: - # Record class scope and inheritance - all_matches.append((start_line, end_line, "class_def", class_name, node)) - # Extract bases from node text (ast-grep-py 0.40+ doesn't capture $$$) - node_text = self._binding._get_node_text(node) if self._binding else "" - bases_text = self._extract_bases_from_class_text(node_text) - if bases_text: - # Also record inheritance relationship - all_matches.append((start_line, end_line, "inherits", bases_text, node)) - - # Get classes without bases for scope tracking - class_no_bases = self.run_ast_grep(source_code, get_pattern("class_def")) - for node in class_no_bases: - class_name = self._get_match(node, "NAME") - start_line, end_line = 
self._get_line_range(node) - if class_name: - # Check if not already recorded (avoid duplicates from class_with_bases) - existing = [m for m in all_matches if m[2] == "class_def" and m[3] == class_name and m[0] == start_line] - if not existing: - all_matches.append((start_line, end_line, "class_def", class_name, node)) - - # Get function definitions for scope tracking - func_matches = self.run_ast_grep(source_code, get_pattern("func_def")) - for node in func_matches: - func_name = self._get_match(node, "NAME") - start_line, end_line = self._get_line_range(node) - if func_name: - all_matches.append((start_line, end_line, "func_def", func_name, node)) - - # Get async function definitions for scope tracking - async_func_matches = self.run_ast_grep(source_code, get_pattern("async_func_def")) - for node in async_func_matches: - func_name = self._get_match(node, "NAME") - start_line, end_line = self._get_line_range(node) - if func_name: - all_matches.append((start_line, end_line, "func_def", func_name, node)) - - # Get import matches (process import_with_alias first to avoid duplicates) - import_alias_positions: set = set() - - # Process import with alias: import X as Y - import_alias_matches = self.run_ast_grep(source_code, get_pattern("import_with_alias")) - for node in import_alias_matches: - module = self._get_match(node, "MODULE") - alias = self._get_match(node, "ALIAS") - start_line, end_line = self._get_line_range(node) - if module and alias: - import_alias_positions.add(start_line) - all_matches.append((start_line, end_line, "import_alias", f"{module}:{alias}", node)) - - # Process simple imports: import X (skip lines with aliases) - import_matches = self.run_ast_grep(source_code, get_pattern("import_stmt")) - for node in import_matches: - module = self._get_match(node, "MODULE") - start_line, end_line = self._get_line_range(node) - if module and start_line not in import_alias_positions: - all_matches.append((start_line, end_line, "import", module, node)) - - 
from_matches = self.run_ast_grep(source_code, get_pattern("import_from")) - for node in from_matches: - module = self._get_match(node, "MODULE") - names = self._get_match(node, "NAMES") - # Prefer parsing from full node text to handle multiple imports - # (ast-grep-py capture may only include the first name). - try: - node_text = self._binding._get_node_text(node) if self._binding else "" - except Exception: - node_text = "" - parsed_names = self._extract_import_names_from_text(node_text) if node_text else "" - if parsed_names: - names = parsed_names - start_line, end_line = self._get_line_range(node) - if module: - all_matches.append((start_line, end_line, "from_import", f"{module}:{names}", node)) - - # Get call matches - call_matches = self.run_ast_grep(source_code, get_pattern("call")) - for node in call_matches: - func = self._get_match(node, "FUNC") - start_line, end_line = self._get_line_range(node) - if func: - # Skip self. and cls. prefixed calls - base = func.split(".", 1)[0] - if base not in {"self", "cls"}: - all_matches.append((start_line, end_line, "call", func, node)) - - # Sort by start line number for scope processing - all_matches.sort(key=lambda x: (x[0], x[2] == "call")) # Process scope defs before calls on same line - - # Process with scope tracking - relationships = self._process_scope_and_aliases(all_matches, source_file) - - return relationships - - def _process_scope_and_aliases( - self, - matches: List[Tuple[int, int, str, str, Any]], - source_file: str, - ) -> List[CodeRelationship]: - """Process matches with scope and alias resolution. 
- - Implements proper scope tracking similar to treesitter_parser.py: - - Maintains scope_stack for tracking current scope (class/function names) - - Maintains alias_stack with per-scope alias mappings (inherited from parent) - - Pops scopes when current line passes their end line - - Resolves call targets using current scope's alias map - - Args: - matches: Sorted list of (start_line, end_line, type, symbol, node) tuples - source_file: Source file path - - Returns: - List of resolved CodeRelationship objects - """ - relationships: List[CodeRelationship] = [] - - # Scope stack: list of (name, end_line) tuples - scope_stack: List[Tuple[str, int]] = [("", float("inf"))] - - # Alias stack: list of alias dicts, one per scope level - # Each new scope inherits parent's aliases (copy on write) - alias_stack: List[Dict[str, str]] = [{}] - - def get_current_scope() -> str: - """Get the name of the current (innermost) scope.""" - return scope_stack[-1][0] - - def pop_scopes_before(line: int) -> None: - """Pop all scopes that have ended before the given line.""" - while len(scope_stack) > 1 and scope_stack[-1][1] < line: - scope_stack.pop() - alias_stack.pop() - - def push_scope(name: str, end_line: int) -> None: - """Push a new scope onto the stack.""" - scope_stack.append((name, end_line)) - # Copy parent scope's aliases for inheritance - alias_stack.append(dict(alias_stack[-1])) - - def update_aliases(updates: Dict[str, str]) -> None: - """Update current scope's alias map.""" - alias_stack[-1].update(updates) - - def resolve_alias(symbol: str) -> str: - """Resolve a symbol using current scope's alias map.""" - if "." 
not in symbol: - # Simple name - check if it's an alias - return alias_stack[-1].get(symbol, symbol) - - # Dotted name - resolve the base - parts = symbol.split(".", 1) - base = parts[0] - rest = parts[1] - - if base in alias_stack[-1]: - return f"{alias_stack[-1][base]}.{rest}" - return symbol - - for start_line, end_line, match_type, symbol, node in matches: - # Pop any scopes that have ended - pop_scopes_before(start_line) - - if match_type == "class_def": - # Push class scope - push_scope(symbol, end_line) - - elif match_type == "func_def": - # Push function scope - push_scope(symbol, end_line) - - elif match_type == "inherits": - # Record inheritance relationship - # Parse base classes from the bases text - base_classes = self._parse_base_classes(symbol) - for base_class in base_classes: - base_class = base_class.strip() - if base_class: - # Resolve alias for base class - resolved_base = resolve_alias(base_class) - relationships.append(CodeRelationship( - source_symbol=get_current_scope(), - target_symbol=resolved_base, - relationship_type=RelationshipType.INHERITS, - source_file=source_file, - target_file=None, - source_line=start_line, - )) - - elif match_type == "import": - # Process simple import statement - module = symbol - # Simple import: add base name to alias map - base_name = module.split(".", 1)[0] - update_aliases({base_name: module}) - relationships.append(CodeRelationship( - source_symbol=get_current_scope(), - target_symbol=module, - relationship_type=RelationshipType.IMPORTS, - source_file=source_file, - target_file=None, - source_line=start_line, - )) - - elif match_type == "import_alias": - # Process import with alias: import X as Y - parts = symbol.split(":", 1) - module = parts[0] - alias = parts[1] if len(parts) > 1 else "" - if alias: - update_aliases({alias: module}) - relationships.append(CodeRelationship( - source_symbol=get_current_scope(), - target_symbol=module, - relationship_type=RelationshipType.IMPORTS, - 
source_file=source_file, - target_file=None, - source_line=start_line, - )) - - elif match_type == "from_import": - # Process from-import statement - parts = symbol.split(":", 1) - module = parts[0] - names = parts[1] if len(parts) > 1 else "" - - names = (names or "").strip() - if names.startswith("(") and names.endswith(")"): - names = names[1:-1].strip() - - # Record IMPORTS edges for the imported names (module.symbol), and - # update aliases for call/usage resolution. - if names and names != "*": - for name in names.split(","): - name = name.strip() - if not name or name == "*": - continue - - if " as " in name: - as_parts = name.split(" as ", 1) - original = as_parts[0].strip() - alias = as_parts[1].strip() - if not original: - continue - target = f"{module}.{original}" if module else original - if alias: - update_aliases({alias: target}) - relationships.append(CodeRelationship( - source_symbol=get_current_scope(), - target_symbol=target, - relationship_type=RelationshipType.IMPORTS, - source_file=source_file, - target_file=None, - source_line=start_line, - )) - else: - target = f"{module}.{name}" if module else name - update_aliases({name: target}) - relationships.append(CodeRelationship( - source_symbol=get_current_scope(), - target_symbol=target, - relationship_type=RelationshipType.IMPORTS, - source_file=source_file, - target_file=None, - source_line=start_line, - )) - - elif match_type == "call": - # Resolve alias for call target - resolved = resolve_alias(symbol) - relationships.append(CodeRelationship( - source_symbol=get_current_scope(), - target_symbol=resolved, - relationship_type=RelationshipType.CALL, - source_file=source_file, - target_file=None, - source_line=start_line, - )) - - return relationships - - def process_matches( - self, - matches: List[SgNode], # type: ignore[valid-type] - source_code: str, - path: Path, - ) -> List[CodeRelationship]: - """Process ast-grep matches into code relationships. 
- - This is a simplified interface for direct match processing. - For full relationship extraction with scope tracking, use parse(). - - Args: - matches: List of matched SgNode objects - source_code: Original source code - path: File path being processed - - Returns: - List of extracted code relationships - """ - if not self.is_available() or self._binding is None: - return [] - - source_file = str(path.resolve()) - relationships: List[CodeRelationship] = [] - - for node in matches: - # Default to call relationship for generic matches - func = self._get_match(node, "FUNC") - line = self._get_line_number(node) - if func: - base = func.split(".", 1)[0] - if base not in {"self", "cls"}: - relationships.append(CodeRelationship( - source_symbol="", - target_symbol=func, - relationship_type=RelationshipType.CALL, - source_file=source_file, - target_file=None, - source_line=line, - )) - - return relationships - - def _get_match(self, node: SgNode, metavar: str) -> str: # type: ignore[valid-type] - """Extract matched metavariable value from node. - - Args: - node: SgNode with match - metavar: Metavariable name (without $ prefix) - - Returns: - Matched text or empty string - """ - if self._binding is None or node is None: - return "" - return self._binding._get_match(node, metavar) - - def _get_line_number(self, node: SgNode) -> int: # type: ignore[valid-type] - """Get starting line number of a node. - - Args: - node: SgNode to get line number for - - Returns: - 1-based line number - """ - if self._binding is None or node is None: - return 0 - return self._binding._get_line_number(node) - - def _get_line_range(self, node: SgNode) -> Tuple[int, int]: # type: ignore[valid-type] - """Get line range for a node. 
- - Args: - node: SgNode to get range for - - Returns: - (start_line, end_line) tuple, 1-based inclusive - """ - if self._binding is None or node is None: - return (0, 0) - return self._binding._get_line_range(node) - - - # ========================================================================= - # Dedicated extraction methods for INHERITS, CALL, IMPORTS relationships - # ========================================================================= - - def extract_inherits( - self, - source_code: str, - source_file: str, - source_symbol: str = "", - ) -> List[CodeRelationship]: - """Extract INHERITS relationships from Python code. - - Identifies class inheritance patterns including: - - Single inheritance: class Child(Parent): - - Multiple inheritance: class Child(A, B, C): - - Args: - source_code: Python source code to analyze - source_file: Path to the source file - source_symbol: The containing scope (class or module) - - Returns: - List of CodeRelationship objects with INHERITS type - """ - if not self.is_available(): - return [] - - relationships: List[CodeRelationship] = [] - - # Use class_with_bases pattern to find classes with inheritance - matches = self.run_ast_grep(source_code, get_pattern("class_with_bases")) - - for node in matches: - class_name = self._get_match(node, "NAME") - line = self._get_line_number(node) - - if class_name: - # Extract bases from the node text (first line: "class ClassName(Base1, Base2):") - # ast-grep-py 0.40+ doesn't capture $$$ multi-matches, so parse from text - node_text = self._binding._get_node_text(node) if self._binding else "" - bases_text = self._extract_bases_from_class_text(node_text) - - if bases_text: - # Parse individual base classes from the bases text - base_classes = self._parse_base_classes(bases_text) - - for base_class in base_classes: - base_class = base_class.strip() - if base_class: - relationships.append(CodeRelationship( - source_symbol=class_name, - target_symbol=base_class, - 
relationship_type=RelationshipType.INHERITS, - source_file=source_file, - target_file=None, - source_line=line, - )) - - return relationships - - def _extract_bases_from_class_text(self, class_text: str) -> str: - """Extract base classes text from class definition. - - Args: - class_text: Full text of class definition (e.g., "class Dog(Animal):\\n pass") - - Returns: - Text inside parentheses (e.g., "Animal") or empty string - """ - import re - # Match "class Name(BASES):" - extract BASES - match = re.search(r'class\s+\w+\s*\(([^)]*)\)\s*:', class_text) - if match: - return match.group(1).strip() - return "" - - def _extract_import_names_from_text(self, import_text: str) -> str: - """Extract imported names from from-import statement. - - Args: - import_text: Full text of import statement (e.g., "from typing import List, Dict") - - Returns: - Names text (e.g., "List, Dict") or empty string - """ - import re - # Match "from MODULE import NAMES" - extract NAMES - match = re.search(r'from\s+[\w.]+\s+import\s+(.+)$', import_text, re.MULTILINE) - if match: - return match.group(1).strip() - return "" - - def extract_calls( - self, - source_code: str, - source_file: str, - source_symbol: str = "", - alias_map: Optional[Dict[str, str]] = None, - ) -> List[CodeRelationship]: - """Extract CALL relationships from Python code. 
- - Identifies function and method call patterns including: - - Simple calls: func() - - Calls with arguments: func(arg1, arg2) - - Method calls: obj.method() - - Chained calls: obj.method1().method2() - - Args: - source_code: Python source code to analyze - source_file: Path to the source file - source_symbol: The containing scope (class or module) - alias_map: Optional alias map for resolving imported names - - Returns: - List of CodeRelationship objects with CALL type - """ - if not self.is_available(): - return [] - - relationships: List[CodeRelationship] = [] - alias_map = alias_map or {} - - # Use the generic call pattern - matches = self.run_ast_grep(source_code, get_pattern("call")) - - for node in matches: - func = self._get_match(node, "FUNC") - line = self._get_line_number(node) - - if func: - # Skip self. and cls. prefixed calls (internal method calls) - base = func.split(".", 1)[0] - if base in {"self", "cls", "super"}: - continue - - # Resolve alias if available - resolved = self._resolve_call_alias(func, alias_map) - - relationships.append(CodeRelationship( - source_symbol=source_symbol, - target_symbol=resolved, - relationship_type=RelationshipType.CALL, - source_file=source_file, - target_file=None, - source_line=line, - )) - - return relationships - - def extract_imports( - self, - source_code: str, - source_file: str, - source_symbol: str = "", - ) -> Tuple[List[CodeRelationship], Dict[str, str]]: - """Extract IMPORTS relationships from Python code. 
- - Identifies import patterns including: - - Simple import: import os - - Import with alias: import numpy as np - - From import: from typing import List - - From import with alias: from collections import defaultdict as dd - - Relative import: from .module import func - - Star import: from module import * - - Args: - source_code: Python source code to analyze - source_file: Path to the source file - source_symbol: The containing scope (class or module) - - Returns: - Tuple of: - - List of CodeRelationship objects with IMPORTS type - - Dict mapping local names to fully qualified module names (alias map) - """ - if not self.is_available(): - return [], {} - - relationships: List[CodeRelationship] = [] - alias_map: Dict[str, str] = {} - - # Track processed lines to avoid duplicates - processed_lines: set = set() - - # Process import with alias FIRST: import X as Y - alias_matches = self.run_ast_grep(source_code, get_pattern("import_with_alias")) - for node in alias_matches: - module = self._get_match(node, "MODULE") - alias = self._get_match(node, "ALIAS") - line = self._get_line_number(node) - - if module and alias: - alias_map[alias] = module - processed_lines.add(line) - - relationships.append(CodeRelationship( - source_symbol=source_symbol, - target_symbol=module, - relationship_type=RelationshipType.IMPORTS, - source_file=source_file, - target_file=None, - source_line=line, - )) - - # Process simple imports: import X (skip lines already processed) - import_matches = self.run_ast_grep(source_code, get_pattern("import_stmt")) - for node in import_matches: - module = self._get_match(node, "MODULE") - line = self._get_line_number(node) - - if module and line not in processed_lines: - # Add to alias map: first part of module - base_name = module.split(".", 1)[0] - alias_map[base_name] = module - - relationships.append(CodeRelationship( - source_symbol=source_symbol, - target_symbol=module, - relationship_type=RelationshipType.IMPORTS, - source_file=source_file, - 
target_file=None, - source_line=line, - )) - - # Process from imports: from X import Y - from_matches = self.run_ast_grep(source_code, get_pattern("import_from")) - for node in from_matches: - module = self._get_match(node, "MODULE") - line = self._get_line_number(node) - - if module: - # Add relationship for the module - relationships.append(CodeRelationship( - source_symbol=source_symbol, - target_symbol=module, - relationship_type=RelationshipType.IMPORTS, - source_file=source_file, - target_file=None, - source_line=line, - )) - - # Parse names from node text (ast-grep-py 0.40+ doesn't capture $$$ multi-match) - node_text = self._binding._get_node_text(node) if self._binding else "" - names = self._extract_import_names_from_text(node_text) - - # Add aliases for imported names - if names and names != "*": - for name in names.split(","): - name = name.strip() - # Handle "name as alias" syntax - if " as " in name: - parts = name.split(" as ") - original = parts[0].strip() - alias = parts[1].strip() - alias_map[alias] = f"{module}.{original}" - elif name: - alias_map[name] = f"{module}.{name}" - - # Process star imports: from X import * - star_matches = self.run_ast_grep(source_code, get_pattern("from_import_star")) - for node in star_matches: - module = self._get_match(node, "MODULE") - line = self._get_line_number(node) - - if module: - relationships.append(CodeRelationship( - source_symbol=source_symbol, - target_symbol=f"{module}.*", - relationship_type=RelationshipType.IMPORTS, - source_file=source_file, - target_file=None, - source_line=line, - )) - - # Process relative imports: from .X import Y - relative_matches = self.run_ast_grep(source_code, get_pattern("relative_import")) - for node in relative_matches: - module = self._get_match(node, "MODULE") - names = self._get_match(node, "NAMES") - line = self._get_line_number(node) - - # Prepend dot for relative module path - rel_module = f".{module}" if module else "." 
- - relationships.append(CodeRelationship( - source_symbol=source_symbol, - target_symbol=rel_module, - relationship_type=RelationshipType.IMPORTS, - source_file=source_file, - target_file=None, - source_line=line, - )) - - return relationships, alias_map - - # ========================================================================= - # Helper methods for pattern processing - # ========================================================================= - - def _parse_base_classes(self, bases_text: str) -> List[str]: - """Parse base class names from inheritance text. - - Handles single and multiple inheritance with proper comma splitting. - Accounts for nested parentheses and complex type annotations. - - Args: - bases_text: Text inside the parentheses of class definition - - Returns: - List of base class names - """ - if not bases_text: - return [] - - # Simple comma split (may not handle all edge cases) - bases = [] - depth = 0 - current = [] - - for char in bases_text: - if char == "(": - depth += 1 - current.append(char) - elif char == ")": - depth -= 1 - current.append(char) - elif char == "," and depth == 0: - base = "".join(current).strip() - if base: - bases.append(base) - current = [] - else: - current.append(char) - - # Add the last base class - if current: - base = "".join(current).strip() - if base: - bases.append(base) - - return bases - - def _resolve_call_alias(self, func_name: str, alias_map: Dict[str, str]) -> str: - """Resolve a function call name using import aliases. - - Args: - func_name: The function/method name as it appears in code - alias_map: Mapping of local names to fully qualified names - - Returns: - Resolved function name (fully qualified if possible) - """ - if "." 
not in func_name: - # Simple function call - check if it's an alias - return alias_map.get(func_name, func_name) - - # Method call or qualified name - resolve the base - parts = func_name.split(".", 1) - base = parts[0] - rest = parts[1] - - if base in alias_map: - return f"{alias_map[base]}.{rest}" - - return func_name - - -def is_astgrep_processor_available() -> bool: - """Check if ast-grep processor is available. - - Returns: - True if ast-grep-py is installed and processor can be used - """ - return ASTGREP_AVAILABLE - - -__all__ = [ - "BaseAstGrepProcessor", - "AstGrepPythonProcessor", - "is_astgrep_processor_available", -] diff --git a/codex-lens/src/codexlens/parsers/encoding.py b/codex-lens/src/codexlens/parsers/encoding.py deleted file mode 100644 index b796d24b..00000000 --- a/codex-lens/src/codexlens/parsers/encoding.py +++ /dev/null @@ -1,202 +0,0 @@ -"""Optional encoding detection module for CodexLens. - -Provides automatic encoding detection with graceful fallback to UTF-8. -Install with: pip install codexlens[encoding] -""" - -from __future__ import annotations - -import logging -from pathlib import Path -from typing import Tuple, Optional - -log = logging.getLogger(__name__) - -# Feature flag for encoding detection availability -ENCODING_DETECTION_AVAILABLE = False -_import_error: Optional[str] = None - - -def _detect_chardet_backend() -> Tuple[bool, Optional[str]]: - """Detect if chardet or charset-normalizer is available.""" - try: - import chardet - return True, None - except ImportError: - pass - - try: - from charset_normalizer import from_bytes - return True, None - except ImportError: - pass - - return False, "chardet not available. Install with: pip install codexlens[encoding]" - - -# Initialize on module load -ENCODING_DETECTION_AVAILABLE, _import_error = _detect_chardet_backend() - - -def check_encoding_available() -> Tuple[bool, Optional[str]]: - """Check if encoding detection dependencies are available. 
- - Returns: - Tuple of (available, error_message) - """ - return ENCODING_DETECTION_AVAILABLE, _import_error - - -def detect_encoding(content_bytes: bytes, confidence_threshold: float = 0.7) -> str: - """Detect encoding from file content bytes. - - Uses chardet or charset-normalizer with configurable confidence threshold. - Falls back to UTF-8 if confidence is too low or detection unavailable. - - Args: - content_bytes: Raw file content as bytes - confidence_threshold: Minimum confidence (0.0-1.0) to accept detection - - Returns: - Detected encoding name (e.g., 'utf-8', 'iso-8859-1', 'gbk') - Returns 'utf-8' as fallback if detection fails or confidence too low - """ - if not ENCODING_DETECTION_AVAILABLE: - log.debug("Encoding detection not available, using UTF-8 fallback") - return "utf-8" - - if not content_bytes: - return "utf-8" - - try: - # Try chardet first - try: - import chardet - result = chardet.detect(content_bytes) - encoding = result.get("encoding") - confidence = result.get("confidence", 0.0) - - if encoding and confidence >= confidence_threshold: - log.debug(f"Detected encoding: {encoding} (confidence: {confidence:.2f})") - # Normalize encoding name: replace underscores with hyphens - return encoding.lower().replace('_', '-') - else: - log.debug( - f"Low confidence encoding detection: {encoding} " - f"(confidence: {confidence:.2f}), using UTF-8 fallback" - ) - return "utf-8" - except ImportError: - pass - - # Fallback to charset-normalizer - try: - from charset_normalizer import from_bytes - results = from_bytes(content_bytes) - if results: - best = results.best() - if best and best.encoding: - log.debug(f"Detected encoding via charset-normalizer: {best.encoding}") - # Normalize encoding name: replace underscores with hyphens - return best.encoding.lower().replace('_', '-') - except ImportError: - pass - - except Exception as e: - log.warning(f"Encoding detection failed: {e}, using UTF-8 fallback") - - return "utf-8" - - -def read_file_safe( - path: 
Path | str, - confidence_threshold: float = 0.7, - max_detection_bytes: int = 100_000 -) -> Tuple[str, str]: - """Read file with automatic encoding detection and safe decoding. - - Reads file bytes, detects encoding, and decodes with error replacement - to preserve file structure even with encoding issues. - - Args: - path: Path to file to read - confidence_threshold: Minimum confidence for encoding detection - max_detection_bytes: Maximum bytes to use for encoding detection (default 100KB) - - Returns: - Tuple of (content, detected_encoding) - - content: Decoded file content (with � for unmappable bytes) - - detected_encoding: Detected encoding name - - Raises: - OSError: If file cannot be read - IsADirectoryError: If path is a directory - """ - file_path = Path(path) if isinstance(path, str) else path - - # Read file bytes - try: - content_bytes = file_path.read_bytes() - except Exception as e: - log.error(f"Failed to read file {file_path}: {e}") - raise - - # Detect encoding from first N bytes for performance - detection_sample = content_bytes[:max_detection_bytes] if len(content_bytes) > max_detection_bytes else content_bytes - encoding = detect_encoding(detection_sample, confidence_threshold) - - # Decode with error replacement to preserve structure - try: - content = content_bytes.decode(encoding, errors='replace') - log.debug(f"Successfully decoded {file_path} using {encoding}") - return content, encoding - except Exception as e: - # Final fallback to UTF-8 with replacement - log.warning(f"Failed to decode {file_path} with {encoding}, using UTF-8: {e}") - content = content_bytes.decode('utf-8', errors='replace') - return content, 'utf-8' - - -def is_binary_file(path: Path | str, sample_size: int = 8192) -> bool: - """Check if file is likely binary by sampling first bytes. - - Uses heuristic: if >30% of sample bytes are null or non-text, consider binary. 
- - Args: - path: Path to file to check - sample_size: Number of bytes to sample (default 8KB) - - Returns: - True if file appears to be binary, False otherwise - """ - file_path = Path(path) if isinstance(path, str) else path - - try: - with file_path.open('rb') as f: - sample = f.read(sample_size) - - if not sample: - return False - - # Count null bytes and non-printable characters - null_count = sample.count(b'\x00') - non_text_count = sum(1 for byte in sample if byte < 0x20 and byte not in (0x09, 0x0a, 0x0d)) - - # If >30% null bytes or >50% non-text, consider binary - null_ratio = null_count / len(sample) - non_text_ratio = non_text_count / len(sample) - - return null_ratio > 0.3 or non_text_ratio > 0.5 - - except Exception as e: - log.debug(f"Binary check failed for {file_path}: {e}, assuming text") - return False - - -__all__ = [ - "ENCODING_DETECTION_AVAILABLE", - "check_encoding_available", - "detect_encoding", - "read_file_safe", - "is_binary_file", -] diff --git a/codex-lens/src/codexlens/parsers/factory.py b/codex-lens/src/codexlens/parsers/factory.py deleted file mode 100644 index 5b07a4bc..00000000 --- a/codex-lens/src/codexlens/parsers/factory.py +++ /dev/null @@ -1,393 +0,0 @@ -"""Parser factory for CodexLens. - -Python and JavaScript/TypeScript parsing use Tree-Sitter grammars when -available. Regex fallbacks are retained to preserve the existing parser -interface and behavior in minimal environments. -""" - -from __future__ import annotations - -import re -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List, Optional, Protocol - -from codexlens.config import Config -from codexlens.entities import CodeRelationship, IndexedFile, RelationshipType, Symbol -from codexlens.parsers.treesitter_parser import TreeSitterSymbolParser - - -class Parser(Protocol): - def parse(self, text: str, path: Path) -> IndexedFile: ... 
- - -@dataclass -class SimpleRegexParser: - language_id: str - config: Optional[Config] = None - - def parse(self, text: str, path: Path) -> IndexedFile: - # Try tree-sitter first for supported languages - if self.language_id in {"python", "javascript", "typescript"}: - ts_parser = TreeSitterSymbolParser( - self.language_id, - path, - config=self.config, - ) - if ts_parser.is_available(): - indexed = ts_parser.parse(text, path) - if indexed is not None: - return indexed - - # Fallback to regex parsing - if self.language_id == "python": - symbols = _parse_python_symbols_regex(text) - relationships = _parse_python_relationships_regex(text, path) - elif self.language_id in {"javascript", "typescript"}: - symbols = _parse_js_ts_symbols_regex(text) - relationships = _parse_js_ts_relationships_regex(text, path) - elif self.language_id == "java": - symbols = _parse_java_symbols(text) - relationships = [] - elif self.language_id == "go": - symbols = _parse_go_symbols(text) - relationships = [] - elif self.language_id == "markdown": - symbols = _parse_markdown_symbols(text) - relationships = [] - elif self.language_id == "text": - symbols = _parse_text_symbols(text) - relationships = [] - else: - symbols = _parse_generic_symbols(text) - relationships = [] - - return IndexedFile( - path=str(path.resolve()), - language=self.language_id, - symbols=symbols, - chunks=[], - relationships=relationships, - ) - - -class ParserFactory: - def __init__(self, config: Config) -> None: - self.config = config - self._parsers: Dict[str, Parser] = {} - - def get_parser(self, language_id: str) -> Parser: - if language_id not in self._parsers: - self._parsers[language_id] = SimpleRegexParser( - language_id, - config=self.config, - ) - return self._parsers[language_id] - - -# Regex-based fallback parsers -_PY_CLASS_RE = re.compile(r"^\s*class\s+([A-Za-z_]\w*)\b") -_PY_DEF_RE = re.compile(r"^\s*(?:async\s+)?def\s+([A-Za-z_]\w*)\s*\(") - -_PY_IMPORT_RE = 
re.compile(r"^(?:from\s+([\w.]+)\s+)?import\s+([\w.,\s]+)") -_PY_CALL_RE = re.compile(r"(? List[Symbol]: - """Parse Python symbols, using tree-sitter if available, regex fallback.""" - ts_parser = TreeSitterSymbolParser("python") - if ts_parser.is_available(): - symbols = ts_parser.parse_symbols(text) - if symbols is not None: - return symbols - return _parse_python_symbols_regex(text) - - -def _parse_js_ts_symbols( - text: str, - language_id: str = "javascript", - path: Optional[Path] = None, -) -> List[Symbol]: - """Parse JS/TS symbols, using tree-sitter if available, regex fallback.""" - ts_parser = TreeSitterSymbolParser(language_id, path) - if ts_parser.is_available(): - symbols = ts_parser.parse_symbols(text) - if symbols is not None: - return symbols - return _parse_js_ts_symbols_regex(text) - - -def _parse_python_symbols_regex(text: str) -> List[Symbol]: - symbols: List[Symbol] = [] - current_class_indent: Optional[int] = None - for i, line in enumerate(text.splitlines(), start=1): - class_match = _PY_CLASS_RE.match(line) - if class_match: - current_class_indent = len(line) - len(line.lstrip(" ")) - symbols.append(Symbol(name=class_match.group(1), kind="class", range=(i, i))) - continue - def_match = _PY_DEF_RE.match(line) - if def_match: - indent = len(line) - len(line.lstrip(" ")) - kind = "method" if current_class_indent is not None and indent > current_class_indent else "function" - symbols.append(Symbol(name=def_match.group(1), kind=kind, range=(i, i))) - continue - if current_class_indent is not None: - indent = len(line) - len(line.lstrip(" ")) - if line.strip() and indent <= current_class_indent: - current_class_indent = None - return symbols - - -def _parse_python_relationships_regex(text: str, path: Path) -> List[CodeRelationship]: - relationships: List[CodeRelationship] = [] - current_scope: str | None = None - source_file = str(path.resolve()) - - for line_num, line in enumerate(text.splitlines(), start=1): - class_match = 
_PY_CLASS_RE.match(line) - if class_match: - current_scope = class_match.group(1) - continue - - def_match = _PY_DEF_RE.match(line) - if def_match: - current_scope = def_match.group(1) - continue - - if current_scope is None: - continue - - import_match = _PY_IMPORT_RE.search(line) - if import_match: - import_target = import_match.group(1) or import_match.group(2) - if import_target: - relationships.append( - CodeRelationship( - source_symbol=current_scope, - target_symbol=import_target.strip(), - relationship_type=RelationshipType.IMPORTS, - source_file=source_file, - target_file=None, - source_line=line_num, - ) - ) - - for call_match in _PY_CALL_RE.finditer(line): - call_name = call_match.group(1) - if call_name in { - "if", - "for", - "while", - "return", - "print", - "len", - "str", - "int", - "float", - "list", - "dict", - "set", - "tuple", - current_scope, - }: - continue - relationships.append( - CodeRelationship( - source_symbol=current_scope, - target_symbol=call_name, - relationship_type=RelationshipType.CALL, - source_file=source_file, - target_file=None, - source_line=line_num, - ) - ) - - return relationships - - -_JS_FUNC_RE = re.compile(r"^\s*(?:export\s+)?(?:async\s+)?function\s+([A-Za-z_$][\w$]*)\s*\(") -_JS_CLASS_RE = re.compile(r"^\s*(?:export\s+)?class\s+([A-Za-z_$][\w$]*)\b") -_JS_ARROW_RE = re.compile( - r"^\s*(?:export\s+)?(?:const|let|var)\s+([A-Za-z_$][\w$]*)\s*=\s*(?:async\s*)?\(?[^)]*\)?\s*=>" -) -_JS_METHOD_RE = re.compile(r"^\s+(?:async\s+)?([A-Za-z_$][\w$]*)\s*\([^)]*\)\s*\{") -_JS_IMPORT_RE = re.compile(r"import\s+.*\s+from\s+['\"]([^'\"]+)['\"]") -_JS_CALL_RE = re.compile(r"(? 
List[Symbol]: - symbols: List[Symbol] = [] - in_class = False - class_brace_depth = 0 - brace_depth = 0 - - for i, line in enumerate(text.splitlines(), start=1): - brace_depth += line.count("{") - line.count("}") - - class_match = _JS_CLASS_RE.match(line) - if class_match: - symbols.append(Symbol(name=class_match.group(1), kind="class", range=(i, i))) - in_class = True - class_brace_depth = brace_depth - continue - - if in_class and brace_depth < class_brace_depth: - in_class = False - - func_match = _JS_FUNC_RE.match(line) - if func_match: - symbols.append(Symbol(name=func_match.group(1), kind="function", range=(i, i))) - continue - - arrow_match = _JS_ARROW_RE.match(line) - if arrow_match: - symbols.append(Symbol(name=arrow_match.group(1), kind="function", range=(i, i))) - continue - - if in_class: - method_match = _JS_METHOD_RE.match(line) - if method_match: - name = method_match.group(1) - if name != "constructor": - symbols.append(Symbol(name=name, kind="method", range=(i, i))) - - return symbols - - -def _parse_js_ts_relationships_regex(text: str, path: Path) -> List[CodeRelationship]: - relationships: List[CodeRelationship] = [] - current_scope: str | None = None - source_file = str(path.resolve()) - - for line_num, line in enumerate(text.splitlines(), start=1): - class_match = _JS_CLASS_RE.match(line) - if class_match: - current_scope = class_match.group(1) - continue - - func_match = _JS_FUNC_RE.match(line) - if func_match: - current_scope = func_match.group(1) - continue - - arrow_match = _JS_ARROW_RE.match(line) - if arrow_match: - current_scope = arrow_match.group(1) - continue - - if current_scope is None: - continue - - import_match = _JS_IMPORT_RE.search(line) - if import_match: - relationships.append( - CodeRelationship( - source_symbol=current_scope, - target_symbol=import_match.group(1), - relationship_type=RelationshipType.IMPORTS, - source_file=source_file, - target_file=None, - source_line=line_num, - ) - ) - - for call_match in 
_JS_CALL_RE.finditer(line): - call_name = call_match.group(1) - if call_name in {current_scope}: - continue - relationships.append( - CodeRelationship( - source_symbol=current_scope, - target_symbol=call_name, - relationship_type=RelationshipType.CALL, - source_file=source_file, - target_file=None, - source_line=line_num, - ) - ) - - return relationships - - -_JAVA_CLASS_RE = re.compile(r"^\s*(?:public\s+)?class\s+([A-Za-z_]\w*)\b") -_JAVA_METHOD_RE = re.compile( - r"^\s*(?:public|private|protected|static|\s)+[\w<>\[\]]+\s+([A-Za-z_]\w*)\s*\(" -) - - -def _parse_java_symbols(text: str) -> List[Symbol]: - symbols: List[Symbol] = [] - for i, line in enumerate(text.splitlines(), start=1): - class_match = _JAVA_CLASS_RE.match(line) - if class_match: - symbols.append(Symbol(name=class_match.group(1), kind="class", range=(i, i))) - continue - method_match = _JAVA_METHOD_RE.match(line) - if method_match: - symbols.append(Symbol(name=method_match.group(1), kind="method", range=(i, i))) - return symbols - - -_GO_FUNC_RE = re.compile(r"^\s*func\s+(?:\([^)]+\)\s+)?([A-Za-z_]\w*)\s*\(") -_GO_TYPE_RE = re.compile(r"^\s*type\s+([A-Za-z_]\w*)\s+(?:struct|interface)\b") - - -def _parse_go_symbols(text: str) -> List[Symbol]: - symbols: List[Symbol] = [] - for i, line in enumerate(text.splitlines(), start=1): - type_match = _GO_TYPE_RE.match(line) - if type_match: - symbols.append(Symbol(name=type_match.group(1), kind="class", range=(i, i))) - continue - func_match = _GO_FUNC_RE.match(line) - if func_match: - symbols.append(Symbol(name=func_match.group(1), kind="function", range=(i, i))) - return symbols - - -_GENERIC_DEF_RE = re.compile(r"^\s*(?:def|function|func)\s+([A-Za-z_]\w*)\b") -_GENERIC_CLASS_RE = re.compile(r"^\s*(?:class|struct|interface)\s+([A-Za-z_]\w*)\b") - - -def _parse_generic_symbols(text: str) -> List[Symbol]: - symbols: List[Symbol] = [] - for i, line in enumerate(text.splitlines(), start=1): - class_match = _GENERIC_CLASS_RE.match(line) - if class_match: - 
symbols.append(Symbol(name=class_match.group(1), kind="class", range=(i, i))) - continue - def_match = _GENERIC_DEF_RE.match(line) - if def_match: - symbols.append(Symbol(name=def_match.group(1), kind="function", range=(i, i))) - return symbols - - -# Markdown heading regex: # Heading, ## Heading, etc. -_MD_HEADING_RE = re.compile(r"^(#{1,6})\s+(.+)$") - - -def _parse_markdown_symbols(text: str) -> List[Symbol]: - """Parse Markdown headings as symbols. - - Extracts # headings as 'section' symbols with heading level as kind suffix. - """ - symbols: List[Symbol] = [] - for i, line in enumerate(text.splitlines(), start=1): - heading_match = _MD_HEADING_RE.match(line) - if heading_match: - level = len(heading_match.group(1)) - title = heading_match.group(2).strip() - # Use 'section' kind with level indicator - kind = f"h{level}" - symbols.append(Symbol(name=title, kind=kind, range=(i, i))) - return symbols - - -def _parse_text_symbols(text: str) -> List[Symbol]: - """Parse plain text files - no symbols, just index content.""" - # Text files don't have structured symbols, return empty list - # The file content will still be indexed for FTS search - return [] diff --git a/codex-lens/src/codexlens/parsers/patterns/__init__.py b/codex-lens/src/codexlens/parsers/patterns/__init__.py deleted file mode 100644 index 10717360..00000000 --- a/codex-lens/src/codexlens/parsers/patterns/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -"""ast-grep pattern definitions for various languages. - -This package contains language-specific pattern definitions for -extracting code relationships using ast-grep declarative patterns. -""" diff --git a/codex-lens/src/codexlens/parsers/patterns/javascript/__init__.py b/codex-lens/src/codexlens/parsers/patterns/javascript/__init__.py deleted file mode 100644 index a95608ea..00000000 --- a/codex-lens/src/codexlens/parsers/patterns/javascript/__init__.py +++ /dev/null @@ -1,92 +0,0 @@ -"""JavaScript ast-grep patterns for relationship extraction. 
- -These patterns are used by CodexLens' optional ast-grep processors to extract: -- IMPORTS: ES module imports + CommonJS require() -- INHERITS: class extends relationships - -Pattern Syntax (ast-grep-py 0.40+): - $VAR - Single metavariable (matches one AST node) - $$$VAR - Multiple metavariable (matches zero or more nodes) -""" - -from __future__ import annotations - -from typing import Dict, List - - -JAVASCRIPT_PATTERNS: Dict[str, str] = { - # ES module imports - # import React from "react" - # import React, { useEffect } from "react" - # import { useEffect } from "react" - # import * as fs from "fs" - "import_from_dq": "import $$$IMPORTS from \"$MODULE\"", - "import_from_sq": "import $$$IMPORTS from '$MODULE'", - "import_named_only_dq": "import {$$$NAMES} from \"$MODULE\"", - "import_named_only_sq": "import {$$$NAMES} from '$MODULE'", - "import_default_named_dq": "import $DEFAULT, {$$$NAMES} from \"$MODULE\"", - "import_default_named_sq": "import $DEFAULT, {$$$NAMES} from '$MODULE'", - # Side-effect import: import "./styles.css" - "import_side_effect_dq": "import \"$MODULE\"", - "import_side_effect_sq": "import '$MODULE'", - - # CommonJS require(): const fs = require("fs") - "require_call_dq": "require(\"$MODULE\")", - "require_call_sq": "require('$MODULE')", - - # Class inheritance: class Child extends Base {} - # Note: `{...}` form matches both JS and TS grammars more reliably. 
- "class_extends": "class $NAME extends $BASE {$$$BODY}", -} - - -METAVARS = { - "module": "MODULE", - "import_names": "NAMES", - "import_default": "DEFAULT", - "class_name": "NAME", - "class_base": "BASE", -} - - -RELATIONSHIP_PATTERNS: Dict[str, List[str]] = { - "imports": [ - "import_from_dq", - "import_from_sq", - "import_named_only_dq", - "import_named_only_sq", - "import_default_named_dq", - "import_default_named_sq", - "import_side_effect_dq", - "import_side_effect_sq", - "require_call_dq", - "require_call_sq", - ], - "inheritance": ["class_extends"], -} - - -def get_pattern(pattern_name: str) -> str: - if pattern_name not in JAVASCRIPT_PATTERNS: - raise KeyError( - f"Unknown JS pattern: {pattern_name}. Available: {list(JAVASCRIPT_PATTERNS.keys())}" - ) - return JAVASCRIPT_PATTERNS[pattern_name] - - -def get_patterns_for_relationship(rel_type: str) -> List[str]: - return RELATIONSHIP_PATTERNS.get(rel_type, []) - - -def get_metavar(name: str) -> str: - return METAVARS.get(name, name.upper()) - - -__all__ = [ - "JAVASCRIPT_PATTERNS", - "METAVARS", - "RELATIONSHIP_PATTERNS", - "get_pattern", - "get_patterns_for_relationship", - "get_metavar", -] diff --git a/codex-lens/src/codexlens/parsers/patterns/python/__init__.py b/codex-lens/src/codexlens/parsers/patterns/python/__init__.py deleted file mode 100644 index c8d6526e..00000000 --- a/codex-lens/src/codexlens/parsers/patterns/python/__init__.py +++ /dev/null @@ -1,204 +0,0 @@ -"""Python ast-grep patterns for relationship extraction. - -This module defines declarative patterns for extracting code relationships -(inheritance, calls, imports) from Python source code using ast-grep. 
- -Pattern Syntax (ast-grep-py 0.40+): - $VAR - Single metavariable (matches one AST node) - $$$VAR - Multiple metavariable (matches zero or more nodes) - -Example: - "class $CLASS_NAME($$$BASES) $$$BODY" matches: - class MyClass(BaseClass): - pass - with $CLASS_NAME = "MyClass", $$$BASES = "BaseClass", $$$BODY = "pass" - -YAML Pattern Files: - inherits.yaml - INHERITS relationship patterns (single/multiple inheritance) - imports.yaml - IMPORTS relationship patterns (import, from...import, as) - call.yaml - CALL relationship patterns (function/method calls) -""" - -from __future__ import annotations - -from pathlib import Path -from typing import Dict, List, Optional - -# Directory containing YAML pattern files -PATTERNS_DIR = Path(__file__).parent - -# Python ast-grep patterns organized by relationship type -# Note: ast-grep-py 0.40+ uses $$$ for zero-or-more multi-match -PYTHON_PATTERNS: Dict[str, str] = { - # Class definitions with inheritance - "class_def": "class $NAME $$$BODY", - "class_with_bases": "class $NAME($$$BASES) $$$BODY", - - # Single inheritance: class Child(Parent): - "single_inheritance": "class $CLASS_NAME($BASE) $$$BODY", - - # Multiple inheritance: class Child(A, B, C): - "multiple_inheritance": "class $CLASS_NAME($BASE, $$$MORE_BASES) $$$BODY", - - # Function definitions (use $$$ for zero-or-more params) - "func_def": "def $NAME($$$PARAMS): $$$BODY", - "async_func_def": "async def $NAME($$$PARAMS): $$$BODY", - - # Import statements - basic forms - "import_stmt": "import $MODULE", - "import_from": "from $MODULE import $NAMES", - - # Import statements - extended forms - "import_with_alias": "import $MODULE as $ALIAS", - "import_multiple": "import $FIRST, $$$REST", - "from_import_single": "from $MODULE import $NAME", - "from_import_with_alias": "from $MODULE import $NAME as $ALIAS", - "from_import_multiple": "from $MODULE import $FIRST, $$$REST", - "from_import_star": "from $MODULE import *", - "relative_import": "from .$$$MODULE import $NAMES", 
- - # Function/method calls - basic form (use $$$ for zero-or-more args) - "call": "$FUNC($$$ARGS)", - "method_call": "$OBJ.$METHOD($$$ARGS)", - - # Function/method calls - specific forms - "simple_call": "$FUNC()", - "call_with_args": "$FUNC($$$ARGS)", - "chained_call": "$OBJ.$METHOD($$$ARGS).$$$CHAIN", - "constructor_call": "$CLASS($$$ARGS)", -} - -# Metavariable names for extracting match data -METAVARS = { - # Class patterns - "class_name": "NAME", - "class_bases": "BASES", - "class_body": "BODY", - "inherit_class": "CLASS_NAME", - "inherit_base": "BASE", - "inherit_more_bases": "MORE_BASES", - - # Function patterns - "func_name": "NAME", - "func_params": "PARAMS", - "func_body": "BODY", - - # Import patterns - "import_module": "MODULE", - "import_names": "NAMES", - "import_alias": "ALIAS", - "import_first": "FIRST", - "import_rest": "REST", - - # Call patterns - "call_func": "FUNC", - "call_obj": "OBJ", - "call_method": "METHOD", - "call_args": "ARGS", - "call_class": "CLASS", - "call_chain": "CHAIN", -} - -# Relationship pattern mapping - expanded for new patterns -RELATIONSHIP_PATTERNS: Dict[str, List[str]] = { - "inheritance": ["class_with_bases", "single_inheritance", "multiple_inheritance"], - "imports": [ - "import_stmt", "import_from", - "import_with_alias", "import_multiple", - "from_import_single", "from_import_with_alias", - "from_import_multiple", "from_import_star", - "relative_import", - ], - "calls": ["call", "method_call", "simple_call", "call_with_args", "constructor_call"], -} - -# YAML pattern file mapping -YAML_PATTERN_FILES = { - "inheritance": "inherits.yaml", - "imports": "imports.yaml", - "calls": "call.yaml", -} - - -def get_pattern(pattern_name: str) -> str: - """Get an ast-grep pattern by name. - - Args: - pattern_name: Key from PYTHON_PATTERNS dict - - Returns: - Pattern string - - Raises: - KeyError: If pattern name not found - """ - if pattern_name not in PYTHON_PATTERNS: - raise KeyError(f"Unknown pattern: {pattern_name}. 
Available: {list(PYTHON_PATTERNS.keys())}") - return PYTHON_PATTERNS[pattern_name] - - -def get_patterns_for_relationship(rel_type: str) -> List[str]: - """Get all patterns that can extract a given relationship type. - - Args: - rel_type: Relationship type (inheritance, imports, calls) - - Returns: - List of pattern names - """ - return RELATIONSHIP_PATTERNS.get(rel_type, []) - - -def get_metavar(name: str) -> str: - """Get metavariable name without $ prefix. - - Args: - name: Key from METAVARS dict - - Returns: - Metavariable name (e.g., "NAME" not "$NAME") - """ - return METAVARS.get(name, name.upper()) - - -def get_yaml_pattern_path(rel_type: str) -> Optional[Path]: - """Get the path to a YAML pattern file for a relationship type. - - Args: - rel_type: Relationship type (inheritance, imports, calls) - - Returns: - Path to YAML file or None if not found - """ - filename = YAML_PATTERN_FILES.get(rel_type) - if filename: - return PATTERNS_DIR / filename - return None - - -def list_yaml_pattern_files() -> Dict[str, Path]: - """List all available YAML pattern files. 
- - Returns: - Dict mapping relationship type to YAML file path - """ - result = {} - for rel_type, filename in YAML_PATTERN_FILES.items(): - path = PATTERNS_DIR / filename - if path.exists(): - result[rel_type] = path - return result - - -__all__ = [ - "PYTHON_PATTERNS", - "METAVARS", - "RELATIONSHIP_PATTERNS", - "YAML_PATTERN_FILES", - "PATTERNS_DIR", - "get_pattern", - "get_patterns_for_relationship", - "get_metavar", - "get_yaml_pattern_path", - "list_yaml_pattern_files", -] diff --git a/codex-lens/src/codexlens/parsers/patterns/python/call.yaml b/codex-lens/src/codexlens/parsers/patterns/python/call.yaml deleted file mode 100644 index 1b1bd828..00000000 --- a/codex-lens/src/codexlens/parsers/patterns/python/call.yaml +++ /dev/null @@ -1,87 +0,0 @@ -# Python CALL patterns for ast-grep -# Extracts function and method call expressions - -# Pattern metadata -id: python-call -language: python -description: Extract function and method calls from Python code - -patterns: - # Simple function call - # Matches: func() - - id: simple_call - pattern: "$FUNC()" - message: "Found simple function call" - severity: hint - - # Function call with arguments - # Matches: func(arg1, arg2) - - id: call_with_args - pattern: "$FUNC($$$ARGS)" - message: "Found function call with arguments" - severity: hint - - # Method call - # Matches: obj.method() - - id: method_call - pattern: "$OBJ.$METHOD($$$ARGS)" - message: "Found method call" - severity: hint - - # Chained method call - # Matches: obj.method1().method2() - - id: chained_call - pattern: "$OBJ.$METHOD($$$ARGS).$$$CHAIN" - message: "Found chained method call" - severity: hint - - # Call with keyword arguments - # Matches: func(arg=value) - - id: call_with_kwargs - pattern: "$FUNC($$$ARGS, $KWARG=$VALUE$$$MORE)" - message: "Found call with keyword argument" - severity: hint - - # Constructor call - # Matches: ClassName() - - id: constructor_call - pattern: "$CLASS($$$ARGS)" - message: "Found constructor call" - severity: hint - - 
# Subscript call (not a real call, but often confused) - # This pattern helps exclude indexing from calls - - id: subscript_access - pattern: "$OBJ[$INDEX]" - message: "Found subscript access" - severity: hint - -# Metavariables used: -# $FUNC - Function name being called -# $OBJ - Object receiving the method call -# $METHOD - Method name being called -# $ARGS - Positional arguments -# $KWARG - Keyword argument name -# $VALUE - Keyword argument value -# $CLASS - Class name for constructor calls -# $INDEX - Index for subscript access -# $$$MORE - Additional arguments -# $$$CHAIN - Additional method chains - -# Note: The generic call pattern "$FUNC($$$ARGS)" will match all function calls -# including method calls and constructor calls. More specific patterns help -# categorize the type of call. - -# Examples matched: -# print("hello") -> call_with_args -# len(items) -> call_with_args -# obj.process() -> method_call -# obj.get().save() -> chained_call -# func(name=value) -> call_with_kwargs -# MyClass() -> constructor_call -# items[0] -> subscript_access (not a call) - -# Filtering notes: -# - self.method() calls are typically filtered during processing -# - cls.method() calls are typically filtered during processing -# - super().method() calls may be handled specially diff --git a/codex-lens/src/codexlens/parsers/patterns/python/imports.yaml b/codex-lens/src/codexlens/parsers/patterns/python/imports.yaml deleted file mode 100644 index a1248790..00000000 --- a/codex-lens/src/codexlens/parsers/patterns/python/imports.yaml +++ /dev/null @@ -1,82 +0,0 @@ -# Python IMPORTS patterns for ast-grep -# Extracts import statements (import, from...import, as aliases) - -# Pattern metadata -id: python-imports -language: python -description: Extract import statements from Python code - -patterns: - # Simple import - # Matches: import os - - id: simple_import - pattern: "import $MODULE" - message: "Found simple import" - severity: hint - - # Import with alias - # Matches: import 
numpy as np - - id: import_with_alias - pattern: "import $MODULE as $ALIAS" - message: "Found import with alias" - severity: hint - - # Multiple imports - # Matches: import os, sys - - id: multiple_imports - pattern: "import $FIRST, $$$REST" - message: "Found multiple imports" - severity: hint - - # From import (single name) - # Matches: from os import path - - id: from_import_single - pattern: "from $MODULE import $NAME" - message: "Found from-import single" - severity: hint - - # From import with alias - # Matches: from collections import defaultdict as dd - - id: from_import_with_alias - pattern: "from $MODULE import $NAME as $ALIAS" - message: "Found from-import with alias" - severity: hint - - # From import multiple names - # Matches: from typing import List, Dict, Optional - - id: from_import_multiple - pattern: "from $MODULE import $FIRST, $$$REST" - message: "Found from-import multiple" - severity: hint - - # From import star - # Matches: from module import * - - id: from_import_star - pattern: "from $MODULE import *" - message: "Found star import" - severity: warning - - # Relative import - # Matches: from .module import func - - id: relative_import - pattern: "from .$$$MODULE import $NAMES" - message: "Found relative import" - severity: hint - -# Metavariables used: -# $MODULE - The module being imported -# $ALIAS - The alias for the import -# $NAME - The specific name being imported -# $FIRST - First item in a multi-item import -# $$$REST - Remaining items in a multi-item import -# $NAMES - Names being imported in from-import - -# Examples matched: -# import os -> simple_import -# import numpy as np -> import_with_alias -# import os, sys, pathlib -> multiple_imports -# from os import path -> from_import_single -# from typing import List, Dict, Set -> from_import_multiple -# from collections import defaultdict -> from_import_single -# from .helpers import utils -> relative_import -# from module import * -> from_import_star diff --git 
a/codex-lens/src/codexlens/parsers/patterns/python/inherits.yaml b/codex-lens/src/codexlens/parsers/patterns/python/inherits.yaml deleted file mode 100644 index d818ab25..00000000 --- a/codex-lens/src/codexlens/parsers/patterns/python/inherits.yaml +++ /dev/null @@ -1,42 +0,0 @@ -# Python INHERITS patterns for ast-grep -# Extracts class inheritance relationships (single and multiple inheritance) - -# Pattern metadata -id: python-inherits -language: python -description: Extract class inheritance relationships from Python code - -# Single inheritance pattern -# Matches: class Child(Parent): -patterns: - - id: single_inheritance - pattern: "class $CLASS_NAME($BASE) $$$BODY" - message: "Found single inheritance" - severity: hint - - # Multiple inheritance pattern - # Matches: class Child(Parent1, Parent2, Parent3): - - id: multiple_inheritance - pattern: "class $CLASS_NAME($BASE, $$$MORE_BASES) $$$BODY" - message: "Found multiple inheritance" - severity: hint - - # Generic inheritance with any number of bases - # Matches: class Child(...): with any number of parent classes - - id: class_with_bases - pattern: "class $NAME($$$BASES) $$$BODY" - message: "Found class with base classes" - severity: hint - -# Metavariables used: -# $CLASS_NAME - The name of the child class -# $BASE - First base class (for single inheritance) -# $BASES - All base classes combined -# $MORE_BASES - Additional base classes after the first (for multiple inheritance) -# $$$BODY - Class body (statements, can be multiple) - -# Examples matched: -# class Dog(Animal): -> single_inheritance -# class C(A, B): -> multiple_inheritance -# class D(BaseMixin, logging.Log) -> class_with_bases -# class E(A, B, C, D): -> multiple_inheritance diff --git a/codex-lens/src/codexlens/parsers/patterns/typescript/__init__.py b/codex-lens/src/codexlens/parsers/patterns/typescript/__init__.py deleted file mode 100644 index 0c0294c4..00000000 --- a/codex-lens/src/codexlens/parsers/patterns/typescript/__init__.py +++ 
/dev/null @@ -1,73 +0,0 @@ -"""TypeScript ast-grep patterns for relationship extraction. - -This module extends the JavaScript patterns with TypeScript-specific syntax -such as `import type` and `interface ... extends ...`. -""" - -from __future__ import annotations - -from typing import Dict, List - -from codexlens.parsers.patterns.javascript import ( - METAVARS, - RELATIONSHIP_PATTERNS as _JS_RELATIONSHIP_PATTERNS, - JAVASCRIPT_PATTERNS, -) - - -TYPESCRIPT_PATTERNS: Dict[str, str] = { - **JAVASCRIPT_PATTERNS, - # Type-only imports - "import_type_from_dq": "import type $$$IMPORTS from \"$MODULE\"", - "import_type_from_sq": "import type $$$IMPORTS from '$MODULE'", - "import_type_named_only_dq": "import type {$$$NAMES} from \"$MODULE\"", - "import_type_named_only_sq": "import type {$$$NAMES} from '$MODULE'", - "import_type_default_named_dq": "import type $DEFAULT, {$$$NAMES} from \"$MODULE\"", - "import_type_default_named_sq": "import type $DEFAULT, {$$$NAMES} from '$MODULE'", - # Interface inheritance: interface Foo extends Bar {} - "interface_extends": "interface $NAME extends $BASE $$$BODY", -} - - -RELATIONSHIP_PATTERNS: Dict[str, List[str]] = { - **_JS_RELATIONSHIP_PATTERNS, - "imports": [ - *_JS_RELATIONSHIP_PATTERNS.get("imports", []), - "import_type_from_dq", - "import_type_from_sq", - "import_type_named_only_dq", - "import_type_named_only_sq", - "import_type_default_named_dq", - "import_type_default_named_sq", - ], - "inheritance": [ - *_JS_RELATIONSHIP_PATTERNS.get("inheritance", []), - "interface_extends", - ], -} - - -def get_pattern(pattern_name: str) -> str: - if pattern_name not in TYPESCRIPT_PATTERNS: - raise KeyError( - f"Unknown TS pattern: {pattern_name}. 
Available: {list(TYPESCRIPT_PATTERNS.keys())}" - ) - return TYPESCRIPT_PATTERNS[pattern_name] - - -def get_patterns_for_relationship(rel_type: str) -> List[str]: - return RELATIONSHIP_PATTERNS.get(rel_type, []) - - -def get_metavar(name: str) -> str: - return METAVARS.get(name, name.upper()) - - -__all__ = [ - "TYPESCRIPT_PATTERNS", - "METAVARS", - "RELATIONSHIP_PATTERNS", - "get_pattern", - "get_patterns_for_relationship", - "get_metavar", -] diff --git a/codex-lens/src/codexlens/parsers/tokenizer.py b/codex-lens/src/codexlens/parsers/tokenizer.py deleted file mode 100644 index dcb12238..00000000 --- a/codex-lens/src/codexlens/parsers/tokenizer.py +++ /dev/null @@ -1,98 +0,0 @@ -"""Token counting utilities for CodexLens. - -Provides accurate token counting using tiktoken with character count fallback. -""" - -from __future__ import annotations - -from typing import Optional - -try: - import tiktoken - TIKTOKEN_AVAILABLE = True -except ImportError: - TIKTOKEN_AVAILABLE = False - - -class Tokenizer: - """Token counter with tiktoken primary and character count fallback.""" - - def __init__(self, encoding_name: str = "cl100k_base") -> None: - """Initialize tokenizer. - - Args: - encoding_name: Tiktoken encoding name (default: cl100k_base for GPT-4) - """ - self._encoding: Optional[object] = None - self._encoding_name = encoding_name - - if TIKTOKEN_AVAILABLE: - try: - self._encoding = tiktoken.get_encoding(encoding_name) - except Exception: - # Fallback to character counting if encoding fails - self._encoding = None - - def count_tokens(self, text: str) -> int: - """Count tokens in text. - - Uses tiktoken if available, otherwise falls back to character count / 4. 
- - Args: - text: Text to count tokens for - - Returns: - Estimated token count - """ - if not text: - return 0 - - if self._encoding is not None: - try: - return len(self._encoding.encode(text)) # type: ignore[attr-defined] - except Exception: - # Fall through to character count fallback - pass - - # Fallback: rough estimate using character count - # Average of ~4 characters per token for English text - return max(1, len(text) // 4) - - def is_using_tiktoken(self) -> bool: - """Check if tiktoken is being used. - - Returns: - True if tiktoken is available and initialized - """ - return self._encoding is not None - - -# Global default tokenizer instance -_default_tokenizer: Optional[Tokenizer] = None - - -def get_default_tokenizer() -> Tokenizer: - """Get the global default tokenizer instance. - - Returns: - Shared Tokenizer instance - """ - global _default_tokenizer - if _default_tokenizer is None: - _default_tokenizer = Tokenizer() - return _default_tokenizer - - -def count_tokens(text: str, tokenizer: Optional[Tokenizer] = None) -> int: - """Count tokens in text using default or provided tokenizer. - - Args: - text: Text to count tokens for - tokenizer: Optional tokenizer instance (uses default if None) - - Returns: - Estimated token count - """ - if tokenizer is None: - tokenizer = get_default_tokenizer() - return tokenizer.count_tokens(text) diff --git a/codex-lens/src/codexlens/parsers/treesitter_parser.py b/codex-lens/src/codexlens/parsers/treesitter_parser.py deleted file mode 100644 index 73638fe8..00000000 --- a/codex-lens/src/codexlens/parsers/treesitter_parser.py +++ /dev/null @@ -1,935 +0,0 @@ -"""Tree-sitter based parser for CodexLens. - -Provides precise AST-level parsing via tree-sitter. - -Note: This module does not provide a regex fallback inside `TreeSitterSymbolParser`. 
-If tree-sitter (or a language binding) is unavailable, `parse()`/`parse_symbols()` -return `None`; callers should use a regex-based fallback such as -`codexlens.parsers.factory.SimpleRegexParser`. -""" - -from __future__ import annotations - -from pathlib import Path -from typing import Dict, List, Optional, TYPE_CHECKING - -try: - from tree_sitter import Language as TreeSitterLanguage - from tree_sitter import Node as TreeSitterNode - from tree_sitter import Parser as TreeSitterParser - TREE_SITTER_AVAILABLE = True -except ImportError: - TreeSitterLanguage = None # type: ignore[assignment] - TreeSitterNode = None # type: ignore[assignment] - TreeSitterParser = None # type: ignore[assignment] - TREE_SITTER_AVAILABLE = False - -from codexlens.entities import CodeRelationship, IndexedFile, RelationshipType, Symbol -from codexlens.parsers.tokenizer import get_default_tokenizer - -if TYPE_CHECKING: - from codexlens.config import Config - - -class TreeSitterSymbolParser: - """Parser using tree-sitter for AST-level symbol extraction. - - Supports optional ast-grep integration for relationship extraction - (Python/JavaScript/TypeScript) when config.use_astgrep is True and - ast-grep-py is available. - """ - - def __init__( - self, - language_id: str, - path: Optional[Path] = None, - config: Optional["Config"] = None, - ) -> None: - """Initialize tree-sitter parser for a language. - - Args: - language_id: Language identifier (python, javascript, typescript, etc.) 
- path: Optional file path for language variant detection (e.g., .tsx) - config: Optional Config instance for parser feature toggles - """ - self.language_id = language_id - self.path = path - self._config = config - self._parser: Optional[object] = None - self._language: Optional[TreeSitterLanguage] = None - self._tokenizer = get_default_tokenizer() - self._astgrep_processor = None - - if TREE_SITTER_AVAILABLE: - self._initialize_parser() - - # Initialize ast-grep processor for Python if config enables it - if self._should_use_astgrep(): - self._initialize_astgrep_processor() - - def _initialize_parser(self) -> None: - """Initialize tree-sitter parser and language.""" - if TreeSitterParser is None or TreeSitterLanguage is None: - return - - try: - # Load language grammar - if self.language_id == "python": - import tree_sitter_python - self._language = TreeSitterLanguage(tree_sitter_python.language()) - elif self.language_id == "javascript": - import tree_sitter_javascript - self._language = TreeSitterLanguage(tree_sitter_javascript.language()) - elif self.language_id == "typescript": - import tree_sitter_typescript - # Detect TSX files by extension - if self.path is not None and self.path.suffix.lower() == ".tsx": - self._language = TreeSitterLanguage(tree_sitter_typescript.language_tsx()) - else: - self._language = TreeSitterLanguage(tree_sitter_typescript.language_typescript()) - else: - return - - # Create parser - self._parser = TreeSitterParser() - if hasattr(self._parser, "set_language"): - self._parser.set_language(self._language) # type: ignore[attr-defined] - else: - self._parser.language = self._language # type: ignore[assignment] - - except Exception: - # Gracefully handle missing language bindings - self._parser = None - self._language = None - - def _should_use_astgrep(self) -> bool: - """Check if ast-grep should be used for relationship extraction. 
- - Returns: - True if config.use_astgrep is True and language is supported - """ - if self._config is None: - return False - if not getattr(self._config, "use_astgrep", False): - return False - return self.language_id in {"python", "javascript", "typescript"} - - def _initialize_astgrep_processor(self) -> None: - """Initialize ast-grep processor for relationship extraction.""" - try: - from codexlens.parsers.astgrep_processor import ( - AstGrepPythonProcessor, - is_astgrep_processor_available, - ) - from codexlens.parsers.astgrep_js_ts_processor import ( - AstGrepJavaScriptProcessor, - AstGrepTypeScriptProcessor, - ) - - if is_astgrep_processor_available(): - if self.language_id == "python": - self._astgrep_processor = AstGrepPythonProcessor(self.path) - elif self.language_id == "javascript": - self._astgrep_processor = AstGrepJavaScriptProcessor(self.path) - elif self.language_id == "typescript": - self._astgrep_processor = AstGrepTypeScriptProcessor(self.path) - except ImportError: - self._astgrep_processor = None - - def is_available(self) -> bool: - """Check if tree-sitter parser is available. - - Returns: - True if parser is initialized and ready - """ - return self._parser is not None and self._language is not None - - def _parse_tree(self, text: str) -> Optional[tuple[bytes, TreeSitterNode]]: - if not self.is_available() or self._parser is None: - return None - - try: - source_bytes = text.encode("utf8") - tree = self._parser.parse(source_bytes) # type: ignore[attr-defined] - return source_bytes, tree.root_node - except Exception: - return None - - def parse_symbols(self, text: str) -> Optional[List[Symbol]]: - """Parse source code and extract symbols without creating IndexedFile. 
- - Args: - text: Source code text - - Returns: - List of symbols if parsing succeeds, None if tree-sitter unavailable - """ - parsed = self._parse_tree(text) - if parsed is None: - return None - - source_bytes, root = parsed - try: - return self._extract_symbols(source_bytes, root) - except Exception: - # Gracefully handle extraction errors - return None - - def parse(self, text: str, path: Path) -> Optional[IndexedFile]: - """Parse source code and extract symbols. - - Args: - text: Source code text - path: File path - - Returns: - IndexedFile if parsing succeeds, None if tree-sitter unavailable - """ - parsed = self._parse_tree(text) - if parsed is None: - return None - - source_bytes, root = parsed - try: - symbols = self._extract_symbols(source_bytes, root) - # Pass source_code for ast-grep integration - relationships = self._extract_relationships( - source_bytes, root, path, source_code=text - ) - - return IndexedFile( - path=str(path.resolve()), - language=self.language_id, - symbols=symbols, - chunks=[], - relationships=relationships, - ) - except Exception: - # Gracefully handle parsing errors - return None - - def _extract_symbols(self, source_bytes: bytes, root: TreeSitterNode) -> List[Symbol]: - """Extract symbols from AST. - - Args: - source_bytes: Source code as bytes - root: Root AST node - - Returns: - List of extracted symbols - """ - if self.language_id == "python": - return self._extract_python_symbols(source_bytes, root) - elif self.language_id in {"javascript", "typescript"}: - return self._extract_js_ts_symbols(source_bytes, root) - else: - return [] - - def _extract_relationships( - self, - source_bytes: bytes, - root: TreeSitterNode, - path: Path, - source_code: Optional[str] = None, - ) -> List[CodeRelationship]: - """Extract relationships, optionally using ast-grep. - - When config.use_astgrep is True and an ast-grep processor is available, - uses ast-grep for relationship extraction. Otherwise, uses tree-sitter. 
- - Args: - source_bytes: Source code as bytes - root: Root AST node from tree-sitter - path: File path - source_code: Optional source code string (required for ast-grep) - - Returns: - List of extracted relationships - """ - # Try ast-grep first if configured and available for this language. - if self._astgrep_processor is not None and source_code is not None: - try: - astgrep_rels = self._extract_relationships_astgrep(source_code, path) - if astgrep_rels is not None: - return astgrep_rels - except Exception: - # Fall back to tree-sitter on ast-grep failure - pass - - if self.language_id == "python": - return self._extract_python_relationships(source_bytes, root, path) - if self.language_id in {"javascript", "typescript"}: - return self._extract_js_ts_relationships(source_bytes, root, path) - return [] - - def _extract_relationships_astgrep( - self, - source_code: str, - path: Path, - ) -> Optional[List[CodeRelationship]]: - """Extract relationships using ast-grep processor. - - Args: - source_code: Source code text - path: File path - - Returns: - List of relationships, or None if ast-grep unavailable - """ - if self._astgrep_processor is None: - return None - - if not self._astgrep_processor.is_available(): - return None - - try: - indexed = self._astgrep_processor.parse(source_code, path) - if indexed is not None: - return indexed.relationships - except Exception: - pass - - return None - - def _extract_python_relationships( - self, - source_bytes: bytes, - root: TreeSitterNode, - path: Path, - ) -> List[CodeRelationship]: - source_file = str(path.resolve()) - relationships: List[CodeRelationship] = [] - - # Use a synthetic module scope so module-level imports/calls can be recorded - # (useful for static global graph persistence). 
- scope_stack: List[str] = [""] - alias_stack: List[Dict[str, str]] = [{}] - - def record_import(target_symbol: str, source_line: int) -> None: - if not target_symbol.strip() or not scope_stack: - return - relationships.append( - CodeRelationship( - source_symbol=scope_stack[-1], - target_symbol=target_symbol, - relationship_type=RelationshipType.IMPORTS, - source_file=source_file, - target_file=None, - source_line=source_line, - ) - ) - - def record_call(target_symbol: str, source_line: int) -> None: - if not target_symbol.strip() or not scope_stack: - return - base = target_symbol.split(".", 1)[0] - if base in {"self", "cls"}: - return - relationships.append( - CodeRelationship( - source_symbol=scope_stack[-1], - target_symbol=target_symbol, - relationship_type=RelationshipType.CALL, - source_file=source_file, - target_file=None, - source_line=source_line, - ) - ) - - def record_inherits(target_symbol: str, source_line: int) -> None: - if not target_symbol.strip() or not scope_stack: - return - relationships.append( - CodeRelationship( - source_symbol=scope_stack[-1], - target_symbol=target_symbol, - relationship_type=RelationshipType.INHERITS, - source_file=source_file, - target_file=None, - source_line=source_line, - ) - ) - - def visit(node: TreeSitterNode) -> None: - pushed_scope = False - pushed_aliases = False - - if node.type in {"class_definition", "function_definition", "async_function_definition"}: - name_node = node.child_by_field_name("name") - if name_node is not None: - scope_name = self._node_text(source_bytes, name_node).strip() - if scope_name: - scope_stack.append(scope_name) - pushed_scope = True - alias_stack.append(dict(alias_stack[-1])) - pushed_aliases = True - - if node.type == "class_definition" and pushed_scope: - superclasses = node.child_by_field_name("superclasses") - if superclasses is not None: - for child in superclasses.children: - dotted = self._python_expression_to_dotted(source_bytes, child) - if not dotted: - continue - 
resolved = self._resolve_alias_dotted(dotted, alias_stack[-1]) - record_inherits(resolved, self._node_start_line(node)) - - if node.type in {"import_statement", "import_from_statement"}: - updates, imported_targets = self._python_import_aliases_and_targets(source_bytes, node) - if updates: - alias_stack[-1].update(updates) - for target_symbol in imported_targets: - record_import(target_symbol, self._node_start_line(node)) - - if node.type == "call": - fn_node = node.child_by_field_name("function") - if fn_node is not None: - dotted = self._python_expression_to_dotted(source_bytes, fn_node) - if dotted: - resolved = self._resolve_alias_dotted(dotted, alias_stack[-1]) - record_call(resolved, self._node_start_line(node)) - - for child in node.children: - visit(child) - - if pushed_aliases: - alias_stack.pop() - if pushed_scope: - scope_stack.pop() - - visit(root) - return relationships - - def _extract_js_ts_relationships( - self, - source_bytes: bytes, - root: TreeSitterNode, - path: Path, - ) -> List[CodeRelationship]: - source_file = str(path.resolve()) - relationships: List[CodeRelationship] = [] - - # Use a synthetic module scope so module-level imports/calls can be recorded - # (useful for static global graph persistence). 
- scope_stack: List[str] = [""] - alias_stack: List[Dict[str, str]] = [{}] - - def record_import(target_symbol: str, source_line: int) -> None: - if not target_symbol.strip() or not scope_stack: - return - relationships.append( - CodeRelationship( - source_symbol=scope_stack[-1], - target_symbol=target_symbol, - relationship_type=RelationshipType.IMPORTS, - source_file=source_file, - target_file=None, - source_line=source_line, - ) - ) - - def record_call(target_symbol: str, source_line: int) -> None: - if not target_symbol.strip() or not scope_stack: - return - base = target_symbol.split(".", 1)[0] - if base in {"this", "super"}: - return - relationships.append( - CodeRelationship( - source_symbol=scope_stack[-1], - target_symbol=target_symbol, - relationship_type=RelationshipType.CALL, - source_file=source_file, - target_file=None, - source_line=source_line, - ) - ) - - def record_inherits(target_symbol: str, source_line: int) -> None: - if not target_symbol.strip() or not scope_stack: - return - relationships.append( - CodeRelationship( - source_symbol=scope_stack[-1], - target_symbol=target_symbol, - relationship_type=RelationshipType.INHERITS, - source_file=source_file, - target_file=None, - source_line=source_line, - ) - ) - - def visit(node: TreeSitterNode) -> None: - pushed_scope = False - pushed_aliases = False - - if node.type in {"function_declaration", "generator_function_declaration"}: - name_node = node.child_by_field_name("name") - if name_node is not None: - scope_name = self._node_text(source_bytes, name_node).strip() - if scope_name: - scope_stack.append(scope_name) - pushed_scope = True - alias_stack.append(dict(alias_stack[-1])) - pushed_aliases = True - - if node.type in {"class_declaration", "class"}: - name_node = node.child_by_field_name("name") - if name_node is not None: - scope_name = self._node_text(source_bytes, name_node).strip() - if scope_name: - scope_stack.append(scope_name) - pushed_scope = True - 
alias_stack.append(dict(alias_stack[-1])) - pushed_aliases = True - - if pushed_scope: - superclass = node.child_by_field_name("superclass") - if superclass is not None: - dotted = self._js_expression_to_dotted(source_bytes, superclass) - if dotted: - resolved = self._resolve_alias_dotted(dotted, alias_stack[-1]) - record_inherits(resolved, self._node_start_line(node)) - - if node.type == "variable_declarator": - name_node = node.child_by_field_name("name") - value_node = node.child_by_field_name("value") - if ( - name_node is not None - and value_node is not None - and name_node.type in {"identifier", "property_identifier"} - and value_node.type == "arrow_function" - ): - scope_name = self._node_text(source_bytes, name_node).strip() - if scope_name: - scope_stack.append(scope_name) - pushed_scope = True - alias_stack.append(dict(alias_stack[-1])) - pushed_aliases = True - - if node.type == "method_definition" and self._has_class_ancestor(node): - name_node = node.child_by_field_name("name") - if name_node is not None: - scope_name = self._node_text(source_bytes, name_node).strip() - if scope_name and scope_name != "constructor": - scope_stack.append(scope_name) - pushed_scope = True - alias_stack.append(dict(alias_stack[-1])) - pushed_aliases = True - - if node.type in {"import_declaration", "import_statement"}: - updates, imported_targets = self._js_import_aliases_and_targets(source_bytes, node) - if updates: - alias_stack[-1].update(updates) - for target_symbol in imported_targets: - record_import(target_symbol, self._node_start_line(node)) - - # Best-effort support for CommonJS require() imports: - # const fs = require("fs") - if node.type == "variable_declarator": - name_node = node.child_by_field_name("name") - value_node = node.child_by_field_name("value") - if ( - name_node is not None - and value_node is not None - and name_node.type == "identifier" - and value_node.type == "call_expression" - ): - callee = value_node.child_by_field_name("function") - args 
= value_node.child_by_field_name("arguments") - if ( - callee is not None - and self._node_text(source_bytes, callee).strip() == "require" - and args is not None - ): - module_name = self._js_first_string_argument(source_bytes, args) - if module_name: - alias_stack[-1][self._node_text(source_bytes, name_node).strip()] = module_name - record_import(module_name, self._node_start_line(node)) - - if node.type == "call_expression": - fn_node = node.child_by_field_name("function") - if fn_node is not None: - dotted = self._js_expression_to_dotted(source_bytes, fn_node) - if dotted: - resolved = self._resolve_alias_dotted(dotted, alias_stack[-1]) - record_call(resolved, self._node_start_line(node)) - - for child in node.children: - visit(child) - - if pushed_aliases: - alias_stack.pop() - if pushed_scope: - scope_stack.pop() - - visit(root) - return relationships - - def _node_start_line(self, node: TreeSitterNode) -> int: - return node.start_point[0] + 1 - - def _resolve_alias_dotted(self, dotted: str, aliases: Dict[str, str]) -> str: - dotted = (dotted or "").strip() - if not dotted: - return "" - - base, sep, rest = dotted.partition(".") - resolved_base = aliases.get(base, base) - if not rest: - return resolved_base - if resolved_base and rest: - return f"{resolved_base}.{rest}" - return resolved_base - - def _python_expression_to_dotted(self, source_bytes: bytes, node: TreeSitterNode) -> str: - if node.type in {"identifier", "dotted_name"}: - return self._node_text(source_bytes, node).strip() - if node.type == "attribute": - obj = node.child_by_field_name("object") - attr = node.child_by_field_name("attribute") - obj_text = self._python_expression_to_dotted(source_bytes, obj) if obj is not None else "" - attr_text = self._node_text(source_bytes, attr).strip() if attr is not None else "" - if obj_text and attr_text: - return f"{obj_text}.{attr_text}" - return obj_text or attr_text - return "" - - def _python_import_aliases_and_targets( - self, - source_bytes: bytes, - 
node: TreeSitterNode, - ) -> tuple[Dict[str, str], List[str]]: - aliases: Dict[str, str] = {} - targets: List[str] = [] - - if node.type == "import_statement": - for i, child in enumerate(node.children): - if child.type == "aliased_import": - name_node = child.child_by_field_name("name") - alias_node = child.child_by_field_name("alias") - if name_node is None: - continue - module_name = self._node_text(source_bytes, name_node).strip() - if not module_name: - continue - bound_name = ( - self._node_text(source_bytes, alias_node).strip() - if alias_node is not None - else module_name.split(".", 1)[0] - ) - if bound_name: - aliases[bound_name] = module_name - targets.append(module_name) - elif child.type == "dotted_name": - module_name = self._node_text(source_bytes, child).strip() - if not module_name: - continue - bound_name = module_name.split(".", 1)[0] - if bound_name: - aliases[bound_name] = bound_name - targets.append(module_name) - - if node.type == "import_from_statement": - module_name = "" - module_node = node.child_by_field_name("module_name") - if module_node is None: - for child in node.children: - if child.type == "dotted_name": - module_node = child - break - if module_node is not None: - module_name = self._node_text(source_bytes, module_node).strip() - - for i, child in enumerate(node.children): - if child.type == "aliased_import": - name_node = child.child_by_field_name("name") - alias_node = child.child_by_field_name("alias") - if name_node is None: - continue - imported_name = self._node_text(source_bytes, name_node).strip() - if not imported_name or imported_name == "*": - continue - target = f"{module_name}.{imported_name}" if module_name else imported_name - bound_name = ( - self._node_text(source_bytes, alias_node).strip() - if alias_node is not None - else imported_name - ) - if bound_name: - aliases[bound_name] = target - targets.append(target) - elif child.type == "dotted_name" and node.field_name_for_child(i) == "name": - # 
tree-sitter-python represents `from X import A, B, C` as - # multiple dotted_name nodes (field: "name"). - imported_name = self._node_text(source_bytes, child).strip() - if not imported_name: - continue - target = ( - f"{module_name}.{imported_name}" if module_name else imported_name - ) - aliases[imported_name] = target - targets.append(target) - elif child.type == "identifier" and node.field_name_for_child(i) == "name": - imported_name = self._node_text(source_bytes, child).strip() - if not imported_name or imported_name in {"from", "import", "*"}: - continue - target = f"{module_name}.{imported_name}" if module_name else imported_name - aliases[imported_name] = target - targets.append(target) - - return aliases, targets - - def _js_expression_to_dotted(self, source_bytes: bytes, node: TreeSitterNode) -> str: - if node.type in {"this", "super"}: - return node.type - if node.type in {"identifier", "property_identifier"}: - return self._node_text(source_bytes, node).strip() - if node.type == "member_expression": - obj = node.child_by_field_name("object") - prop = node.child_by_field_name("property") - obj_text = self._js_expression_to_dotted(source_bytes, obj) if obj is not None else "" - prop_text = self._js_expression_to_dotted(source_bytes, prop) if prop is not None else "" - if obj_text and prop_text: - return f"{obj_text}.{prop_text}" - return obj_text or prop_text - return "" - - def _js_import_aliases_and_targets( - self, - source_bytes: bytes, - node: TreeSitterNode, - ) -> tuple[Dict[str, str], List[str]]: - aliases: Dict[str, str] = {} - targets: List[str] = [] - - module_name = "" - source_node = node.child_by_field_name("source") - if source_node is not None: - module_name = self._node_text(source_bytes, source_node).strip().strip("\"'").strip() - if module_name: - targets.append(module_name) - - for child in node.children: - if child.type == "import_clause": - for clause_child in child.children: - if clause_child.type == "identifier": - # Default 
import: import React from "react" - local = self._node_text(source_bytes, clause_child).strip() - if local and module_name: - aliases[local] = module_name - if clause_child.type == "namespace_import": - # Namespace import: import * as fs from "fs" - name_node = clause_child.child_by_field_name("name") - if name_node is not None and module_name: - local = self._node_text(source_bytes, name_node).strip() - if local: - aliases[local] = module_name - if clause_child.type == "named_imports": - for spec in clause_child.children: - if spec.type != "import_specifier": - continue - name_node = spec.child_by_field_name("name") - alias_node = spec.child_by_field_name("alias") - if name_node is None: - continue - imported = self._node_text(source_bytes, name_node).strip() - if not imported: - continue - local = ( - self._node_text(source_bytes, alias_node).strip() - if alias_node is not None - else imported - ) - if local and module_name: - aliases[local] = f"{module_name}.{imported}" - targets.append(f"{module_name}.{imported}") - - return aliases, targets - - def _js_first_string_argument(self, source_bytes: bytes, args_node: TreeSitterNode) -> str: - for child in args_node.children: - if child.type == "string": - return self._node_text(source_bytes, child).strip().strip("\"'").strip() - return "" - - def _extract_python_symbols(self, source_bytes: bytes, root: TreeSitterNode) -> List[Symbol]: - """Extract Python symbols from AST. 
- - Args: - source_bytes: Source code as bytes - root: Root AST node - - Returns: - List of Python symbols (classes, functions, methods) - """ - symbols: List[Symbol] = [] - - for node in self._iter_nodes(root): - if node.type == "class_definition": - name_node = node.child_by_field_name("name") - if name_node is None: - continue - symbols.append(Symbol( - name=self._node_text(source_bytes, name_node), - kind="class", - range=self._node_range(node), - )) - elif node.type in {"function_definition", "async_function_definition"}: - name_node = node.child_by_field_name("name") - if name_node is None: - continue - symbols.append(Symbol( - name=self._node_text(source_bytes, name_node), - kind=self._python_function_kind(node), - range=self._node_range(node), - )) - - return symbols - - def _extract_js_ts_symbols(self, source_bytes: bytes, root: TreeSitterNode) -> List[Symbol]: - """Extract JavaScript/TypeScript symbols from AST. - - Args: - source_bytes: Source code as bytes - root: Root AST node - - Returns: - List of JS/TS symbols (classes, functions, methods) - """ - symbols: List[Symbol] = [] - - for node in self._iter_nodes(root): - if node.type in {"class_declaration", "class"}: - name_node = node.child_by_field_name("name") - if name_node is None: - continue - symbols.append(Symbol( - name=self._node_text(source_bytes, name_node), - kind="class", - range=self._node_range(node), - )) - elif node.type in {"function_declaration", "generator_function_declaration"}: - name_node = node.child_by_field_name("name") - if name_node is None: - continue - symbols.append(Symbol( - name=self._node_text(source_bytes, name_node), - kind="function", - range=self._node_range(node), - )) - elif node.type == "variable_declarator": - name_node = node.child_by_field_name("name") - value_node = node.child_by_field_name("value") - if ( - name_node is None - or value_node is None - or name_node.type not in {"identifier", "property_identifier"} - or value_node.type != "arrow_function" - ): 
- continue - symbols.append(Symbol( - name=self._node_text(source_bytes, name_node), - kind="function", - range=self._node_range(node), - )) - elif node.type == "method_definition" and self._has_class_ancestor(node): - name_node = node.child_by_field_name("name") - if name_node is None: - continue - name = self._node_text(source_bytes, name_node) - if name == "constructor": - continue - symbols.append(Symbol( - name=name, - kind="method", - range=self._node_range(node), - )) - - return symbols - - def _python_function_kind(self, node: TreeSitterNode) -> str: - """Determine if Python function is a method or standalone function. - - Args: - node: Function definition node - - Returns: - 'method' if inside a class, 'function' otherwise - """ - parent = node.parent - while parent is not None: - if parent.type in {"function_definition", "async_function_definition"}: - return "function" - if parent.type == "class_definition": - return "method" - parent = parent.parent - return "function" - - def _has_class_ancestor(self, node: TreeSitterNode) -> bool: - """Check if node has a class ancestor. - - Args: - node: AST node to check - - Returns: - True if node is inside a class - """ - parent = node.parent - while parent is not None: - if parent.type in {"class_declaration", "class"}: - return True - parent = parent.parent - return False - - def _iter_nodes(self, root: TreeSitterNode): - """Iterate over all nodes in AST. - - Args: - root: Root node to start iteration - - Yields: - AST nodes in depth-first order - """ - stack = [root] - while stack: - node = stack.pop() - yield node - for child in reversed(node.children): - stack.append(child) - - def _node_text(self, source_bytes: bytes, node: TreeSitterNode) -> str: - """Extract text for a node. 
- - Args: - source_bytes: Source code as bytes - node: AST node - - Returns: - Text content of node - """ - return source_bytes[node.start_byte:node.end_byte].decode("utf8") - - def _node_range(self, node: TreeSitterNode) -> tuple[int, int]: - """Get line range for a node. - - Args: - node: AST node - - Returns: - (start_line, end_line) tuple, 1-based inclusive - """ - start_line = node.start_point[0] + 1 - end_line = node.end_point[0] + 1 - return (start_line, max(start_line, end_line)) - - def count_tokens(self, text: str) -> int: - """Count tokens in text. - - Args: - text: Text to count tokens for - - Returns: - Token count - """ - return self._tokenizer.count_tokens(text) diff --git a/codex-lens/src/codexlens/search/__init__.py b/codex-lens/src/codexlens/search/__init__.py deleted file mode 100644 index e8749930..00000000 --- a/codex-lens/src/codexlens/search/__init__.py +++ /dev/null @@ -1,55 +0,0 @@ -from .chain_search import ( - ChainSearchEngine, - SearchOptions, - SearchStats, - ChainSearchResult, - quick_search, -) -from .global_graph_expander import GlobalGraphExpander - -# Clustering availability flag (lazy import pattern) -CLUSTERING_AVAILABLE = False -_clustering_import_error: str | None = None - -try: - from .clustering import CLUSTERING_AVAILABLE as _clustering_flag - from .clustering import check_clustering_available - CLUSTERING_AVAILABLE = _clustering_flag -except ImportError as e: - _clustering_import_error = str(e) - - def check_clustering_available() -> tuple[bool, str | None]: - """Fallback when clustering module not loadable.""" - return False, _clustering_import_error - - -# Clustering module exports (conditional) -try: - from .clustering import ( - BaseClusteringStrategy, - ClusteringConfig, - ClusteringStrategyFactory, - get_strategy, - ) - _clustering_exports = [ - "BaseClusteringStrategy", - "ClusteringConfig", - "ClusteringStrategyFactory", - "get_strategy", - ] -except ImportError: - _clustering_exports = [] - - -__all__ = [ - 
"ChainSearchEngine", - "SearchOptions", - "SearchStats", - "ChainSearchResult", - "quick_search", - "GlobalGraphExpander", - # Clustering - "CLUSTERING_AVAILABLE", - "check_clustering_available", - *_clustering_exports, -] diff --git a/codex-lens/src/codexlens/search/association_tree/QUICK_START.md b/codex-lens/src/codexlens/search/association_tree/QUICK_START.md deleted file mode 100644 index 1874c1b2..00000000 --- a/codex-lens/src/codexlens/search/association_tree/QUICK_START.md +++ /dev/null @@ -1,257 +0,0 @@ -# Association Tree Quick Start - -## Installation - -No additional dependencies needed - uses existing CodexLens LSP infrastructure. - -## Basic Usage - -### 1. Import Components - -```python -from codexlens.lsp.standalone_manager import StandaloneLspManager -from codexlens.search.association_tree import ( - AssociationTreeBuilder, - ResultDeduplicator, -) -``` - -### 2. Build a Tree - -```python -import asyncio - -async def build_tree_example(): - # Initialize LSP manager - async with StandaloneLspManager(workspace_root="/path/to/project") as lsp: - # Create builder - builder = AssociationTreeBuilder(lsp, timeout=5.0) - - # Build tree from seed location - tree = await builder.build_tree( - seed_file_path="src/main.py", - seed_line=42, # 1-based line number - seed_character=1, # 1-based character position - max_depth=5, # Maximum recursion depth - expand_callers=True, # Find who calls this - expand_callees=True, # Find what this calls - ) - - return tree - -tree = asyncio.run(build_tree_example()) -print(f"Found {len(tree.all_nodes)} unique nodes") -``` - -### 3. 
Deduplicate and Score - -```python -# Create deduplicator -deduplicator = ResultDeduplicator( - depth_weight=0.4, # Weight for depth score (0-1) - frequency_weight=0.3, # Weight for frequency score (0-1) - kind_weight=0.3, # Weight for symbol kind score (0-1) -) - -# Extract unique nodes -unique_nodes = deduplicator.deduplicate(tree, max_results=20) - -# Print results -for node in unique_nodes: - print(f"{node.name} @ {node.file_path}:{node.range.start_line}") - print(f" Score: {node.score:.2f}, Depth: {node.min_depth}, Occurs: {node.occurrences}") -``` - -### 4. Filter Results - -```python -# Filter by symbol kind -functions = deduplicator.filter_by_kind(unique_nodes, ["function", "method"]) - -# Filter by file pattern -core_modules = deduplicator.filter_by_file(unique_nodes, ["src/core/"]) - -# Convert to JSON -json_data = deduplicator.to_dict_list(unique_nodes) -``` - -## Common Patterns - -### Pattern 1: Find All Callers - -```python -tree = await builder.build_tree( - seed_file_path=target_file, - seed_line=target_line, - max_depth=3, - expand_callers=True, # Only expand callers - expand_callees=False, # Don't expand callees -) -``` - -### Pattern 2: Find Call Chain - -```python -tree = await builder.build_tree( - seed_file_path=entry_point, - seed_line=main_line, - max_depth=10, - expand_callers=False, # Don't expand callers - expand_callees=True, # Only expand callees (call chain) -) -``` - -### Pattern 3: Full Relationship Map - -```python -tree = await builder.build_tree( - seed_file_path=target_file, - seed_line=target_line, - max_depth=5, - expand_callers=True, # Expand both directions - expand_callees=True, -) -``` - -## Configuration Tips - -### Max Depth Guidelines - -- **Depth 1-2**: Direct callers/callees only (fast, focused) -- **Depth 3-5**: Good balance of coverage and performance (recommended) -- **Depth 6-10**: Deep exploration (slower, may hit cycles) - -### Timeout Settings - -```python -builder = AssociationTreeBuilder( - lsp, - timeout=5.0, 
# 5 seconds per LSP request -) - -# For slower language servers -builder = AssociationTreeBuilder(lsp, timeout=10.0) -``` - -### Score Weight Tuning - -```python -# Emphasize proximity to seed -deduplicator = ResultDeduplicator( - depth_weight=0.7, # High weight for depth - frequency_weight=0.2, - kind_weight=0.1, -) - -# Emphasize frequently-called functions -deduplicator = ResultDeduplicator( - depth_weight=0.2, - frequency_weight=0.7, # High weight for frequency - kind_weight=0.1, -) -``` - -## Error Handling - -```python -try: - tree = await builder.build_tree(...) - - if not tree.all_nodes: - print("No call hierarchy found - LSP may not support this file type") - -except asyncio.TimeoutError: - print("LSP request timed out - try increasing timeout") - -except Exception as e: - print(f"Error building tree: {e}") -``` - -## Performance Optimization - -### 1. Limit Depth - -```python -# Fast: max_depth=3 -tree = await builder.build_tree(..., max_depth=3) -``` - -### 2. Filter Early - -```python -# Get all nodes -unique_nodes = deduplicator.deduplicate(tree) - -# Filter to relevant kinds immediately -functions = deduplicator.filter_by_kind(unique_nodes, ["function", "method"]) -``` - -### 3. 
Use Timeouts - -```python -# Set aggressive timeouts for fast iteration -builder = AssociationTreeBuilder(lsp, timeout=3.0) -``` - -## Common Issues - -### Issue: Empty Tree Returned - -**Causes**: -- File not supported by LSP server -- No call hierarchy at that position -- Position is not on a function/method - -**Solutions**: -- Verify LSP server supports the language -- Check that position is on a function definition -- Try different seed locations - -### Issue: Timeout Errors - -**Causes**: -- LSP server slow or overloaded -- Network/connection issues -- Max depth too high - -**Solutions**: -- Increase timeout value -- Reduce max_depth -- Check LSP server health - -### Issue: Cycle Detected - -**Behavior**: Cycles are automatically detected and marked - -**Example**: -```python -for node in tree.node_list: - if node.is_cycle: - print(f"Cycle detected at {node.item.name}") -``` - -## Testing - -Run the test suite: - -```bash -# All tests -pytest tests/test_association_tree.py -v - -# Specific test -pytest tests/test_association_tree.py::test_simple_tree_building -v -``` - -## Demo Script - -Run the demo: - -```bash -python examples/association_tree_demo.py -``` - -## Further Reading - -- [Full Documentation](README.md) -- [Implementation Summary](../../ASSOCIATION_TREE_IMPLEMENTATION.md) -- [LSP Manager Documentation](../../lsp/standalone_manager.py) diff --git a/codex-lens/src/codexlens/search/association_tree/README.md b/codex-lens/src/codexlens/search/association_tree/README.md deleted file mode 100644 index b9e180f6..00000000 --- a/codex-lens/src/codexlens/search/association_tree/README.md +++ /dev/null @@ -1,188 +0,0 @@ -# Association Tree Module - -LSP-based code relationship discovery using call hierarchy. - -## Overview - -This module provides components for building and analyzing call relationship trees using Language Server Protocol (LSP) call hierarchy capabilities. It consists of three main components: - -1. 
**Data Structures** (`data_structures.py`) - Core data classes -2. **Association Tree Builder** (`builder.py`) - Tree construction via LSP -3. **Result Deduplicator** (`deduplicator.py`) - Node extraction and scoring - -## Components - -### 1. Data Structures - -**TreeNode**: Represents a single node in the call tree. -- Contains LSP CallHierarchyItem -- Tracks depth, parents, children -- Detects and marks cycles - -**CallTree**: Complete tree structure with roots and edges. -- Stores all discovered nodes -- Tracks edges (call relationships) -- Provides lookup by node_id - -**UniqueNode**: Deduplicated code symbol with metadata. -- Aggregates multiple occurrences -- Tracks minimum depth -- Contains relevance score - -### 2. AssociationTreeBuilder - -Builds call trees using LSP call hierarchy: - -**Strategy**: -- Depth-first recursive expansion -- Supports expanding callers (incoming calls) and callees (outgoing calls) -- Detects and marks circular references -- Respects max_depth limit - -**Key Features**: -- Async/await for concurrent LSP requests -- Timeout handling (5s per node) -- Graceful error handling -- Cycle detection via visited set - -### 3. 
ResultDeduplicator - -Extracts unique nodes from trees and assigns scores: - -**Scoring Factors**: -- **Depth** (40%): Shallower = more relevant -- **Frequency** (30%): More occurrences = more important -- **Kind** (30%): function/method > class > variable - -**Features**: -- Merges duplicate nodes by (file_path, start_line, end_line) -- Tracks all paths to each node -- Supports filtering by kind or file pattern -- Configurable score weights - -## Usage Example - -```python -import asyncio -from codexlens.lsp.standalone_manager import StandaloneLspManager -from codexlens.search.association_tree import ( - AssociationTreeBuilder, - ResultDeduplicator, -) - -async def main(): - # Initialize LSP manager - async with StandaloneLspManager(workspace_root="/path/to/project") as lsp: - # Create tree builder - builder = AssociationTreeBuilder(lsp, timeout=5.0) - - # Build tree from seed location - tree = await builder.build_tree( - seed_file_path="src/main.py", - seed_line=42, - seed_character=1, - max_depth=5, - expand_callers=True, # Find who calls this - expand_callees=True, # Find what this calls - ) - - print(f"Tree: {tree}") - print(f" Roots: {len(tree.roots)}") - print(f" Total nodes: {len(tree.all_nodes)}") - print(f" Edges: {len(tree.edges)}") - - # Deduplicate and score - deduplicator = ResultDeduplicator( - depth_weight=0.4, - frequency_weight=0.3, - kind_weight=0.3, - ) - - unique_nodes = deduplicator.deduplicate(tree, max_results=20) - - print(f"\nTop unique nodes:") - for node in unique_nodes[:10]: - print(f" {node.name} ({node.file_path}:{node.range.start_line})") - print(f" Depth: {node.min_depth}, Occurrences: {node.occurrences}, Score: {node.score:.2f}") - - # Filter by kind - functions_only = deduplicator.filter_by_kind(unique_nodes, ["function", "method"]) - print(f"\nFunctions/methods: {len(functions_only)}") - -asyncio.run(main()) -``` - -## Integration with Hybrid Search - -The association tree can be integrated with the hybrid search engine: - 
-```python -from codexlens.search.hybrid_search import HybridSearchEngine - -async def search_with_association_tree(query: str): - # 1. Get seed results from vector search - search_engine = HybridSearchEngine() - seed_results = await search_engine.search(query, limit=5) - - # 2. Build association trees from top results - builder = AssociationTreeBuilder(lsp_manager) - trees = [] - - for result in seed_results: - tree = await builder.build_tree( - seed_file_path=result.file_path, - seed_line=result.line, - max_depth=3, - ) - trees.append(tree) - - # 3. Merge and deduplicate - merged_tree = merge_trees(trees) # Custom merge logic - deduplicator = ResultDeduplicator() - unique_nodes = deduplicator.deduplicate(merged_tree, max_results=50) - - # 4. Convert to search results - final_results = convert_to_search_results(unique_nodes) - - return final_results -``` - -## Testing - -Run the test suite: - -```bash -pytest tests/test_association_tree.py -v -``` - -Test coverage includes: -- Simple tree building -- Cycle detection -- Max depth limits -- Empty trees -- Deduplication logic -- Scoring algorithms -- Filtering operations - -## Performance Considerations - -1. **LSP Timeouts**: Set appropriate timeout values (default 5s) -2. **Max Depth**: Limit depth to avoid exponential expansion (recommended: 3-5) -3. **Caching**: LSP manager caches open documents -4. 
**Parallel Expansion**: Incoming/outgoing calls fetched in parallel - -## Error Handling - -The builder gracefully handles: -- LSP timeout errors (logs warning, continues) -- Missing call hierarchy support (returns empty) -- Network/connection failures (skips node) -- Invalid LSP responses (logs error, skips) - -## Future Enhancements - -- [ ] Multi-root tree building from multiple seeds -- [ ] Custom scoring functions -- [ ] Graph visualization export -- [ ] Incremental tree updates -- [ ] Cross-file relationship analysis diff --git a/codex-lens/src/codexlens/search/association_tree/__init__.py b/codex-lens/src/codexlens/search/association_tree/__init__.py deleted file mode 100644 index 9557af33..00000000 --- a/codex-lens/src/codexlens/search/association_tree/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -"""Association tree module for LSP-based code relationship discovery. - -This module provides components for building and processing call association trees -using Language Server Protocol (LSP) call hierarchy capabilities. -""" - -from .builder import AssociationTreeBuilder -from .data_structures import ( - CallTree, - TreeNode, - UniqueNode, -) -from .deduplicator import ResultDeduplicator - -__all__ = [ - "AssociationTreeBuilder", - "CallTree", - "TreeNode", - "UniqueNode", - "ResultDeduplicator", -] diff --git a/codex-lens/src/codexlens/search/association_tree/builder.py b/codex-lens/src/codexlens/search/association_tree/builder.py deleted file mode 100644 index 894a8e20..00000000 --- a/codex-lens/src/codexlens/search/association_tree/builder.py +++ /dev/null @@ -1,450 +0,0 @@ -"""Association tree builder using LSP call hierarchy. - -Builds call relationship trees by recursively expanding from seed locations -using Language Server Protocol (LSP) call hierarchy capabilities. 
-""" - -from __future__ import annotations - -import asyncio -import logging -from pathlib import Path -from typing import Dict, List, Optional, Set - -from codexlens.hybrid_search.data_structures import CallHierarchyItem, Range -from codexlens.lsp.standalone_manager import StandaloneLspManager -from .data_structures import CallTree, TreeNode - -logger = logging.getLogger(__name__) - - -class AssociationTreeBuilder: - """Builds association trees from seed locations using LSP call hierarchy. - - Uses depth-first recursive expansion to build a tree of code relationships - starting from seed locations (typically from vector search results). - - Strategy: - - Start from seed locations (vector search results) - - For each seed, get call hierarchy items via LSP - - Recursively expand incoming calls (callers) if expand_callers=True - - Recursively expand outgoing calls (callees) if expand_callees=True - - Track visited nodes to prevent cycles - - Stop at max_depth or when no more relations found - - Attributes: - lsp_manager: StandaloneLspManager for LSP communication - visited: Set of visited node IDs to prevent cycles - timeout: Timeout for individual LSP requests (seconds) - """ - - def __init__( - self, - lsp_manager: StandaloneLspManager, - timeout: float = 5.0, - analysis_wait: float = 2.0, - ): - """Initialize AssociationTreeBuilder. 
- - Args: - lsp_manager: StandaloneLspManager instance for LSP communication - timeout: Timeout for individual LSP requests in seconds - analysis_wait: Time to wait for LSP analysis on first file (seconds) - """ - self.lsp_manager = lsp_manager - self.timeout = timeout - self.analysis_wait = analysis_wait - self.visited: Set[str] = set() - self._analyzed_files: Set[str] = set() # Track files already analyzed - - async def build_tree( - self, - seed_file_path: str, - seed_line: int, - seed_character: int = 1, - max_depth: int = 5, - expand_callers: bool = True, - expand_callees: bool = True, - ) -> CallTree: - """Build call tree from a single seed location. - - Args: - seed_file_path: Path to the seed file - seed_line: Line number of the seed symbol (1-based) - seed_character: Character position (1-based, default 1) - max_depth: Maximum recursion depth (default 5) - expand_callers: Whether to expand incoming calls (callers) - expand_callees: Whether to expand outgoing calls (callees) - - Returns: - CallTree containing all discovered nodes and relationships - """ - tree = CallTree() - self.visited.clear() - - # Determine wait time - only wait for analysis on first encounter of file - wait_time = 0.0 - if seed_file_path not in self._analyzed_files: - wait_time = self.analysis_wait - self._analyzed_files.add(seed_file_path) - - # Get call hierarchy items for the seed position - try: - hierarchy_items = await asyncio.wait_for( - self.lsp_manager.get_call_hierarchy_items( - file_path=seed_file_path, - line=seed_line, - character=seed_character, - wait_for_analysis=wait_time, - ), - timeout=self.timeout + wait_time, - ) - except asyncio.TimeoutError: - logger.warning( - "Timeout getting call hierarchy items for %s:%d", - seed_file_path, - seed_line, - ) - return tree - except Exception as e: - logger.error( - "Error getting call hierarchy items for %s:%d: %s", - seed_file_path, - seed_line, - e, - ) - return tree - - if not hierarchy_items: - logger.debug( - "No call 
hierarchy items found for %s:%d", - seed_file_path, - seed_line, - ) - return tree - - # Create root nodes from hierarchy items - for item_dict in hierarchy_items: - # Convert LSP dict to CallHierarchyItem - item = self._dict_to_call_hierarchy_item(item_dict) - if not item: - continue - - root_node = TreeNode( - item=item, - depth=0, - path_from_root=[self._create_node_id(item)], - ) - tree.roots.append(root_node) - tree.add_node(root_node) - - # Mark as visited - self.visited.add(root_node.node_id) - - # Recursively expand the tree - await self._expand_node( - node=root_node, - node_dict=item_dict, - tree=tree, - current_depth=0, - max_depth=max_depth, - expand_callers=expand_callers, - expand_callees=expand_callees, - ) - - tree.depth_reached = max_depth - return tree - - async def _expand_node( - self, - node: TreeNode, - node_dict: Dict, - tree: CallTree, - current_depth: int, - max_depth: int, - expand_callers: bool, - expand_callees: bool, - ) -> None: - """Recursively expand a node by fetching its callers and callees. 
- - Args: - node: TreeNode to expand - node_dict: LSP CallHierarchyItem dict (for LSP requests) - tree: CallTree to add discovered nodes to - current_depth: Current recursion depth - max_depth: Maximum allowed depth - expand_callers: Whether to expand incoming calls - expand_callees: Whether to expand outgoing calls - """ - # Stop if max depth reached - if current_depth >= max_depth: - return - - # Prepare tasks for parallel expansion - tasks = [] - - if expand_callers: - tasks.append( - self._expand_incoming_calls( - node=node, - node_dict=node_dict, - tree=tree, - current_depth=current_depth, - max_depth=max_depth, - expand_callers=expand_callers, - expand_callees=expand_callees, - ) - ) - - if expand_callees: - tasks.append( - self._expand_outgoing_calls( - node=node, - node_dict=node_dict, - tree=tree, - current_depth=current_depth, - max_depth=max_depth, - expand_callers=expand_callers, - expand_callees=expand_callees, - ) - ) - - # Execute expansions in parallel - if tasks: - await asyncio.gather(*tasks, return_exceptions=True) - - async def _expand_incoming_calls( - self, - node: TreeNode, - node_dict: Dict, - tree: CallTree, - current_depth: int, - max_depth: int, - expand_callers: bool, - expand_callees: bool, - ) -> None: - """Expand incoming calls (callers) for a node. 
- - Args: - node: TreeNode being expanded - node_dict: LSP dict for the node - tree: CallTree to add nodes to - current_depth: Current depth - max_depth: Maximum depth - expand_callers: Whether to continue expanding callers - expand_callees: Whether to expand callees - """ - try: - incoming_calls = await asyncio.wait_for( - self.lsp_manager.get_incoming_calls(item=node_dict), - timeout=self.timeout, - ) - except asyncio.TimeoutError: - logger.debug("Timeout getting incoming calls for %s", node.node_id) - return - except Exception as e: - logger.debug("Error getting incoming calls for %s: %s", node.node_id, e) - return - - if not incoming_calls: - return - - # Process each incoming call - for call_dict in incoming_calls: - caller_dict = call_dict.get("from") - if not caller_dict: - continue - - # Convert to CallHierarchyItem - caller_item = self._dict_to_call_hierarchy_item(caller_dict) - if not caller_item: - continue - - caller_id = self._create_node_id(caller_item) - - # Check for cycles - if caller_id in self.visited: - # Create cycle marker node - cycle_node = TreeNode( - item=caller_item, - depth=current_depth + 1, - is_cycle=True, - path_from_root=node.path_from_root + [caller_id], - ) - node.parents.append(cycle_node) - continue - - # Create new caller node - caller_node = TreeNode( - item=caller_item, - depth=current_depth + 1, - path_from_root=node.path_from_root + [caller_id], - ) - - # Add to tree - tree.add_node(caller_node) - tree.add_edge(caller_node, node) - - # Update relationships - node.parents.append(caller_node) - caller_node.children.append(node) - - # Mark as visited - self.visited.add(caller_id) - - # Recursively expand the caller - await self._expand_node( - node=caller_node, - node_dict=caller_dict, - tree=tree, - current_depth=current_depth + 1, - max_depth=max_depth, - expand_callers=expand_callers, - expand_callees=expand_callees, - ) - - async def _expand_outgoing_calls( - self, - node: TreeNode, - node_dict: Dict, - tree: CallTree, - 
current_depth: int, - max_depth: int, - expand_callers: bool, - expand_callees: bool, - ) -> None: - """Expand outgoing calls (callees) for a node. - - Args: - node: TreeNode being expanded - node_dict: LSP dict for the node - tree: CallTree to add nodes to - current_depth: Current depth - max_depth: Maximum depth - expand_callers: Whether to expand callers - expand_callees: Whether to continue expanding callees - """ - try: - outgoing_calls = await asyncio.wait_for( - self.lsp_manager.get_outgoing_calls(item=node_dict), - timeout=self.timeout, - ) - except asyncio.TimeoutError: - logger.debug("Timeout getting outgoing calls for %s", node.node_id) - return - except Exception as e: - logger.debug("Error getting outgoing calls for %s: %s", node.node_id, e) - return - - if not outgoing_calls: - return - - # Process each outgoing call - for call_dict in outgoing_calls: - callee_dict = call_dict.get("to") - if not callee_dict: - continue - - # Convert to CallHierarchyItem - callee_item = self._dict_to_call_hierarchy_item(callee_dict) - if not callee_item: - continue - - callee_id = self._create_node_id(callee_item) - - # Check for cycles - if callee_id in self.visited: - # Create cycle marker node - cycle_node = TreeNode( - item=callee_item, - depth=current_depth + 1, - is_cycle=True, - path_from_root=node.path_from_root + [callee_id], - ) - node.children.append(cycle_node) - continue - - # Create new callee node - callee_node = TreeNode( - item=callee_item, - depth=current_depth + 1, - path_from_root=node.path_from_root + [callee_id], - ) - - # Add to tree - tree.add_node(callee_node) - tree.add_edge(node, callee_node) - - # Update relationships - node.children.append(callee_node) - callee_node.parents.append(node) - - # Mark as visited - self.visited.add(callee_id) - - # Recursively expand the callee - await self._expand_node( - node=callee_node, - node_dict=callee_dict, - tree=tree, - current_depth=current_depth + 1, - max_depth=max_depth, - 
expand_callers=expand_callers, - expand_callees=expand_callees, - ) - - def _dict_to_call_hierarchy_item( - self, item_dict: Dict - ) -> Optional[CallHierarchyItem]: - """Convert LSP dict to CallHierarchyItem. - - Args: - item_dict: LSP CallHierarchyItem dictionary - - Returns: - CallHierarchyItem or None if conversion fails - """ - try: - # Extract URI and convert to file path - uri = item_dict.get("uri", "") - file_path = uri.replace("file:///", "").replace("file://", "") - - # Handle Windows paths (file:///C:/...) - if len(file_path) > 2 and file_path[0] == "/" and file_path[2] == ":": - file_path = file_path[1:] - - # Extract range - range_dict = item_dict.get("range", {}) - start = range_dict.get("start", {}) - end = range_dict.get("end", {}) - - # Create Range (convert from 0-based to 1-based) - item_range = Range( - start_line=start.get("line", 0) + 1, - start_character=start.get("character", 0) + 1, - end_line=end.get("line", 0) + 1, - end_character=end.get("character", 0) + 1, - ) - - return CallHierarchyItem( - name=item_dict.get("name", "unknown"), - kind=str(item_dict.get("kind", "unknown")), - file_path=file_path, - range=item_range, - detail=item_dict.get("detail"), - ) - - except Exception as e: - logger.debug("Failed to convert dict to CallHierarchyItem: %s", e) - return None - - def _create_node_id(self, item: CallHierarchyItem) -> str: - """Create unique node ID from CallHierarchyItem. - - Args: - item: CallHierarchyItem - - Returns: - Unique node ID string - """ - return f"{item.file_path}:{item.name}:{item.range.start_line}" diff --git a/codex-lens/src/codexlens/search/association_tree/data_structures.py b/codex-lens/src/codexlens/search/association_tree/data_structures.py deleted file mode 100644 index 2c8b47fa..00000000 --- a/codex-lens/src/codexlens/search/association_tree/data_structures.py +++ /dev/null @@ -1,191 +0,0 @@ -"""Data structures for association tree building. 
- -Defines the core data classes for representing call hierarchy trees and -deduplicated results. -""" - -from __future__ import annotations - -from dataclasses import dataclass, field -from typing import Any, Dict, List, Optional - -from codexlens.hybrid_search.data_structures import CallHierarchyItem, Range - - -@dataclass -class TreeNode: - """Node in the call association tree. - - Represents a single function/method in the tree, including its position - in the hierarchy and relationships. - - Attributes: - item: LSP CallHierarchyItem containing symbol information - depth: Distance from the root node (seed) - 0 for roots - children: List of child nodes (functions called by this node) - parents: List of parent nodes (functions that call this node) - is_cycle: Whether this node creates a circular reference - path_from_root: Path (list of node IDs) from root to this node - """ - - item: CallHierarchyItem - depth: int = 0 - children: List[TreeNode] = field(default_factory=list) - parents: List[TreeNode] = field(default_factory=list) - is_cycle: bool = False - path_from_root: List[str] = field(default_factory=list) - - @property - def node_id(self) -> str: - """Unique identifier for this node.""" - return f"{self.item.file_path}:{self.item.name}:{self.item.range.start_line}" - - def __hash__(self) -> int: - """Hash based on node ID.""" - return hash(self.node_id) - - def __eq__(self, other: object) -> bool: - """Equality based on node ID.""" - if not isinstance(other, TreeNode): - return False - return self.node_id == other.node_id - - def __repr__(self) -> str: - """String representation of the node.""" - cycle_marker = " [CYCLE]" if self.is_cycle else "" - return f"TreeNode({self.item.name}@{self.item.file_path}:{self.item.range.start_line}){cycle_marker}" - - -@dataclass -class CallTree: - """Complete call tree structure built from seeds. - - Contains all nodes discovered through recursive expansion and - the relationships between them. 
- - Attributes: - roots: List of root nodes (seed symbols) - all_nodes: Dictionary mapping node_id -> TreeNode for quick lookup - node_list: Flat list of all nodes in tree order - edges: List of (from_node_id, to_node_id) tuples representing calls - depth_reached: Maximum depth achieved in expansion - """ - - roots: List[TreeNode] = field(default_factory=list) - all_nodes: Dict[str, TreeNode] = field(default_factory=dict) - node_list: List[TreeNode] = field(default_factory=list) - edges: List[tuple[str, str]] = field(default_factory=list) - depth_reached: int = 0 - - def add_node(self, node: TreeNode) -> None: - """Add a node to the tree. - - Args: - node: TreeNode to add - """ - if node.node_id not in self.all_nodes: - self.all_nodes[node.node_id] = node - self.node_list.append(node) - - def add_edge(self, from_node: TreeNode, to_node: TreeNode) -> None: - """Add an edge between two nodes. - - Args: - from_node: Source node - to_node: Target node - """ - edge = (from_node.node_id, to_node.node_id) - if edge not in self.edges: - self.edges.append(edge) - - def get_node(self, node_id: str) -> Optional[TreeNode]: - """Get a node by ID. - - Args: - node_id: Node identifier - - Returns: - TreeNode if found, None otherwise - """ - return self.all_nodes.get(node_id) - - def __len__(self) -> int: - """Return total number of nodes in tree.""" - return len(self.all_nodes) - - def __repr__(self) -> str: - """String representation of the tree.""" - return ( - f"CallTree(roots={len(self.roots)}, nodes={len(self.all_nodes)}, " - f"depth={self.depth_reached})" - ) - - -@dataclass -class UniqueNode: - """Deduplicated unique code symbol from the tree. - - Represents a single unique code location that may appear multiple times - in the tree under different contexts. Contains aggregated information - about all occurrences. - - Attributes: - file_path: Absolute path to the file - name: Symbol name (function, method, class, etc.) - kind: Symbol kind (function, method, class, etc.) 
- range: Code range in the file - min_depth: Minimum depth at which this node appears in the tree - occurrences: Number of times this node appears in the tree - paths: List of paths from roots to this node - context_nodes: Related nodes from the tree - score: Composite relevance score (higher is better) - """ - - file_path: str - name: str - kind: str - range: Range - min_depth: int = 0 - occurrences: int = 1 - paths: List[List[str]] = field(default_factory=list) - context_nodes: List[str] = field(default_factory=list) - score: float = 0.0 - - @property - def node_key(self) -> tuple[str, int, int]: - """Unique key for deduplication. - - Uses (file_path, start_line, end_line) as the unique identifier - for this symbol across all occurrences. - """ - return ( - self.file_path, - self.range.start_line, - self.range.end_line, - ) - - def add_path(self, path: List[str]) -> None: - """Add a path from root to this node. - - Args: - path: List of node IDs from root to this node - """ - if path not in self.paths: - self.paths.append(path) - - def __hash__(self) -> int: - """Hash based on node key.""" - return hash(self.node_key) - - def __eq__(self, other: object) -> bool: - """Equality based on node key.""" - if not isinstance(other, UniqueNode): - return False - return self.node_key == other.node_key - - def __repr__(self) -> str: - """String representation of the unique node.""" - return ( - f"UniqueNode({self.name}@{self.file_path}:{self.range.start_line}, " - f"depth={self.min_depth}, occ={self.occurrences}, score={self.score:.2f})" - ) diff --git a/codex-lens/src/codexlens/search/association_tree/deduplicator.py b/codex-lens/src/codexlens/search/association_tree/deduplicator.py deleted file mode 100644 index 9e590518..00000000 --- a/codex-lens/src/codexlens/search/association_tree/deduplicator.py +++ /dev/null @@ -1,301 +0,0 @@ -"""Result deduplication for association tree nodes. 
- -Provides functionality to extract unique nodes from a call tree and assign -relevance scores based on various factors. -""" - -from __future__ import annotations - -import logging -from typing import Dict, List, Optional - -from .data_structures import ( - CallTree, - TreeNode, - UniqueNode, -) - -logger = logging.getLogger(__name__) - - -# Symbol kind weights for scoring (higher = more relevant) -KIND_WEIGHTS: Dict[str, float] = { - # Functions and methods are primary targets - "function": 1.0, - "method": 1.0, - "12": 1.0, # LSP SymbolKind.Function - "6": 1.0, # LSP SymbolKind.Method - # Classes are important but secondary - "class": 0.8, - "5": 0.8, # LSP SymbolKind.Class - # Interfaces and types - "interface": 0.7, - "11": 0.7, # LSP SymbolKind.Interface - "type": 0.6, - # Constructors - "constructor": 0.9, - "9": 0.9, # LSP SymbolKind.Constructor - # Variables and constants - "variable": 0.4, - "13": 0.4, # LSP SymbolKind.Variable - "constant": 0.5, - "14": 0.5, # LSP SymbolKind.Constant - # Default for unknown kinds - "unknown": 0.3, -} - - -class ResultDeduplicator: - """Extracts and scores unique nodes from call trees. - - Processes a CallTree to extract unique code locations, merging duplicates - and assigning relevance scores based on: - - Depth: Shallower nodes (closer to seeds) score higher - - Frequency: Nodes appearing multiple times score higher - - Kind: Function/method > class > variable - - Attributes: - depth_weight: Weight for depth factor in scoring (default 0.4) - frequency_weight: Weight for frequency factor (default 0.3) - kind_weight: Weight for symbol kind factor (default 0.3) - max_depth_penalty: Maximum depth before full penalty applied - """ - - def __init__( - self, - depth_weight: float = 0.4, - frequency_weight: float = 0.3, - kind_weight: float = 0.3, - max_depth_penalty: int = 10, - ): - """Initialize ResultDeduplicator. 
- - Args: - depth_weight: Weight for depth factor (0.0-1.0) - frequency_weight: Weight for frequency factor (0.0-1.0) - kind_weight: Weight for symbol kind factor (0.0-1.0) - max_depth_penalty: Depth at which score becomes 0 for depth factor - """ - self.depth_weight = depth_weight - self.frequency_weight = frequency_weight - self.kind_weight = kind_weight - self.max_depth_penalty = max_depth_penalty - - def deduplicate( - self, - tree: CallTree, - max_results: Optional[int] = None, - ) -> List[UniqueNode]: - """Extract unique nodes from the call tree. - - Traverses the tree, groups nodes by their unique key (file_path, - start_line, end_line), and merges duplicate occurrences. - - Args: - tree: CallTree to process - max_results: Maximum number of results to return (None = all) - - Returns: - List of UniqueNode objects, sorted by score descending - """ - if not tree.node_list: - return [] - - # Group nodes by unique key - unique_map: Dict[tuple, UniqueNode] = {} - - for node in tree.node_list: - if node.is_cycle: - # Skip cycle markers - they point to already-counted nodes - continue - - key = self._get_node_key(node) - - if key in unique_map: - # Update existing unique node - unique_node = unique_map[key] - unique_node.occurrences += 1 - unique_node.min_depth = min(unique_node.min_depth, node.depth) - unique_node.add_path(node.path_from_root) - - # Collect context from relationships - for parent in node.parents: - if not parent.is_cycle: - unique_node.context_nodes.append(parent.node_id) - for child in node.children: - if not child.is_cycle: - unique_node.context_nodes.append(child.node_id) - else: - # Create new unique node - unique_node = UniqueNode( - file_path=node.item.file_path, - name=node.item.name, - kind=node.item.kind, - range=node.item.range, - min_depth=node.depth, - occurrences=1, - paths=[node.path_from_root.copy()], - context_nodes=[], - score=0.0, - ) - - # Collect initial context - for parent in node.parents: - if not parent.is_cycle: - 
unique_node.context_nodes.append(parent.node_id) - for child in node.children: - if not child.is_cycle: - unique_node.context_nodes.append(child.node_id) - - unique_map[key] = unique_node - - # Calculate scores for all unique nodes - unique_nodes = list(unique_map.values()) - - # Find max frequency for normalization - max_frequency = max((n.occurrences for n in unique_nodes), default=1) - - for node in unique_nodes: - node.score = self._score_node(node, max_frequency) - - # Sort by score descending - unique_nodes.sort(key=lambda n: n.score, reverse=True) - - # Apply max_results limit - if max_results is not None and max_results > 0: - unique_nodes = unique_nodes[:max_results] - - logger.debug( - "Deduplicated %d tree nodes to %d unique nodes", - len(tree.node_list), - len(unique_nodes), - ) - - return unique_nodes - - def _score_node( - self, - node: UniqueNode, - max_frequency: int, - ) -> float: - """Calculate composite score for a unique node. - - Score = depth_weight * depth_score + - frequency_weight * frequency_score + - kind_weight * kind_score - - Args: - node: UniqueNode to score - max_frequency: Maximum occurrence count for normalization - - Returns: - Composite score between 0.0 and 1.0 - """ - # Depth score: closer to root = higher score - # Score of 1.0 at depth 0, decreasing to 0.0 at max_depth_penalty - depth_score = max( - 0.0, - 1.0 - (node.min_depth / self.max_depth_penalty), - ) - - # Frequency score: more occurrences = higher score - frequency_score = node.occurrences / max_frequency if max_frequency > 0 else 0.0 - - # Kind score: function/method > class > variable - kind_str = str(node.kind).lower() - kind_score = KIND_WEIGHTS.get(kind_str, KIND_WEIGHTS["unknown"]) - - # Composite score - score = ( - self.depth_weight * depth_score - + self.frequency_weight * frequency_score - + self.kind_weight * kind_score - ) - - return score - - def _get_node_key(self, node: TreeNode) -> tuple: - """Get unique key for a tree node. 
- - Uses (file_path, start_line, end_line) as the unique identifier. - - Args: - node: TreeNode - - Returns: - Tuple key for deduplication - """ - return ( - node.item.file_path, - node.item.range.start_line, - node.item.range.end_line, - ) - - def filter_by_kind( - self, - nodes: List[UniqueNode], - kinds: List[str], - ) -> List[UniqueNode]: - """Filter unique nodes by symbol kind. - - Args: - nodes: List of UniqueNode to filter - kinds: List of allowed kinds (e.g., ["function", "method"]) - - Returns: - Filtered list of UniqueNode - """ - kinds_lower = [k.lower() for k in kinds] - return [ - node - for node in nodes - if str(node.kind).lower() in kinds_lower - ] - - def filter_by_file( - self, - nodes: List[UniqueNode], - file_patterns: List[str], - ) -> List[UniqueNode]: - """Filter unique nodes by file path patterns. - - Args: - nodes: List of UniqueNode to filter - file_patterns: List of path substrings to match - - Returns: - Filtered list of UniqueNode - """ - return [ - node - for node in nodes - if any(pattern in node.file_path for pattern in file_patterns) - ] - - def to_dict_list(self, nodes: List[UniqueNode]) -> List[Dict]: - """Convert list of UniqueNode to JSON-serializable dicts. 
- - Args: - nodes: List of UniqueNode - - Returns: - List of dictionaries - """ - return [ - { - "file_path": node.file_path, - "name": node.name, - "kind": node.kind, - "range": { - "start_line": node.range.start_line, - "start_character": node.range.start_character, - "end_line": node.range.end_line, - "end_character": node.range.end_character, - }, - "min_depth": node.min_depth, - "occurrences": node.occurrences, - "path_count": len(node.paths), - "score": round(node.score, 4), - } - for node in nodes - ] diff --git a/codex-lens/src/codexlens/search/binary_searcher.py b/codex-lens/src/codexlens/search/binary_searcher.py deleted file mode 100644 index 30ab55b3..00000000 --- a/codex-lens/src/codexlens/search/binary_searcher.py +++ /dev/null @@ -1,309 +0,0 @@ -"""Binary vector searcher for cascade search. - -This module provides fast binary vector search using Hamming distance -for the first stage of cascade search (coarse filtering). - -Supports two loading modes: -1. Memory-mapped file (preferred): Low memory footprint, OS-managed paging -2. Database loading (fallback): Loads all vectors into RAM -""" - -from __future__ import annotations - -import json -import logging -from pathlib import Path -from typing import List, Optional, Tuple - -import numpy as np - -logger = logging.getLogger(__name__) - -# Pre-computed popcount lookup table for vectorized Hamming distance -# Each byte value (0-255) maps to its bit count -_POPCOUNT_TABLE = np.array([bin(i).count('1') for i in range(256)], dtype=np.uint8) - - -class BinarySearcher: - """Fast binary vector search using Hamming distance. - - This class implements the first stage of cascade search: - fast, approximate retrieval using binary vectors and Hamming distance. 
- - The binary vectors are derived from dense embeddings by thresholding: - binary[i] = 1 if dense[i] > 0 else 0 - - Hamming distance between two binary vectors counts the number of - differing bits, which can be computed very efficiently using XOR - and population count. - - Supports two loading modes: - - Memory-mapped file (preferred): Uses np.memmap for minimal RAM usage - - Database (fallback): Loads all vectors into memory from SQLite - """ - - def __init__(self, index_root_or_meta_path: Path) -> None: - """Initialize BinarySearcher. - - Args: - index_root_or_meta_path: Either: - - Path to index root directory (containing _binary_vectors.mmap) - - Path to _vectors_meta.db (legacy mode, loads from DB) - """ - path = Path(index_root_or_meta_path) - - # Determine if this is an index root or a specific DB path - if path.suffix == '.db': - # Legacy mode: specific DB path - self.index_root = path.parent - self.meta_store_path = path - else: - # New mode: index root directory - self.index_root = path - self.meta_store_path = path / "_vectors_meta.db" - - self._chunk_ids: Optional[np.ndarray] = None - self._binary_matrix: Optional[np.ndarray] = None - self._is_memmap = False - self._loaded = False - self._embedding_dim: Optional[int] = None - self._backend: Optional[str] = None - self._model: Optional[str] = None - self._model_profile: Optional[str] = None - - def load(self) -> bool: - """Load binary vectors using memory-mapped file or database fallback. - - Tries to load from memory-mapped file first (preferred for large indexes), - falls back to database loading if mmap file doesn't exist. - - Returns: - True if vectors were loaded successfully. 
- """ - if self._loaded: - return True - - # Try memory-mapped file first (preferred) - mmap_path = self.index_root / "_binary_vectors.mmap" - meta_path = mmap_path.with_suffix('.meta.json') - - if mmap_path.exists() and meta_path.exists(): - try: - with open(meta_path, 'r') as f: - meta = json.load(f) - - shape = tuple(meta['shape']) - self._chunk_ids = np.array(meta['chunk_ids'], dtype=np.int64) - self._embedding_dim = meta.get("embedding_dim") - self._backend = meta.get("backend") - self._model = meta.get("model") or meta.get("model_name") - self._model_profile = meta.get("model_profile") - - # Memory-map the binary matrix (read-only) - self._binary_matrix = np.memmap( - str(mmap_path), - dtype=np.uint8, - mode='r', - shape=shape - ) - self._is_memmap = True - self._loaded = True - - logger.info( - "Memory-mapped %d binary vectors (%d bytes each)", - len(self._chunk_ids), shape[1] - ) - return True - - except Exception as e: - logger.warning("Failed to load mmap binary vectors, falling back to DB: %s", e) - - # Fallback: load from database - return self._load_from_db() - - def _load_from_db(self) -> bool: - """Load binary vectors from database (legacy/fallback mode). - - Returns: - True if vectors were loaded successfully. 
- """ - try: - from codexlens.storage.vector_meta_store import VectorMetadataStore - - with VectorMetadataStore(self.meta_store_path) as store: - rows = store.get_all_binary_vectors() - - if not rows: - logger.warning("No binary vectors found in %s", self.meta_store_path) - return False - - # Convert to numpy arrays for fast computation - self._chunk_ids = np.array([r[0] for r in rows], dtype=np.int64) - - # Unpack bytes to numpy array - binary_arrays = [] - for _, vec_bytes in rows: - arr = np.frombuffer(vec_bytes, dtype=np.uint8) - binary_arrays.append(arr) - - self._binary_matrix = np.vstack(binary_arrays) - self._is_memmap = False - self._loaded = True - self._embedding_dim = None - self._backend = None - self._model = None - self._model_profile = None - - logger.info( - "Loaded %d binary vectors from DB (%d bytes each)", - len(self._chunk_ids), self._binary_matrix.shape[1] - ) - return True - - except Exception as e: - logger.error("Failed to load binary vectors: %s", e) - return False - - def search( - self, - query_vector: np.ndarray, - top_k: int = 100 - ) -> List[Tuple[int, int]]: - """Search for similar vectors using Hamming distance. - - Args: - query_vector: Dense query vector (will be binarized). - top_k: Number of top results to return. - - Returns: - List of (chunk_id, hamming_distance) tuples sorted by distance. 
- """ - if not self._loaded and not self.load(): - return [] - - # Binarize query vector - query_binary = (query_vector > 0).astype(np.uint8) - query_packed = np.packbits(query_binary) - - # Compute Hamming distances using XOR and popcount - # XOR gives 1 for differing bits - xor_result = np.bitwise_xor(self._binary_matrix, query_packed) - - # Vectorized popcount using lookup table (orders of magnitude faster) - # Sum the bit counts for each byte across all columns - distances = np.sum(_POPCOUNT_TABLE[xor_result], axis=1, dtype=np.int32) - - # Get top-k with smallest distances - if top_k >= len(distances): - top_indices = np.argsort(distances) - else: - # Partial sort for efficiency - top_indices = np.argpartition(distances, top_k)[:top_k] - top_indices = top_indices[np.argsort(distances[top_indices])] - - results = [ - (int(self._chunk_ids[i]), int(distances[i])) - for i in top_indices - ] - - return results - - def search_with_rerank( - self, - query_dense: np.ndarray, - dense_vectors: np.ndarray, - dense_chunk_ids: np.ndarray, - top_k: int = 10, - candidates: int = 100 - ) -> List[Tuple[int, float]]: - """Two-stage cascade search: binary filter + dense rerank. - - Args: - query_dense: Dense query vector. - dense_vectors: Dense vectors for reranking (from HNSW or stored). - dense_chunk_ids: Chunk IDs corresponding to dense_vectors. - top_k: Final number of results. - candidates: Number of candidates from binary search. - - Returns: - List of (chunk_id, cosine_similarity) tuples. 
- """ - # Stage 1: Binary filtering - binary_results = self.search(query_dense, top_k=candidates) - if not binary_results: - return [] - - candidate_ids = {r[0] for r in binary_results} - - # Stage 2: Dense reranking - # Find indices of candidates in dense_vectors - candidate_mask = np.isin(dense_chunk_ids, list(candidate_ids)) - candidate_indices = np.where(candidate_mask)[0] - - if len(candidate_indices) == 0: - # Fallback: return binary results with normalized distance - max_dist = max(r[1] for r in binary_results) if binary_results else 1 - return [(r[0], 1.0 - r[1] / max_dist) for r in binary_results[:top_k]] - - # Compute cosine similarities for candidates - candidate_vectors = dense_vectors[candidate_indices] - candidate_ids_array = dense_chunk_ids[candidate_indices] - - # Normalize vectors - query_norm = query_dense / (np.linalg.norm(query_dense) + 1e-8) - cand_norms = candidate_vectors / ( - np.linalg.norm(candidate_vectors, axis=1, keepdims=True) + 1e-8 - ) - - # Cosine similarities - similarities = np.dot(cand_norms, query_norm) - - # Sort by similarity (descending) - sorted_indices = np.argsort(-similarities)[:top_k] - - results = [ - (int(candidate_ids_array[i]), float(similarities[i])) - for i in sorted_indices - ] - - return results - - @property - def vector_count(self) -> int: - """Get number of loaded binary vectors.""" - return len(self._chunk_ids) if self._chunk_ids is not None else 0 - - @property - def embedding_dim(self) -> Optional[int]: - """Embedding dimension used to build these binary vectors (if known).""" - return int(self._embedding_dim) if self._embedding_dim is not None else None - - @property - def backend(self) -> Optional[str]: - """Embedding backend used to build these vectors (if known).""" - return self._backend - - @property - def model(self) -> Optional[str]: - """Embedding model name used to build these vectors (if known).""" - return self._model - - @property - def model_profile(self) -> Optional[str]: - """Embedding 
profile name (fastembed) used to build these vectors (if known).""" - return self._model_profile - - @property - def is_memmap(self) -> bool: - """Check if using memory-mapped file (vs in-memory array).""" - return self._is_memmap - - def clear(self) -> None: - """Clear loaded vectors from memory.""" - # For memmap, just delete the reference (OS will handle cleanup) - if self._is_memmap and self._binary_matrix is not None: - del self._binary_matrix - self._chunk_ids = None - self._binary_matrix = None - self._is_memmap = False - self._loaded = False diff --git a/codex-lens/src/codexlens/search/chain_search.py b/codex-lens/src/codexlens/search/chain_search.py deleted file mode 100644 index c269af66..00000000 --- a/codex-lens/src/codexlens/search/chain_search.py +++ /dev/null @@ -1,4779 +0,0 @@ -"""Chain search engine for recursive multi-directory searching. - -Provides parallel search across directory hierarchies using indexed _index.db files. -Supports depth-limited traversal, result aggregation, and symbol search. 
-""" - -from __future__ import annotations - -from concurrent.futures import ThreadPoolExecutor, as_completed -from dataclasses import dataclass, field, replace -from pathlib import Path -from typing import List, Optional, Dict, Any, Literal, Tuple, TYPE_CHECKING -import json -import logging -import os -import threading -import time - -from codexlens.entities import SearchResult, Symbol - -if TYPE_CHECKING: - import numpy as np - -try: - import numpy as np - NUMPY_AVAILABLE = True -except ImportError: - NUMPY_AVAILABLE = False -from codexlens.config import Config -from codexlens.storage.registry import RegistryStore, DirMapping -from codexlens.storage.dir_index import DirIndexStore, SubdirLink -from codexlens.storage.global_index import GlobalSymbolIndex -from codexlens.storage.index_filters import is_ignored_index_path -from codexlens.storage.path_mapper import PathMapper -from codexlens.storage.sqlite_store import SQLiteStore -from codexlens.storage.vector_meta_store import VectorMetadataStore -from codexlens.config import ( - BINARY_VECTORS_MMAP_NAME, - VECTORS_HNSW_NAME, - VECTORS_META_DB_NAME, -) -from codexlens.search.hybrid_search import HybridSearchEngine -from codexlens.search.ranking import query_prefers_lexical_search - -SEARCH_ARTIFACT_DIRS = frozenset({ - "dist", - "build", - "out", - "target", - "bin", - "obj", - "_build", - "coverage", - "htmlcov", - ".cache", - ".parcel-cache", - ".turbo", - ".next", - ".nuxt", - "node_modules", - "bower_components", -}) - - -@dataclass -class SearchOptions: - """Configuration options for chain search. 
- - Attributes: - depth: Maximum search depth (-1 = unlimited, 0 = current dir only) - max_workers: Number of parallel worker threads - limit_per_dir: Maximum results per directory - total_limit: Total result limit across all directories - offset: Pagination offset - skip first N results (default 0) - include_symbols: Whether to include symbol search results - files_only: Return only file paths without excerpts - include_semantic: Whether to include semantic keyword search results - code_only: Only return code files (excludes md, txt, json, yaml, xml, etc.) - exclude_extensions: List of file extensions to exclude (e.g., ["md", "txt", "json"]) - hybrid_mode: Enable hybrid search with RRF fusion (default False) - enable_fuzzy: Enable fuzzy FTS in hybrid mode (default True) - enable_vector: Enable vector semantic search (default False) - pure_vector: If True, only use vector search without FTS fallback (default False) - enable_cascade: Enable cascade (binary+dense) two-stage retrieval (default False) - hybrid_weights: Custom RRF weights for hybrid search (optional) - group_results: Enable grouping of similar results (default False) - grouping_threshold: Score threshold for grouping similar results (default 0.01) - inject_feature_anchors: Whether to inject lexical feature anchors (default True) - """ - depth: int = -1 - max_workers: int = 8 - limit_per_dir: int = 10 - total_limit: int = 100 - offset: int = 0 - include_symbols: bool = False - files_only: bool = False - include_semantic: bool = False - code_only: bool = False - exclude_extensions: Optional[List[str]] = None - hybrid_mode: bool = False - enable_fuzzy: bool = True - enable_vector: bool = False - pure_vector: bool = False - enable_cascade: bool = False - hybrid_weights: Optional[Dict[str, float]] = None - group_results: bool = False - grouping_threshold: float = 0.01 - inject_feature_anchors: bool = True - - -@dataclass -class SearchStats: - """Statistics collected during search execution. 
- - Attributes: - dirs_searched: Number of directories searched - files_matched: Number of files with matches - time_ms: Total search time in milliseconds - errors: List of error messages encountered - """ - dirs_searched: int = 0 - files_matched: int = 0 - time_ms: float = 0 - errors: List[str] = field(default_factory=list) - - -@dataclass -class ChainSearchResult: - """Comprehensive search result with metadata. - - Attributes: - query: Original search query - results: List of SearchResult objects - related_results: Expanded results from graph neighbors (optional) - symbols: List of Symbol objects (if include_symbols=True) - stats: SearchStats with execution metrics - """ - query: str - results: List[SearchResult] - symbols: List[Symbol] - stats: SearchStats - related_results: List[SearchResult] = field(default_factory=list) - - -@dataclass -class ReferenceResult: - """Result from reference search in code_relationships table. - - Attributes: - file_path: Path to the file containing the reference - line: Line number where the reference occurs (1-based) - column: Column number where the reference occurs (0-based) - context: Surrounding code snippet for context - relationship_type: Type of relationship (call, import, inheritance, etc.) - """ - file_path: str - line: int - column: int - context: str - relationship_type: str - - -class ChainSearchEngine: - """Parallel chain search engine for hierarchical directory indexes. - - Searches across multiple directory indexes in parallel, following subdirectory - links to recursively traverse the file tree. Supports depth limits, result - aggregation, and both content and symbol searches. - - Thread-safe with configurable parallelism. - - Attributes: - registry: Global project registry - mapper: Path mapping utility - logger: Python logger instance - """ - - def __init__(self, - registry: RegistryStore, - mapper: PathMapper, - max_workers: int = 8, - config: Config | None = None): - """Initialize chain search engine. 
- - Args: - registry: Global project registry for path lookups - mapper: Path mapper for source/index conversions - max_workers: Maximum parallel workers (default 8) - """ - self.registry = registry - self.mapper = mapper - self.logger = logging.getLogger(__name__) - self._max_workers = max_workers - self._executor: Optional[ThreadPoolExecutor] = None - self._config = config - self._realtime_lsp_keepalive_lock = threading.RLock() - self._realtime_lsp_keepalive = None - self._realtime_lsp_keepalive_key = None - self._runtime_cache_lock = threading.RLock() - self._dense_ann_cache: Dict[Tuple[str, int], Any] = {} - self._legacy_dense_ann_cache: Dict[Tuple[str, int], Any] = {} - self._reranker_cache_key: Optional[Tuple[str, Optional[str], bool, Optional[int]]] = None - self._reranker_instance: Any = None - # Track which (workspace_root, config_file) pairs have already been warmed up. - # This avoids paying the warmup sleep on every query when using keep-alive LSP servers. - self._realtime_lsp_warmed_ids: set[tuple[str, str | None]] = set() - - def _get_executor(self, max_workers: Optional[int] = None) -> ThreadPoolExecutor: - """Get or create the shared thread pool executor. - - Lazy initialization to avoid creating executor if never used. 
- - Args: - max_workers: Override default max_workers if specified - - Returns: - ThreadPoolExecutor instance - """ - workers = max_workers or self._max_workers - if self._executor is None: - self._executor = ThreadPoolExecutor(max_workers=workers) - return self._executor - - def close(self) -> None: - """Shutdown the thread pool executor.""" - if self._executor is not None: - self._executor.shutdown(wait=True) - self._executor = None - self._clear_runtime_caches() - with self._realtime_lsp_keepalive_lock: - keepalive = self._realtime_lsp_keepalive - self._realtime_lsp_keepalive = None - self._realtime_lsp_keepalive_key = None - if keepalive is not None: - try: - keepalive.stop() - except Exception: - pass - - def __enter__(self) -> "ChainSearchEngine": - """Context manager entry.""" - return self - - def __exit__(self, exc_type: object, exc: object, tb: object) -> None: - """Context manager exit.""" - self.close() - - @staticmethod - def _release_cached_resource(resource: Any) -> None: - """Best-effort cleanup for cached runtime helpers.""" - if resource is None: - return - for attr_name in ("clear", "close"): - cleanup = getattr(resource, attr_name, None) - if callable(cleanup): - try: - cleanup() - except Exception: - pass - break - - def _clear_runtime_caches(self) -> None: - """Drop per-engine runtime caches for dense indexes and rerankers.""" - with self._runtime_cache_lock: - dense_indexes = list(self._dense_ann_cache.values()) - legacy_dense_indexes = list(self._legacy_dense_ann_cache.values()) - reranker = self._reranker_instance - self._dense_ann_cache = {} - self._legacy_dense_ann_cache = {} - self._reranker_cache_key = None - self._reranker_instance = None - - for resource in [*dense_indexes, *legacy_dense_indexes, reranker]: - self._release_cached_resource(resource) - - def _get_cached_centralized_dense_index(self, index_root: Path, dim: int) -> Optional[Any]: - """Load and cache a centralized dense ANN index for repeated queries.""" - from 
codexlens.semantic.ann_index import ANNIndex - - resolved_root = Path(index_root).resolve() - cache_key = (str(resolved_root), int(dim)) - with self._runtime_cache_lock: - cached = self._dense_ann_cache.get(cache_key) - if cached is not None: - return cached - - ann_index = ANNIndex.create_central(index_root=resolved_root, dim=int(dim)) - if not ann_index.load() or ann_index.count() == 0: - return None - - with self._runtime_cache_lock: - cached = self._dense_ann_cache.get(cache_key) - if cached is None: - self._dense_ann_cache[cache_key] = ann_index - cached = ann_index - return cached - - def _get_cached_legacy_dense_index(self, index_path: Path, dim: int) -> Optional[Any]: - """Load and cache a legacy per-index dense ANN index for repeated queries.""" - from codexlens.semantic.ann_index import ANNIndex - - resolved_path = Path(index_path).resolve() - cache_key = (str(resolved_path), int(dim)) - with self._runtime_cache_lock: - cached = self._legacy_dense_ann_cache.get(cache_key) - if cached is not None: - return cached - - ann_index = ANNIndex(resolved_path, dim=int(dim)) - if not ann_index.load() or ann_index.count() == 0: - return None - - with self._runtime_cache_lock: - cached = self._legacy_dense_ann_cache.get(cache_key) - if cached is None: - self._legacy_dense_ann_cache[cache_key] = ann_index - cached = ann_index - return cached - - def _get_cached_reranker(self) -> Any: - """Return a cached reranker instance for repeated cascade queries.""" - try: - from codexlens.semantic.reranker import ( - check_reranker_available, - get_reranker, - ) - except ImportError as exc: - self.logger.debug("Reranker not available: %s", exc) - return None - except Exception as exc: - self.logger.debug("Failed to import reranker factory: %s", exc) - return None - - backend = "onnx" - model_name = None - use_gpu = True - max_tokens = None - - if self._config is not None: - backend = getattr(self._config, "reranker_backend", "onnx") or "onnx" - model_name = getattr(self._config, 
"reranker_model", None) - use_gpu = getattr( - self._config, - "reranker_use_gpu", - getattr(self._config, "embedding_use_gpu", True), - ) - max_tokens = getattr(self._config, "reranker_max_input_tokens", None) - - cache_key = ( - str(backend).strip().lower(), - str(model_name).strip() if isinstance(model_name, str) and model_name.strip() else None, - bool(use_gpu), - int(max_tokens) if isinstance(max_tokens, (int, float)) else None, - ) - with self._runtime_cache_lock: - cached = ( - self._reranker_instance - if self._reranker_instance is not None and self._reranker_cache_key == cache_key - else None - ) - if cached is not None: - return cached - - ok, err = check_reranker_available(cache_key[0]) - if not ok: - self.logger.debug("Reranker backend unavailable (%s): %s", cache_key[0], err) - return None - - kwargs: Dict[str, Any] = {} - device = None - if cache_key[0] == "onnx": - kwargs["use_gpu"] = cache_key[2] - elif cache_key[0] == "api": - if cache_key[3] is not None: - kwargs["max_input_tokens"] = cache_key[3] - elif not cache_key[2]: - device = "cpu" - - try: - reranker = get_reranker( - backend=cache_key[0], - model_name=cache_key[1], - device=device, - **kwargs, - ) - except Exception as exc: - self.logger.debug("Failed to initialize reranker: %s", exc) - return None - - previous = None - with self._runtime_cache_lock: - cached = ( - self._reranker_instance - if self._reranker_instance is not None and self._reranker_cache_key == cache_key - else None - ) - if cached is not None: - reranker = cached - else: - previous = self._reranker_instance - self._reranker_cache_key = cache_key - self._reranker_instance = reranker - - if previous is not None and previous is not reranker: - self._release_cached_resource(previous) - return reranker - - def search(self, query: str, - source_path: Path, - options: Optional[SearchOptions] = None) -> ChainSearchResult: - """Execute chain search from source_path with recursive traversal. - - Process: - 1. 
Locate starting index for source_path - 2. Collect all child indexes based on depth limit - 3. Search indexes in parallel using ThreadPoolExecutor - 4. Aggregate, deduplicate, and rank results - - Args: - query: FTS5 search query string - source_path: Starting directory path - options: Search configuration (uses defaults if None) - - Returns: - ChainSearchResult with results, symbols, and statistics - - Examples: - >>> engine = ChainSearchEngine(registry, mapper) - >>> result = engine.search("authentication", Path("D:/project/src")) - >>> for r in result.results[:5]: - ... print(f"{r.path}: {r.score:.2f}") - """ - options = options or SearchOptions() - effective_options = options - if options.hybrid_mode and query_prefers_lexical_search(query): - self.logger.debug( - "Hybrid shortcut: using lexical search path for lexical-priority query %r", - query, - ) - effective_options = replace( - options, - hybrid_mode=False, - enable_vector=False, - pure_vector=False, - enable_cascade=False, - hybrid_weights=None, - enable_fuzzy=True, - ) - start_time = time.time() - stats = SearchStats() - - # Step 1: Find starting index - start_index = self._find_start_index(source_path) - if not start_index: - self.logger.warning(f"No index found for {source_path}") - stats.time_ms = (time.time() - start_time) * 1000 - return ChainSearchResult( - query=query, - results=[], - symbols=[], - stats=stats - ) - - # Step 2: Collect all index paths to search - index_paths = self._collect_index_paths(start_index, effective_options.depth) - stats.dirs_searched = len(index_paths) - - if not index_paths: - self.logger.warning(f"No indexes collected from {start_index}") - stats.time_ms = (time.time() - start_time) * 1000 - return ChainSearchResult( - query=query, - results=[], - symbols=[], - stats=stats - ) - - # Step 3: Parallel search - results, search_stats = self._search_parallel( - index_paths, query, effective_options - ) - stats.errors = search_stats.errors - - # Step 3.5: Filter by 
extension if requested - if effective_options.code_only or effective_options.exclude_extensions: - results = self._filter_by_extension( - results, effective_options.code_only, effective_options.exclude_extensions - ) - - if effective_options.inject_feature_anchors: - results = self._inject_query_feature_anchors( - query, - source_path, - effective_options, - results, - limit=min(6, max(2, effective_options.total_limit)), - ) - - # Step 4: Merge and rank - final_results = self._merge_and_rank( - results, - effective_options.total_limit, - effective_options.offset, - query=query, - ) - - # Step 5: Optional grouping of similar results - if effective_options.group_results: - from codexlens.search.ranking import group_similar_results - final_results = group_similar_results( - final_results, score_threshold_abs=effective_options.grouping_threshold - ) - - stats.files_matched = len(final_results) - - # Optional: Symbol search - symbols = [] - if effective_options.include_symbols: - symbols = self._search_symbols_parallel( - index_paths, query, None, effective_options.total_limit - ) - - # Optional: graph expansion using precomputed neighbors - related_results: List[SearchResult] = [] - if self._config is not None and getattr(self._config, "enable_graph_expansion", False): - try: - from codexlens.search.enrichment import SearchEnrichmentPipeline - - pipeline = SearchEnrichmentPipeline(self.mapper, config=self._config) - related_results = pipeline.expand_related_results(final_results) - except Exception as exc: - self.logger.debug("Graph expansion failed: %s", exc) - related_results = [] - - stats.time_ms = (time.time() - start_time) * 1000 - - return ChainSearchResult( - query=query, - results=final_results, - symbols=symbols, - stats=stats, - related_results=related_results, - ) - - def binary_cascade_search( - self, - query: str, - source_path: Path, - k: int = 10, - coarse_k: int = 100, - options: Optional[SearchOptions] = None, - ) -> ChainSearchResult: - """Execute 
binary cascade search with binary coarse ranking and dense fine ranking. - - Binary cascade search process: - 1. Stage 1 (Coarse): Fast binary vector search using Hamming distance - to quickly filter to coarse_k candidates (256-dim binary, 32 bytes/vector) - 2. Stage 2 (Fine): Dense vector cosine similarity for precise reranking - of candidates (2048-dim float32) - - This approach leverages the speed of binary search (~100x faster) while - maintaining precision through dense vector reranking. - - Performance characteristics: - - Binary search: O(N) with SIMD-accelerated XOR + popcount - - Dense rerank: Only applied to top coarse_k candidates - - Memory: 32 bytes (binary) + 8KB (dense) per chunk - - Args: - query: Natural language or keyword query string - source_path: Starting directory path - k: Number of final results to return (default 10) - coarse_k: Number of coarse candidates from first stage (default 100) - options: Search configuration (uses defaults if None) - - Returns: - ChainSearchResult with reranked results and statistics - - Examples: - >>> engine = ChainSearchEngine(registry, mapper, config=config) - >>> result = engine.binary_cascade_search( - ... "how to authenticate users", - ... Path("D:/project/src"), - ... k=10, - ... coarse_k=100 - ... ) - >>> for r in result.results: - ... 
print(f"{r.path}: {r.score:.3f}") - """ - if not NUMPY_AVAILABLE: - self.logger.warning( - "NumPy not available, falling back to standard search" - ) - return self.search(query, source_path, options=options) - - options = options or SearchOptions() - start_time = time.time() - stats = SearchStats() - - # Use config defaults if available - if self._config is not None: - if hasattr(self._config, "cascade_coarse_k"): - coarse_k = coarse_k or self._config.cascade_coarse_k - if hasattr(self._config, "cascade_fine_k"): - k = k or self._config.cascade_fine_k - - # Step 1: Find starting index - start_index = self._find_start_index(source_path) - if not start_index: - self.logger.warning(f"No index found for {source_path}") - stats.time_ms = (time.time() - start_time) * 1000 - return ChainSearchResult( - query=query, - results=[], - symbols=[], - stats=stats - ) - - # Step 2: Collect all index paths - index_paths = self._collect_index_paths(start_index, options.depth) - stats.dirs_searched = len(index_paths) - - if not index_paths: - self.logger.warning(f"No indexes collected from {start_index}") - stats.time_ms = (time.time() - start_time) * 1000 - return ChainSearchResult( - query=query, - results=[], - symbols=[], - stats=stats - ) - - # Stage 1: Binary vector coarse retrieval - self.logger.debug( - "Binary Cascade Stage 1: Binary coarse retrieval for %d candidates", - coarse_k, - ) - - coarse_candidates, used_centralized, _, stage2_index_root = self._collect_binary_coarse_candidates( - query, - index_paths, - coarse_k, - stats, - index_root=index_paths[0].parent if index_paths else None, - ) - - if not coarse_candidates: - self.logger.debug("No binary candidates found, falling back to standard search") - return self.search(query, source_path, options=options) - - self.logger.debug( - "Binary Cascade Stage 1 complete: %d candidates retrieved", - len(coarse_candidates), - ) - - # Stage 2: Dense vector fine ranking - self.logger.debug( - "Binary Cascade Stage 2: Dense 
reranking %d candidates to top-%d", - len(coarse_candidates), - k, - ) - - # Group candidates by index path for batch retrieval - candidates_by_index: Dict[Path, List[int]] = {} - for chunk_id, _, index_path in coarse_candidates: - if index_path not in candidates_by_index: - candidates_by_index[index_path] = [] - candidates_by_index[index_path].append(chunk_id) - - # Retrieve dense embeddings and compute cosine similarity - scored_results: List[Tuple[float, SearchResult]] = [] - import sqlite3 - dense_query_cache: Dict[Tuple[str, str, bool], "np.ndarray"] = {} - dense_query_errors: list[str] = [] - - for index_path, chunk_ids in candidates_by_index.items(): - try: - query_index_root = index_path if used_centralized else index_path.parent - query_dense = self._embed_dense_query( - query, - index_root=query_index_root, - query_cache=dense_query_cache, - ) - - # Collect valid rows and dense vectors for batch processing - valid_rows: List[Dict[str, Any]] = [] - dense_vectors: List["np.ndarray"] = [] - - if used_centralized: - # Centralized mode: index_path is actually index_root directory - # Dense embeddings are in per-directory _index.db files - # referenced by source_index_db in chunk_metadata - meta_db_path = index_path / VECTORS_META_DB_NAME - if not meta_db_path.exists(): - self.logger.debug( - "VectorMetadataStore not found at %s, skipping dense reranking", meta_db_path - ) - continue - - # Get chunk metadata with source_index_db references - meta_store = VectorMetadataStore(meta_db_path) - chunks_meta = meta_store.get_chunks_by_ids(chunk_ids) - - # Group chunks by source_index_db - chunks_by_source: Dict[str, List[Dict[str, Any]]] = {} - for chunk in chunks_meta: - source_db = chunk.get("source_index_db") - if source_db: - if source_db not in chunks_by_source: - chunks_by_source[source_db] = [] - chunks_by_source[source_db].append(chunk) - - # Retrieve dense embeddings from each source_index_db - for source_db, source_chunks in chunks_by_source.items(): - try: 
- source_chunk_ids = [c["chunk_id"] for c in source_chunks] - conn = sqlite3.connect(source_db) - conn.row_factory = sqlite3.Row - - placeholders = ",".join("?" * len(source_chunk_ids)) - # Try semantic_chunks first (newer schema), fall back to chunks - try: - rows = conn.execute( - f"SELECT id, embedding_dense FROM semantic_chunks WHERE id IN ({placeholders})", - source_chunk_ids - ).fetchall() - except sqlite3.OperationalError: - rows = conn.execute( - f"SELECT id, embedding_dense FROM chunks WHERE id IN ({placeholders})", - source_chunk_ids - ).fetchall() - conn.close() - - # Build dense vector lookup - dense_lookup = {row["id"]: row["embedding_dense"] for row in rows} - - # Process chunks with their embeddings - for chunk in source_chunks: - chunk_id = chunk["chunk_id"] - dense_bytes = dense_lookup.get(chunk_id) - if dense_bytes is not None: - valid_rows.append({ - "id": chunk_id, - "file_path": chunk["file_path"], - "content": chunk["content"], - }) - dense_vectors.append(np.frombuffer(dense_bytes, dtype=np.float32)) - except Exception as exc: - self.logger.debug( - "Failed to get dense embeddings from %s: %s", source_db, exc - ) - else: - # Per-directory mode: index_path is the _index.db file - conn = sqlite3.connect(str(index_path)) - conn.row_factory = sqlite3.Row - - placeholders = ",".join("?" 
* len(chunk_ids)) - rows = conn.execute( - f"SELECT id, file_path, content, embedding_dense FROM semantic_chunks WHERE id IN ({placeholders})", - chunk_ids - ).fetchall() - conn.close() - - for row in rows: - dense_bytes = row["embedding_dense"] - if dense_bytes is not None: - valid_rows.append(dict(row)) - dense_vectors.append(np.frombuffer(dense_bytes, dtype=np.float32)) - - # Skip if no dense embeddings found - if not dense_vectors: - continue - - # Stack into matrix for batch computation - doc_matrix = np.vstack(dense_vectors) - - # Batch compute cosine similarities - scores = self._compute_cosine_similarity_batch(query_dense, doc_matrix) - - # Create search results - for i, row in enumerate(valid_rows): - score = float(scores[i]) - excerpt = (row.get("content") or "")[:500] - result = SearchResult( - path=row.get("file_path") or "", - score=score, - excerpt=excerpt, - ) - scored_results.append((score, result)) - - except Exception as exc: - self.logger.debug( - "Dense reranking failed for %s: %s", index_path, exc - ) - stats.errors.append(f"Dense reranking failed for {index_path}: {exc}") - dense_query_errors.append(str(exc)) - - if not scored_results: - if dense_query_errors: - self.logger.warning( - "Failed to generate dense query embeddings for binary cascade: %s. 
" - "Using Hamming distance scores only.", - dense_query_errors[0], - ) - final_results = self._materialize_binary_candidates( - coarse_candidates[:k], - stats, - stage2_index_root=stage2_index_root, - ) - stats.files_matched = len(final_results) - stats.time_ms = (time.time() - start_time) * 1000 - return ChainSearchResult( - query=query, - results=final_results, - symbols=[], - stats=stats, - ) - - # Sort by score descending and deduplicate by path - scored_results.sort(key=lambda x: x[0], reverse=True) - - path_to_result: Dict[str, SearchResult] = {} - for score, result in scored_results: - if result.path not in path_to_result: - path_to_result[result.path] = result - - final_results = self._apply_default_path_penalties( - query, - list(path_to_result.values()), - )[:k] - - # Optional: grouping of similar results - if options.group_results: - from codexlens.search.ranking import group_similar_results - final_results = group_similar_results( - final_results, score_threshold_abs=options.grouping_threshold - ) - - stats.files_matched = len(final_results) - stats.time_ms = (time.time() - start_time) * 1000 - - self.logger.debug( - "Binary cascade search complete: %d results in %.2fms", - len(final_results), - stats.time_ms, - ) - - return ChainSearchResult( - query=query, - results=final_results, - symbols=[], - stats=stats, - ) - - def cascade_search( - self, - query: str, - source_path: Path, - k: int = 10, - coarse_k: int = 100, - options: Optional[SearchOptions] = None, - strategy: Optional[Literal["binary", "binary_rerank", "dense_rerank", "staged", "hybrid"]] = None, - ) -> ChainSearchResult: - """Unified cascade search entry point with strategy selection. 
- - Provides a single interface for cascade search with configurable strategy: - - "binary": Uses binary vector coarse ranking + dense fine ranking (fastest) - - "binary_rerank": Uses binary vector coarse ranking + cross-encoder reranking (best balance) - - "hybrid": Alias for "binary_rerank" (backward compat) - - "dense_rerank": Uses dense vector coarse ranking + cross-encoder reranking - - "staged": 4-stage pipeline: binary -> LSP expand -> clustering -> optional rerank - - The strategy is determined with the following priority: - 1. The `strategy` parameter (e.g., from CLI --cascade-strategy option) - 2. Config `cascade_strategy` setting from settings.json - 3. Default: "binary" - - Args: - query: Natural language or keyword query string - source_path: Starting directory path - k: Number of final results to return (default 10) - coarse_k: Number of coarse candidates from first stage (default 100) - options: Search configuration (uses defaults if None) - strategy: Cascade strategy - "binary", "binary_rerank", "dense_rerank", or "staged". 
- - Returns: - ChainSearchResult with reranked results and statistics - - Examples: - >>> engine = ChainSearchEngine(registry, mapper, config=config) - >>> # Use binary cascade (default, fastest) - >>> result = engine.cascade_search("auth", Path("D:/project")) - >>> # Use binary + cross-encoder (best balance of speed and quality) - >>> result = engine.cascade_search("auth", Path("D:/project"), strategy="binary_rerank") - >>> # Use 4-stage pipeline (binary + LSP expand + clustering + optional rerank) - >>> result = engine.cascade_search("auth", Path("D:/project"), strategy="staged") - """ - # Strategy priority: parameter > config > default - effective_strategy = strategy - valid_strategies = ("binary", "binary_rerank", "dense_rerank", "staged", "hybrid") - if effective_strategy is None: - # Not passed via parameter, check config - if self._config is not None: - config_strategy = getattr(self._config, "cascade_strategy", None) - if config_strategy in valid_strategies: - effective_strategy = config_strategy - - # If still not set, apply default - if effective_strategy not in valid_strategies: - effective_strategy = "binary" - - # Normalize backward-compat alias - if effective_strategy == "hybrid": - effective_strategy = "binary_rerank" - - if effective_strategy == "binary": - return self.binary_cascade_search(query, source_path, k, coarse_k, options) - elif effective_strategy == "binary_rerank": - return self.binary_rerank_cascade_search(query, source_path, k, coarse_k, options) - elif effective_strategy == "dense_rerank": - return self.dense_rerank_cascade_search(query, source_path, k, coarse_k, options) - elif effective_strategy == "staged": - return self.staged_cascade_search(query, source_path, k, coarse_k, options) - else: - return self.binary_cascade_search(query, source_path, k, coarse_k, options) - - def staged_cascade_search( - self, - query: str, - source_path: Path, - k: int = 10, - coarse_k: int = 100, - options: Optional[SearchOptions] = None, - ) -> 
ChainSearchResult: - """Execute 4-stage cascade search pipeline with binary, LSP expansion, clustering, and optional reranking. - - Staged cascade search process: - 1. Stage 1 (Binary Coarse): Fast binary vector search using Hamming distance - to quickly filter to coarse_k candidates (256-bit binary vectors) - 2. Stage 2 (LSP Expansion): Expand coarse candidates using GraphExpander to - include related symbols (definitions, references, callers/callees) - 3. Stage 3 (Clustering): Use configurable clustering strategy to group similar - results and select representative results from each cluster - 4. Stage 4 (Optional Rerank): If config.enable_staged_rerank is True, apply - cross-encoder reranking for final precision - - This approach combines the speed of binary search with graph-based context - expansion and diversity-preserving clustering for high-quality results. - - Performance characteristics: - - Stage 1: O(N) binary search with SIMD acceleration (~8ms) - - Stage 2: O(k * d) graph traversal where d is expansion depth - - Stage 3: O(n^2) clustering on expanded candidates - - Stage 4: Optional cross-encoder reranking (API call) - - Args: - query: Natural language or keyword query string - source_path: Starting directory path - k: Number of final results to return (default 10) - coarse_k: Number of coarse candidates from first stage (default 100) - options: Search configuration (uses defaults if None) - - Returns: - ChainSearchResult with per-stage statistics - - Examples: - >>> engine = ChainSearchEngine(registry, mapper, config=config) - >>> result = engine.staged_cascade_search( - ... "authentication handler", - ... Path("D:/project/src"), - ... k=10, - ... coarse_k=100 - ... ) - >>> for r in result.results: - ... 
print(f"{r.path}: {r.score:.3f}") - """ - if not NUMPY_AVAILABLE: - self.logger.warning( - "NumPy not available, falling back to standard search" - ) - return self.search(query, source_path, options=options) - - options = options or SearchOptions() - start_time = time.time() - stats = SearchStats() - - # Per-stage timing stats - stage_times: Dict[str, float] = {} - stage_counts: Dict[str, int] = {} - - # Use config defaults if available - if self._config is not None: - if hasattr(self._config, "cascade_coarse_k"): - coarse_k = coarse_k or self._config.cascade_coarse_k - if hasattr(self._config, "cascade_fine_k"): - k = k or self._config.cascade_fine_k - - # Step 1: Find starting index - start_index = self._find_start_index(source_path) - if not start_index: - self.logger.warning(f"No index found for {source_path}") - stats.time_ms = (time.time() - start_time) * 1000 - return ChainSearchResult( - query=query, - results=[], - symbols=[], - stats=stats - ) - - # Step 2: Collect all index paths - index_paths = self._collect_index_paths(start_index, options.depth) - stats.dirs_searched = len(index_paths) - - if not index_paths: - self.logger.warning(f"No indexes collected from {start_index}") - stats.time_ms = (time.time() - start_time) * 1000 - return ChainSearchResult( - query=query, - results=[], - symbols=[], - stats=stats - ) - - # ========== Stage 1: Binary Coarse Search ========== - stage1_start = time.time() - coarse_results, index_root = self._stage1_binary_search( - query, - index_paths, - coarse_k, - stats, - index_root=start_index.parent, - ) - coarse_results = self._inject_query_feature_anchors( - query, - source_path, - options, - coarse_results, - limit=min(6, max(2, k)), - ) - stage_times["stage1_binary_ms"] = (time.time() - stage1_start) * 1000 - stage_counts["stage1_candidates"] = len(coarse_results) - stage_counts["stage1_feature_anchors"] = sum( - 1 - for result in coarse_results - if (result.metadata or {}).get("feature_query_anchor") - ) - - 
self.logger.debug( - "Staged Stage 1: Binary search found %d candidates in %.2fms", - len(coarse_results), stage_times["stage1_binary_ms"] - ) - - if not coarse_results: - # Keep the staged pipeline running even when Stage 1 yields no candidates. - # This makes "realtime LSP graph → clustering → rerank" comparable across queries. - self.logger.debug( - "No Stage 1 candidates found; seeding staged pipeline with FTS results" - ) - stage1_fallback_start = time.time() - try: - seed_opts = SearchOptions( - depth=options.depth, - max_workers=options.max_workers, - limit_per_dir=max(10, int(coarse_k)), - total_limit=int(coarse_k), - include_symbols=True, - enable_vector=False, - hybrid_mode=False, - enable_cascade=False, - ) - seed = self.search(query, source_path, options=seed_opts) - coarse_results = list(seed.results or [])[: int(coarse_k)] - stage_counts["stage1_fallback_used"] = 1 - except Exception as exc: - self.logger.debug("Stage 1 fallback seeding failed: %r", exc) - coarse_results = [] - - stage_times["stage1_fallback_search_ms"] = (time.time() - stage1_fallback_start) * 1000 - stage_counts["stage1_candidates"] = len(coarse_results) - - if not coarse_results: - return ChainSearchResult(query=query, results=[], symbols=[], stats=stats) - - # ========== Stage 2: LSP Graph Expansion ========== - stage2_start = time.time() - expanded_results = self._stage2_lsp_expand(coarse_results, index_root, query=query) - stage_times["stage2_expand_ms"] = (time.time() - stage2_start) * 1000 - stage_counts["stage2_expanded"] = len(expanded_results) - try: - stage2_unique_paths = len({(r.path or "").lower() for r in expanded_results if getattr(r, "path", None)}) - except Exception: - stage2_unique_paths = 0 - stage_counts["stage2_unique_paths"] = stage2_unique_paths - stage_counts["stage2_duplicate_paths"] = max(0, len(expanded_results) - stage2_unique_paths) - - self.logger.debug( - "Staged Stage 2: LSP expansion %d -> %d results in %.2fms", - len(coarse_results), 
len(expanded_results), stage_times["stage2_expand_ms"] - ) - - # ========== Stage 3: Clustering and Representative Selection ========== - stage3_start = time.time() - stage3_target_count = self._resolve_stage3_target_count( - k, - len(expanded_results), - ) - clustered_results = self._stage3_cluster_prune( - expanded_results, - stage3_target_count, - query=query, - ) - stage_times["stage3_cluster_ms"] = (time.time() - stage3_start) * 1000 - stage_counts["stage3_clustered"] = len(clustered_results) - stage_counts["stage3_target_count"] = stage3_target_count - if self._config is not None: - try: - stage_counts["stage3_strategy"] = str(getattr(self._config, "staged_clustering_strategy", "auto") or "auto") - except Exception: - pass - - self.logger.debug( - "Staged Stage 3: Clustering %d -> %d representatives in %.2fms", - len(expanded_results), len(clustered_results), stage_times["stage3_cluster_ms"] - ) - - # ========== Stage 4: Optional Cross-Encoder Reranking ========== - enable_rerank = False - if self._config is not None: - enable_rerank = getattr(self._config, "enable_staged_rerank", False) - - if enable_rerank: - stage4_start = time.time() - final_results = self._stage4_optional_rerank(query, clustered_results, k) - stage_times["stage4_rerank_ms"] = (time.time() - stage4_start) * 1000 - stage_counts["stage4_reranked"] = len(final_results) - - self.logger.debug( - "Staged Stage 4: Reranking %d -> %d results in %.2fms", - len(clustered_results), len(final_results), stage_times["stage4_rerank_ms"] - ) - else: - # Skip reranking, just take top-k by score - final_results = sorted( - clustered_results, key=lambda r: r.score, reverse=True - )[:k] - stage_counts["stage4_reranked"] = len(final_results) - - # Deduplicate by path (keep highest score) - path_to_result: Dict[str, SearchResult] = {} - for result in final_results: - if result.path not in path_to_result or result.score > path_to_result[result.path].score: - path_to_result[result.path] = result - - 
final_results = self._apply_default_path_penalties( - query, - list(path_to_result.values()), - )[:k] - - # Optional: grouping of similar results - if options.group_results: - from codexlens.search.ranking import group_similar_results - final_results = group_similar_results( - final_results, score_threshold_abs=options.grouping_threshold - ) - - stats.files_matched = len(final_results) - stats.time_ms = (time.time() - start_time) * 1000 - - # Add per-stage stats to errors field (as JSON for now, will be proper field later) - stage_stats_json = json.dumps({ - "stage_times": stage_times, - "stage_counts": stage_counts, - }) - stats.errors.append(f"STAGE_STATS:{stage_stats_json}") - - self.logger.debug( - "Staged cascade search complete: %d results in %.2fms " - "(stage1=%.1fms, stage2=%.1fms, stage3=%.1fms)", - len(final_results), - stats.time_ms, - stage_times.get("stage1_binary_ms", 0), - stage_times.get("stage2_expand_ms", 0), - stage_times.get("stage3_cluster_ms", 0), - ) - - return ChainSearchResult( - query=query, - results=final_results, - symbols=[], - stats=stats, - ) - - def _stage1_binary_search( - self, - query: str, - index_paths: List[Path], - coarse_k: int, - stats: SearchStats, - *, - index_root: Optional[Path] = None, - ) -> Tuple[List[SearchResult], Optional[Path]]: - """Stage 1: Binary vector coarse search using Hamming distance.""" - - coarse_candidates, _, using_dense_fallback, stage2_index_root = self._collect_binary_coarse_candidates( - query, - index_paths, - coarse_k, - stats, - index_root=index_root, - allow_dense_fallback=True, - ) - if not coarse_candidates: - return [], stage2_index_root - return self._materialize_binary_candidates( - coarse_candidates, - stats, - stage2_index_root=stage2_index_root, - using_dense_fallback=using_dense_fallback, - ), stage2_index_root - - def _stage2_lsp_expand( - self, - coarse_results: List[SearchResult], - index_root: Optional[Path], - query: Optional[str] = None, - ) -> List[SearchResult]: - """Stage 
2: LSP/graph expansion for staged cascade. - - Supports two modes via Config.staged_stage2_mode: - - "precomputed" (default): GraphExpander over per-dir `graph_neighbors` table - - "realtime": on-demand graph expansion via live LSP servers (LspBridge + LspGraphBuilder) - - Args: - coarse_results: Results from Stage 1 binary search - index_root: Root path of the index (for graph database access) - - Returns: - Combined list of original results plus expanded related results - """ - if not coarse_results or index_root is None: - return coarse_results - - try: - mode = "precomputed" - if self._config is not None: - mode = (getattr(self._config, "staged_stage2_mode", "precomputed") or "precomputed").strip().lower() - - if mode in {"realtime", "live"}: - return self._stage2_realtime_lsp_expand( - coarse_results, - index_root=index_root, - query=query, - ) - - if mode == "static_global_graph": - return self._stage2_static_global_graph_expand(coarse_results, index_root=index_root) - - return self._stage2_precomputed_graph_expand(coarse_results, index_root=index_root) - - except ImportError as exc: - self.logger.debug("GraphExpander not available: %s", exc) - return coarse_results - except Exception as exc: - self.logger.debug("Stage 2 LSP expansion failed: %s", exc) - return coarse_results - - def _stage2_precomputed_graph_expand( - self, - coarse_results: List[SearchResult], - *, - index_root: Path, - ) -> List[SearchResult]: - """Stage 2 (precomputed): expand using GraphExpander over `graph_neighbors`.""" - from codexlens.search.graph_expander import GraphExpander - - depth = 2 - if self._config is not None: - depth = getattr( - self._config, - "staged_lsp_depth", - getattr(self._config, "graph_expansion_depth", 2), - ) - try: - depth = int(depth) - except Exception: - depth = 2 - - expander = GraphExpander(self.mapper, config=self._config) - - max_expand = min(10, len(coarse_results)) - max_related = 50 - - related_results = expander.expand( - coarse_results, - 
depth=depth, - max_expand=max_expand, - max_related=max_related, - ) - - if related_results: - self.logger.debug( - "Stage 2 (precomputed) expanded %d base results to %d related symbols", - len(coarse_results), len(related_results) - ) - - return self._combine_stage2_results(coarse_results, related_results) - - def _stage2_static_global_graph_expand( - self, - coarse_results: List[SearchResult], - *, - index_root: Path, - ) -> List[SearchResult]: - """Stage 2 (static_global_graph): expand using GlobalGraphExpander over global_relationships.""" - from codexlens.search.global_graph_expander import GlobalGraphExpander - - global_db_path = index_root / GlobalSymbolIndex.DEFAULT_DB_NAME - if not global_db_path.exists(): - self.logger.debug("Global symbol DB not found at %s, skipping static graph expansion", global_db_path) - return coarse_results - - project_id = 1 - try: - for p in self.registry.list_projects(): - if p.index_root.resolve() == index_root.resolve(): - project_id = p.id - break - except Exception: - pass - - global_index = GlobalSymbolIndex(global_db_path, project_id=project_id) - global_index.initialize() - - try: - expander = GlobalGraphExpander(global_index, config=self._config) - related_results = expander.expand( - coarse_results, - top_n=min(10, len(coarse_results)), - max_related=50, - ) - - if related_results: - self.logger.debug( - "Stage 2 (static_global_graph) expanded %d base results to %d related symbols", - len(coarse_results), len(related_results), - ) - - return self._combine_stage2_results(coarse_results, related_results) - finally: - global_index.close() - - def _stage2_realtime_lsp_expand( - self, - coarse_results: List[SearchResult], - *, - index_root: Path, - query: Optional[str] = None, - ) -> List[SearchResult]: - """Stage 2 (realtime): compute expansion graph via live LSP servers.""" - import asyncio - from concurrent.futures import ThreadPoolExecutor - - from codexlens.hybrid_search.data_structures import CodeSymbolNode, Range - 
from codexlens.lsp import LspBridge, LspGraphBuilder - - max_depth = 1 - timeout_s = 30.0 - max_nodes = 50 - max_seeds = 1 - max_concurrent = 2 - warmup_s = 3.0 - resolve_symbols = False - if self._config is not None: - max_depth = int( - getattr( - self._config, - "staged_realtime_lsp_depth", - getattr(self._config, "staged_lsp_depth", 1), - ) - or 1 - ) - timeout_s = float(getattr(self._config, "staged_realtime_lsp_timeout_s", 30.0) or 30.0) - max_nodes = int(getattr(self._config, "staged_realtime_lsp_max_nodes", 50) or 50) - warmup_s = float(getattr(self._config, "staged_realtime_lsp_warmup_s", 3.0) or 0.0) - max_seeds = int(getattr(self._config, "staged_realtime_lsp_max_seeds", 1) or 1) - max_concurrent = int(getattr(self._config, "staged_realtime_lsp_max_concurrent", 2) or 2) - resolve_symbols = bool(getattr(self._config, "staged_realtime_lsp_resolve_symbols", False)) - - try: - source_root = self.mapper.index_to_source(index_root) - except Exception: - source_root = Path(coarse_results[0].path).resolve().parent - - lsp_config_file = self._find_lsp_config_file(source_root) - workspace_root = Path(source_root).resolve() - - max_expand = min(max(1, max_seeds), len(coarse_results)) - seed_nodes: List[CodeSymbolNode] = [] - seed_ids: set[str] = set() - - selected_results = list(coarse_results) - if query: - import re - - terms = { - t.lower() - for t in re.findall(r"[A-Za-z_][A-Za-z0-9_]*", query) - if t - } - - def _priority(result: SearchResult) -> float: - sym = (result.symbol_name or "").strip().lower() - stem = Path(result.path).stem.lower() if result.path else "" - score = 0.0 - if sym and sym in terms: - score += 5.0 - if sym: - score += 2.0 - if stem and stem in terms: - score += 1.0 - if result.symbol_kind: - score += 0.5 - if result.start_line: - score += 0.2 - return score - - indexed = list(enumerate(selected_results)) - indexed.sort( - key=lambda pair: ( - _priority(pair[1]), - float(pair[1].score), - -pair[0], - ), - reverse=True, - ) - 
selected_results = [r for _, r in indexed] - else: - indexed = list(enumerate(selected_results)) - indexed.sort( - key=lambda pair: ( - 1.0 if pair[1].symbol_name else 0.0, - float(pair[1].score), - -pair[0], - ), - reverse=True, - ) - selected_results = [r for _, r in indexed] - - # Prefer symbol-definition seeds when possible (improves LSP reference/call-hierarchy results). - # - # NOTE: We avoid relying purely on the stored symbol index here because its ranges may be - # imprecise in some projects. Instead, we attempt a lightweight definition-line detection - # for query identifiers within the top coarse candidate files. - if query: - try: - import re - - terms_raw = [ - t for t in re.findall(r"[A-Za-z_][A-Za-z0-9_]*", query) if t - ] - stopwords = { - "class", "def", "function", "method", "import", "from", "return", - "async", "await", "public", "private", "protected", "static", - "const", "let", "var", "new", - } - candidate_terms = [ - t for t in terms_raw - if t.lower() not in stopwords and len(t) >= 3 - ] - - candidate_terms.sort(key=len, reverse=True) - - # Candidate files (best-first): de-dupe while preserving ordering. - candidate_files: List[str] = [] - seen_files: set[str] = set() - for r in selected_results: - if r.path and r.path not in seen_files: - seen_files.add(r.path) - candidate_files.append(r.path) - if len(candidate_files) >= 50: - break - - # Also consider files whose *names* match query identifiers (helps when coarse retrieval - # misses the defining file for a symbol like `Config`). 
- try: - if source_root and candidate_terms: - allow_suffix = {".py", ".ts", ".tsx", ".js", ".jsx"} - name_terms = [t.lower() for t in candidate_terms[:3]] - for dirpath, _, filenames in os.walk(source_root): - for filename in filenames: - suffix = Path(filename).suffix.lower() - if suffix not in allow_suffix: - continue - lowered = filename.lower() - if any(t in lowered for t in name_terms): - fp = str(Path(dirpath) / filename) - if fp not in seen_files: - seen_files.add(fp) - candidate_files.append(fp) - if len(candidate_files) >= 120: - break - except Exception: - pass - - for term in candidate_terms[:5]: - if len(seed_nodes) >= max_expand: - break - - escaped = re.escape(term) - py_class = re.compile(rf"^\s*class\s+{escaped}\b") - py_def = re.compile(rf"^\s*(?:async\s+)?def\s+{escaped}\b") - ts_class = re.compile(rf"^\s*(?:export\s+)?class\s+{escaped}\b") - ts_func = re.compile(rf"^\s*(?:export\s+)?(?:async\s+)?function\s+{escaped}\b") - - for file_path in candidate_files: - if len(seed_nodes) >= max_expand: - break - suffix = Path(file_path).suffix.lower() - if suffix not in {".py", ".ts", ".tsx", ".js", ".jsx"}: - continue - - try: - lines = Path(file_path).read_text(encoding="utf-8", errors="ignore").splitlines() - except Exception: - continue - - for i, line in enumerate(lines): - kind = None - if suffix == ".py": - if py_class.search(line): - kind = "class" - elif py_def.search(line): - kind = "function" - else: - if ts_class.search(line): - kind = "class" - elif ts_func.search(line): - kind = "function" - - if not kind: - continue - - start_line = i + 1 - idx = line.find(term) - if idx >= 0: - start_character = idx + 1 - else: - stripped = line.lstrip() - start_character = (len(line) - len(stripped)) + 1 if stripped else 1 - - node_id = f"{file_path}:{term}:{start_line}" - if node_id in seed_ids: - break - - seed_ids.add(node_id) - seed_nodes.append( - CodeSymbolNode( - id=node_id, - name=term, - kind=kind, - file_path=file_path, - range=Range( - 
start_line=start_line, - start_character=start_character, - end_line=start_line, - end_character=start_character, - ), - ) - ) - break - except Exception: - pass - - for seed in selected_results: - if len(seed_nodes) >= max_expand: - break - if not seed.path: - continue - name = seed.symbol_name or Path(seed.path).stem - kind = seed.symbol_kind or "unknown" - start_line = int(seed.start_line or 1) - end_line = int(seed.end_line or start_line) - start_character = 1 - try: - if start_line >= 1: - line_text = Path(seed.path).read_text(encoding="utf-8", errors="ignore").splitlines()[start_line - 1] - if seed.symbol_name: - idx = line_text.find(seed.symbol_name) - if idx >= 0: - start_character = idx + 1 # 1-based for StandaloneLspManager - else: - stripped = line_text.lstrip() - if stripped: - start_character = (len(line_text) - len(stripped)) + 1 - except Exception: - start_character = 1 - node_id = f"{seed.path}:{name}:{start_line}" - if node_id in seed_ids: - continue - seed_ids.add(node_id) - seed_nodes.append( - CodeSymbolNode( - id=node_id, - name=name, - kind=kind, - file_path=seed.path, - range=Range( - start_line=start_line, - start_character=start_character, - end_line=end_line, - end_character=start_character if end_line == start_line else 1, - ), - raw_code=seed.content or "", - docstring=seed.excerpt or "", - ) - ) - - if not seed_nodes: - return coarse_results - - effective_warmup_s = warmup_s - - async def expand_graph(bridge: LspBridge): - # Warm up analysis: open seed docs and wait a bit so references/call hierarchy are populated. 
- if effective_warmup_s > 0: - for seed in seed_nodes[:3]: - try: - await bridge.get_document_symbols(seed.file_path) - except Exception: - continue - try: - warmup_budget = min(effective_warmup_s, max(0.0, timeout_s * 0.1)) - await asyncio.sleep(min(warmup_budget, max(0.0, timeout_s - 0.5))) - except Exception: - pass - builder = LspGraphBuilder( - max_depth=max_depth, - max_nodes=max_nodes, - max_concurrent=max(1, max_concurrent), - resolve_symbols=resolve_symbols, - ) - return await builder.build_from_seeds(seed_nodes, bridge) - - try: - try: - asyncio.get_running_loop() - has_running_loop = True - except RuntimeError: - has_running_loop = False - - if has_running_loop: - with ThreadPoolExecutor(max_workers=1) as executor: - async def _expand_once(): - async with LspBridge( - workspace_root=str(workspace_root), - config_file=str(lsp_config_file) if lsp_config_file else None, - timeout=timeout_s, - ) as bridge: - return await expand_graph(bridge) - - def _run(): - return asyncio.run(asyncio.wait_for(_expand_once(), timeout=timeout_s)) - - graph = executor.submit(_run).result(timeout=timeout_s + 1.0) - else: - from codexlens.lsp.keepalive_bridge import KeepAliveKey, KeepAliveLspBridge - - key = KeepAliveKey( - workspace_root=str(workspace_root), - config_file=str(lsp_config_file) if lsp_config_file else None, - timeout=float(timeout_s), - ) - warm_id = (key.workspace_root, key.config_file) - with self._realtime_lsp_keepalive_lock: - if warm_id in self._realtime_lsp_warmed_ids: - effective_warmup_s = 0.0 - keepalive = self._realtime_lsp_keepalive - if keepalive is None or self._realtime_lsp_keepalive_key != key: - if keepalive is not None: - try: - keepalive.stop() - except Exception: - pass - keepalive = KeepAliveLspBridge( - workspace_root=key.workspace_root, - config_file=key.config_file, - timeout=key.timeout, - ) - self._realtime_lsp_keepalive = keepalive - self._realtime_lsp_keepalive_key = key - - graph = keepalive.run(expand_graph, timeout=timeout_s) - with 
self._realtime_lsp_keepalive_lock: - self._realtime_lsp_warmed_ids.add(warm_id) - except Exception as exc: - self.logger.debug("Stage 2 (realtime) expansion failed: %r", exc) - return coarse_results - - try: - node_count = len(getattr(graph, "nodes", {}) or {}) - edge_count = len(getattr(graph, "edges", []) or []) - except Exception: - node_count, edge_count = 0, 0 - self.logger.debug( - "Stage 2 (realtime) graph built: seeds=%d nodes=%d edges=%d", - len(seed_nodes), - node_count, - edge_count, - ) - - related_results: List[SearchResult] = [] - for node_id, node in getattr(graph, "nodes", {}).items(): - if node_id in seed_ids or getattr(node, "id", "") in seed_ids: - continue - - try: - start_line = int(getattr(node.range, "start_line", 1) or 1) - end_line = int(getattr(node.range, "end_line", start_line) or start_line) - except Exception: - start_line, end_line = 1, 1 - - related_results.append( - SearchResult( - path=node.file_path, - score=0.5, - excerpt=None, - content=getattr(node, "raw_code", "") or None, - symbol_name=node.name, - symbol_kind=node.kind, - start_line=start_line, - end_line=end_line, - metadata={"stage2_mode": "realtime", "lsp_node_id": node_id}, - ) - ) - - if related_results: - self.logger.debug( - "Stage 2 (realtime) expanded %d base results to %d related symbols", - len(coarse_results), len(related_results) - ) - - return self._combine_stage2_results(coarse_results, related_results) - - def _combine_stage2_results( - self, - coarse_results: List[SearchResult], - related_results: List[SearchResult], - ) -> List[SearchResult]: - combined = list(coarse_results) - seen_keys = {(r.path, r.symbol_name, r.start_line) for r in coarse_results} - - for related in related_results: - key = (related.path, related.symbol_name, related.start_line) - if key not in seen_keys: - seen_keys.add(key) - combined.append(related) - - return combined - - def _collect_query_feature_anchor_results( - self, - query: str, - source_path: Path, - options: SearchOptions, 
- *, - limit: int, - ) -> List[SearchResult]: - """Collect small lexical anchor sets for explicit file/feature hints.""" - if limit <= 0: - return [] - - from codexlens.search.ranking import ( - QueryIntent, - _path_topic_tokens, - detect_query_intent, - extract_explicit_path_hints, - is_auxiliary_reference_path, - is_generated_artifact_path, - is_test_file, - query_targets_auxiliary_files, - query_targets_generated_files, - query_targets_test_files, - ) - - explicit_hints = extract_explicit_path_hints(query) - if not explicit_hints: - return [] - skip_test_files = query_targets_test_files(query) - skip_generated_files = query_targets_generated_files(query) - skip_auxiliary_files = query_targets_auxiliary_files(query) - - anchor_limit = max(1, int(limit)) - per_hint_limit = max(2, min(6, anchor_limit)) - seed_opts = SearchOptions( - depth=options.depth, - max_workers=options.max_workers, - limit_per_dir=max(10, per_hint_limit), - total_limit=max(anchor_limit, per_hint_limit * 2), - include_symbols=False, - include_semantic=False, - files_only=False, - code_only=options.code_only, - exclude_extensions=list(options.exclude_extensions or []), - enable_vector=False, - hybrid_mode=False, - pure_vector=False, - enable_cascade=False, - inject_feature_anchors=False, - ) - - anchors_by_path: Dict[str, SearchResult] = {} - for hint_tokens in explicit_hints: - hint_query = " ".join(hint_tokens) - try: - seed_result = self.search(hint_query, source_path, options=seed_opts) - except Exception as exc: - self.logger.debug( - "Feature anchor search failed for %r: %s", - hint_query, - exc, - ) - continue - - for candidate in seed_result.results: - _, basename_tokens = _path_topic_tokens(candidate.path) - if not basename_tokens or not all(token in basename_tokens for token in hint_tokens): - continue - if not skip_test_files and is_test_file(candidate.path): - continue - if not skip_generated_files and is_generated_artifact_path(candidate.path): - continue - if not 
skip_auxiliary_files and is_auxiliary_reference_path(candidate.path): - continue - metadata = { - **(candidate.metadata or {}), - "feature_query_anchor": True, - "feature_query_hint": hint_query, - "feature_query_hint_tokens": list(hint_tokens), - } - anchor = candidate.model_copy( - deep=True, - update={"metadata": metadata}, - ) - existing = anchors_by_path.get(anchor.path) - if existing is None or float(anchor.score) > float(existing.score): - anchors_by_path[anchor.path] = anchor - if len(anchors_by_path) >= anchor_limit: - break - if len(anchors_by_path) >= anchor_limit: - break - - query_intent = detect_query_intent(query) - if not anchors_by_path and query_intent in {QueryIntent.KEYWORD, QueryIntent.MIXED}: - lexical_query = (query or "").strip() - if lexical_query: - try: - seed_result = self.search(lexical_query, source_path, options=seed_opts) - except Exception as exc: - self.logger.debug( - "Lexical feature anchor search failed for %r: %s", - lexical_query, - exc, - ) - else: - for candidate in seed_result.results: - if not skip_test_files and is_test_file(candidate.path): - continue - if not skip_generated_files and is_generated_artifact_path(candidate.path): - continue - if not skip_auxiliary_files and is_auxiliary_reference_path(candidate.path): - continue - metadata = { - **(candidate.metadata or {}), - "feature_query_anchor": True, - "feature_query_hint": lexical_query, - "feature_query_hint_tokens": [], - "feature_query_seed_kind": "lexical_query", - } - anchor = candidate.model_copy( - deep=True, - update={"metadata": metadata}, - ) - existing = anchors_by_path.get(anchor.path) - if existing is None or float(anchor.score) > float(existing.score): - anchors_by_path[anchor.path] = anchor - if len(anchors_by_path) >= anchor_limit: - break - - return sorted( - anchors_by_path.values(), - key=lambda result: result.score, - reverse=True, - )[:anchor_limit] - - def _merge_query_feature_anchor_results( - self, - base_results: List[SearchResult], - 
anchor_results: List[SearchResult], - ) -> List[SearchResult]: - """Merge explicit feature anchors into coarse candidates with comparable scores.""" - if not anchor_results: - return sorted(base_results, key=lambda result: result.score, reverse=True) - - merged: Dict[str, SearchResult] = {result.path: result for result in base_results} - base_sorted = sorted(base_results, key=lambda result: result.score, reverse=True) - base_max = float(base_sorted[0].score) if base_sorted else 1.0 - if base_sorted: - cutoff_index = min(len(base_sorted) - 1, max(0, min(4, len(base_sorted) - 1))) - anchor_floor = float(base_sorted[cutoff_index].score) - else: - anchor_floor = base_max - if anchor_floor <= 0: - anchor_floor = max(base_max * 0.85, 0.01) - - for index, anchor in enumerate(anchor_results): - target_score = max( - anchor_floor, - base_max * max(0.75, 0.92 - (0.03 * index)), - 0.01, - ) - existing = merged.get(anchor.path) - existing_metadata = existing.metadata or {} if existing is not None else {} - metadata = { - **existing_metadata, - **(anchor.metadata or {}), - "feature_query_anchor": True, - } - if existing is not None: - target_score = max(float(existing.score), target_score) - merged[anchor.path] = existing.model_copy( - deep=True, - update={ - "score": target_score, - "metadata": metadata, - }, - ) - else: - merged[anchor.path] = anchor.model_copy( - deep=True, - update={ - "score": target_score, - "metadata": metadata, - }, - ) - - return sorted(merged.values(), key=lambda result: result.score, reverse=True) - - def _inject_query_feature_anchors( - self, - query: str, - source_path: Path, - options: SearchOptions, - base_results: List[SearchResult], - *, - limit: int, - ) -> List[SearchResult]: - """Inject explicit file/feature anchors into coarse candidate sets.""" - anchor_results = self._collect_query_feature_anchor_results( - query, - source_path, - options, - limit=limit, - ) - return self._merge_query_feature_anchor_results(base_results, anchor_results) - 
- @staticmethod - def _combine_stage3_anchor_results( - anchor_results: List[SearchResult], - clustered_results: List[SearchResult], - *, - target_count: int, - ) -> List[SearchResult]: - """Combine preserved query anchors with Stage 3 representatives.""" - if target_count <= 0: - return [] - merged: List[SearchResult] = [] - seen: set[tuple[str, Optional[str], Optional[int]]] = set() - for result in [*anchor_results, *clustered_results]: - key = (result.path, result.symbol_name, result.start_line) - if key in seen: - continue - seen.add(key) - merged.append(result) - if len(merged) >= target_count: - break - return merged - - def _select_stage3_query_anchor_results( - self, - query: str, - expanded_results: List[SearchResult], - *, - limit: int, - ) -> List[SearchResult]: - """Select a small number of explicit feature anchors to preserve through clustering.""" - if limit <= 0 or not expanded_results: - return [] - - ranked_results = self._apply_default_path_penalties(query, expanded_results) - anchors: List[SearchResult] = [] - seen: set[tuple[str, Optional[str], Optional[int]]] = set() - for result in ranked_results: - metadata = result.metadata or {} - if not metadata.get("feature_query_anchor"): - continue - key = (result.path, result.symbol_name, result.start_line) - if key in seen: - continue - seen.add(key) - anchors.append(result) - if len(anchors) >= limit: - break - return anchors - - def _find_lsp_workspace_root(self, start_path: Path) -> Path: - """Best-effort workspace root selection for LSP initialization. - - Many language servers (e.g. Pyright) use workspace-relative include/exclude - patterns, so using a deep subdir (like "src") as root can break reference - and call-hierarchy queries. - """ - start = Path(start_path).resolve() - if start.is_file(): - start = start.parent - - # Prefer an explicit LSP config file in the workspace. 
- for current in [start, *list(start.parents)]: - try: - if (current / "lsp-servers.json").is_file(): - return current - except OSError: - continue - - # Fallback heuristics for project root markers. - for current in [start, *list(start.parents)]: - try: - if (current / ".git").exists() or (current / "pyproject.toml").is_file(): - return current - except OSError: - continue - - return start - - def _find_lsp_config_file(self, start_path: Path) -> Optional[Path]: - """Find a lsp-servers.json by walking up from start_path.""" - start = Path(start_path).resolve() - if start.is_file(): - start = start.parent - - for current in [start, *list(start.parents)]: - try: - candidate = current / "lsp-servers.json" - if candidate.is_file(): - return candidate - except OSError: - continue - return None - - def _stage3_cluster_prune( - self, - expanded_results: List[SearchResult], - target_count: int, - query: Optional[str] = None, - ) -> List[SearchResult]: - """Stage 3: Cluster expanded results and select representatives. - - Uses the extensible clustering infrastructure from codexlens.search.clustering - to group similar results and select the best representative from each cluster. 
- - Args: - expanded_results: Results from Stage 2 expansion - target_count: Target number of representative results - - Returns: - List of representative results (one per cluster) - """ - if not expanded_results: - return [] - - original_target_count = target_count - anchor_results: List[SearchResult] = [] - if query: - anchor_results = self._select_stage3_query_anchor_results( - query, - expanded_results, - limit=min(4, max(1, original_target_count // 4)), - ) - if anchor_results: - anchor_keys = { - (result.path, result.symbol_name, result.start_line) - for result in anchor_results - } - expanded_results = [ - result - for result in expanded_results - if (result.path, result.symbol_name, result.start_line) not in anchor_keys - ] - target_count = max(0, original_target_count - len(anchor_results)) - if target_count <= 0: - return anchor_results[:original_target_count] - - if not expanded_results: - return self._combine_stage3_anchor_results( - anchor_results, - [], - target_count=original_target_count, - ) - - # If few results, skip clustering - if len(expanded_results) <= target_count: - return self._combine_stage3_anchor_results( - anchor_results, - expanded_results, - target_count=original_target_count, - ) - - strategy_name = "auto" - if self._config is not None: - strategy_name = getattr(self._config, "staged_clustering_strategy", "auto") or "auto" - strategy_name = str(strategy_name).strip().lower() - - if strategy_name in {"noop", "none", "off"}: - return self._combine_stage3_anchor_results( - anchor_results, - sorted(expanded_results, key=lambda r: r.score, reverse=True)[:target_count], - target_count=original_target_count, - ) - - if strategy_name in {"score", "top", "rank"}: - return self._combine_stage3_anchor_results( - anchor_results, - sorted(expanded_results, key=lambda r: r.score, reverse=True)[:target_count], - target_count=original_target_count, - ) - - if strategy_name in {"path", "file"}: - best_by_path: Dict[str, SearchResult] = {} - for r in 
expanded_results: - if not r.path: - continue - key = str(r.path).lower() - if key not in best_by_path or r.score > best_by_path[key].score: - best_by_path[key] = r - candidates = list(best_by_path.values()) or expanded_results - candidates.sort(key=lambda r: r.score, reverse=True) - return self._combine_stage3_anchor_results( - anchor_results, - candidates[:target_count], - target_count=original_target_count, - ) - - if strategy_name in {"dir_rr", "rr_dir", "round_robin_dir"}: - results_sorted = sorted(expanded_results, key=lambda r: r.score, reverse=True) - buckets: Dict[str, List[SearchResult]] = {} - dir_order: List[str] = [] - for r in results_sorted: - try: - d = str(Path(r.path).parent).lower() - except Exception: - d = "" - if d not in buckets: - buckets[d] = [] - dir_order.append(d) - buckets[d].append(r) - - out: List[SearchResult] = [] - while len(out) < target_count: - progressed = False - for d in dir_order: - if not buckets.get(d): - continue - out.append(buckets[d].pop(0)) - progressed = True - if len(out) >= target_count: - break - if not progressed: - break - return self._combine_stage3_anchor_results( - anchor_results, - out, - target_count=original_target_count, - ) - - try: - from codexlens.search.clustering import ( - ClusteringConfig, - get_strategy, - ) - - # Get clustering config from config - strategy_name = "auto" - min_cluster_size = 3 - - if self._config is not None: - strategy_name = getattr(self._config, "staged_clustering_strategy", "auto") - min_cluster_size = getattr(self._config, "staged_clustering_min_size", 3) - - # Get embeddings for clustering - # Try to get dense embeddings from results' content - embeddings = self._get_embeddings_for_clustering(expanded_results) - - if embeddings is None or len(embeddings) == 0: - # No embeddings available, fall back to score-based selection - self.logger.debug("No embeddings for clustering, using score-based selection") - return self._combine_stage3_anchor_results( - anchor_results, - 
sorted(expanded_results, key=lambda r: r.score, reverse=True)[:target_count], - target_count=original_target_count, - ) - - # Create clustering config - config = ClusteringConfig( - min_cluster_size=min(min_cluster_size, max(2, len(expanded_results) // 5)), - min_samples=2, - metric="cosine", - ) - - # Get strategy with fallback - strategy = get_strategy(strategy_name, config, fallback=True) - - # Cluster and select representatives - representatives = strategy.fit_predict(embeddings, expanded_results) - - self.logger.debug( - "Stage 3 clustered %d results into %d representatives using %s", - len(expanded_results), len(representatives), type(strategy).__name__ - ) - - # If clustering returned too few, supplement with top-scored unclustered - if len(representatives) < target_count: - rep_paths = {r.path for r in representatives} - remaining = [r for r in expanded_results if r.path not in rep_paths] - remaining_sorted = sorted(remaining, key=lambda r: r.score, reverse=True) - representatives.extend(remaining_sorted[:target_count - len(representatives)]) - - return self._combine_stage3_anchor_results( - anchor_results, - representatives[:target_count], - target_count=original_target_count, - ) - - except ImportError as exc: - self.logger.debug("Clustering not available: %s", exc) - return self._combine_stage3_anchor_results( - anchor_results, - sorted(expanded_results, key=lambda r: r.score, reverse=True)[:target_count], - target_count=original_target_count, - ) - except Exception as exc: - self.logger.debug("Stage 3 clustering failed: %s", exc) - return self._combine_stage3_anchor_results( - anchor_results, - sorted(expanded_results, key=lambda r: r.score, reverse=True)[:target_count], - target_count=original_target_count, - ) - - def _stage4_optional_rerank( - self, - query: str, - clustered_results: List[SearchResult], - k: int, - ) -> List[SearchResult]: - """Stage 4: Optional cross-encoder reranking. - - Applies cross-encoder reranking if enabled in config. 
- - Args: - query: Search query string - clustered_results: Results from Stage 3 clustering - k: Requested final result count before downstream path penalties - - Returns: - Reranked results sorted by cross-encoder score. This can exceed the - requested final ``k`` so the caller can still demote noisy test or - generated hits before applying the final trim. - """ - if not clustered_results: - return [] - - rerank_limit = self._resolve_rerank_candidate_limit( - k, - len(clustered_results), - ) - return self._cross_encoder_rerank(query, clustered_results, rerank_limit) - - def _get_embeddings_for_clustering( - self, - results: List[SearchResult], - ) -> Optional["np.ndarray"]: - """Get dense embeddings for clustering results. - - Tries to generate embeddings from result content for clustering. - - Args: - results: List of SearchResult objects - - Returns: - NumPy array of embeddings or None if not available - """ - if not NUMPY_AVAILABLE: - return None - - if not results: - return None - - try: - from codexlens.semantic.factory import get_embedder - - # Get embedding settings from config - embedding_backend = "fastembed" - embedding_model = "code" - use_gpu = True - - if self._config is not None: - embedding_backend = getattr(self._config, "embedding_backend", "fastembed") - embedding_model = getattr(self._config, "embedding_model", "code") - use_gpu = getattr(self._config, "embedding_use_gpu", True) - - # Create embedder - if embedding_backend == "litellm": - embedder = get_embedder(backend="litellm", model=embedding_model) - else: - embedder = get_embedder(backend="fastembed", profile=embedding_model, use_gpu=use_gpu) - - # Extract text content from results - texts = [] - for result in results: - # Use content if available, otherwise use excerpt - text = result.content or result.excerpt or "" - if not text and result.path: - text = result.path - texts.append(text[:2000]) # Limit text length - - # Generate embeddings - embeddings = embedder.embed_to_numpy(texts) - 
return embeddings - - except ImportError as exc: - self.logger.debug("Embedder not available for clustering: %s", exc) - return None - except Exception as exc: - self.logger.debug("Failed to generate embeddings for clustering: %s", exc) - return None - - def binary_rerank_cascade_search( - self, - query: str, - source_path: Path, - k: int = 10, - coarse_k: int = 100, - options: Optional[SearchOptions] = None, - ) -> ChainSearchResult: - """Execute binary cascade search with cross-encoder reranking. - - Combines the speed of binary vector coarse search with the quality of - cross-encoder reranking for the best balance of speed and accuracy. - - Binary + Reranker cascade process: - 1. Stage 1 (Coarse): Fast binary vector search using Hamming distance - to quickly filter to coarse_k candidates (256-dim binary, 32 bytes/vector) - 2. Stage 2 (Fine): Cross-encoder reranking for precise semantic ranking - of candidates using query-document attention - - This approach is typically faster than binary_cascade_search while - achieving similar or better quality through cross-encoder reranking. - - Performance characteristics: - - Binary search: O(N) with SIMD-accelerated XOR + popcount (~8ms) - - Cross-encoder: Applied to top coarse_k candidates (~15-20s for API) - - Total: Faster coarse + high-quality fine = best balance - - Args: - query: Natural language or keyword query string - source_path: Starting directory path - k: Number of final results to return (default 10) - coarse_k: Number of coarse candidates from first stage (default 100) - options: Search configuration (uses defaults if None) - - Returns: - ChainSearchResult with cross-encoder reranked results and statistics - - Examples: - >>> engine = ChainSearchEngine(registry, mapper, config=config) - >>> result = engine.binary_rerank_cascade_search( - ... "how to authenticate users", - ... Path("D:/project/src"), - ... k=10, - ... coarse_k=100 - ... ) - >>> for r in result.results: - ... 
print(f"{r.path}: {r.score:.3f}") - """ - if not NUMPY_AVAILABLE: - self.logger.warning( - "NumPy not available, falling back to standard search" - ) - return self.search(query, source_path, options=options) - - options = options or SearchOptions() - start_time = time.time() - stats = SearchStats() - - # Use config defaults if available - if self._config is not None: - if hasattr(self._config, "cascade_coarse_k"): - coarse_k = coarse_k or self._config.cascade_coarse_k - if hasattr(self._config, "cascade_fine_k"): - k = k or self._config.cascade_fine_k - - # Step 1: Find starting index - start_index = self._find_start_index(source_path) - if not start_index: - self.logger.warning(f"No index found for {source_path}") - stats.time_ms = (time.time() - start_time) * 1000 - return ChainSearchResult( - query=query, - results=[], - symbols=[], - stats=stats - ) - - # Step 2: Collect all index paths - index_paths = self._collect_index_paths(start_index, options.depth) - stats.dirs_searched = len(index_paths) - - if not index_paths: - self.logger.warning(f"No indexes collected from {start_index}") - stats.time_ms = (time.time() - start_time) * 1000 - return ChainSearchResult( - query=query, - results=[], - symbols=[], - stats=stats - ) - - # Step 4: Binary coarse search (same as binary_cascade_search) - binary_coarse_time = time.time() - coarse_candidates, _, _, stage2_index_root = self._collect_binary_coarse_candidates( - query, - index_paths, - coarse_k, - stats, - index_root=index_paths[0].parent if index_paths else None, - ) - - if not coarse_candidates: - self.logger.info("No binary candidates found, falling back to standard search for reranking") - # Fall back to standard search which uses FTS+Vector - return self.search(query, source_path, options=options) - - # Sort by Hamming distance and take top coarse_k - coarse_candidates.sort(key=lambda x: x[1]) - coarse_candidates = coarse_candidates[:coarse_k] - - self.logger.debug( - "Binary coarse search: %d candidates in 
%.2fms", - len(coarse_candidates), (time.time() - binary_coarse_time) * 1000 - ) - - coarse_results = self._materialize_binary_candidates( - coarse_candidates, - stats, - stage2_index_root=stage2_index_root, - ) - - if not coarse_results: - stats.time_ms = (time.time() - start_time) * 1000 - return ChainSearchResult( - query=query, results=[], symbols=[], stats=stats - ) - - coarse_results = self._inject_query_feature_anchors( - query, - source_path, - options, - coarse_results, - limit=min(6, max(2, k)), - ) - - self.logger.debug( - "Retrieved %d chunks for cross-encoder reranking", len(coarse_results) - ) - - # Step 6: Cross-encoder reranking - rerank_time = time.time() - rerank_limit = self._resolve_rerank_candidate_limit(k, len(coarse_results)) - reranked_results = self._cross_encoder_rerank( - query, - coarse_results, - top_k=rerank_limit, - ) - - self.logger.debug( - "Cross-encoder reranking: %d results in %.2fms", - len(reranked_results), (time.time() - rerank_time) * 1000 - ) - - # Deduplicate by path (keep highest score) - path_to_result: Dict[str, SearchResult] = {} - for result in reranked_results: - if result.path not in path_to_result or result.score > path_to_result[result.path].score: - path_to_result[result.path] = result - - final_results = self._apply_default_path_penalties( - query, - list(path_to_result.values()), - )[:k] - - stats.files_matched = len(final_results) - stats.time_ms = (time.time() - start_time) * 1000 - - self.logger.debug( - "Binary+Rerank cascade search complete: %d results in %.2fms", - len(final_results), - stats.time_ms, - ) - - return ChainSearchResult( - query=query, - results=final_results, - symbols=[], - stats=stats, - ) - - def dense_rerank_cascade_search( - self, - query: str, - source_path: Path, - k: int = 10, - coarse_k: int = 100, - options: Optional[SearchOptions] = None, - ) -> ChainSearchResult: - """Execute dense cascade search with cross-encoder reranking. 
- - Combines dense vector coarse search (HNSW) with cross-encoder reranking - for comparison with binary_rerank strategy. - - Dense + Reranker cascade process: - 1. Stage 1 (Coarse): Dense vector search using HNSW (cosine similarity) - to get coarse_k candidates (2048-dim float32) - 2. Stage 2 (Fine): Cross-encoder reranking for precise semantic ranking - - Args: - query: Natural language or keyword query string - source_path: Starting directory path - k: Number of final results to return (default 10) - coarse_k: Number of coarse candidates from first stage (default 100) - options: Search configuration (uses defaults if None) - - Returns: - ChainSearchResult with cross-encoder reranked results and statistics - """ - options = options or SearchOptions() - - if query_prefers_lexical_search(query): - self.logger.debug( - "Dense rerank shortcut: using lexical search for lexical-priority query %r", - query, - ) - lexical_options = SearchOptions( - depth=options.depth, - max_workers=options.max_workers, - limit_per_dir=max(options.limit_per_dir, max(10, k)), - total_limit=max(options.total_limit, max(20, k * 4)), - offset=options.offset, - include_symbols=False, - files_only=options.files_only, - include_semantic=False, - code_only=options.code_only, - exclude_extensions=list(options.exclude_extensions or []), - hybrid_mode=False, - enable_fuzzy=True, - enable_vector=False, - pure_vector=False, - enable_cascade=False, - hybrid_weights=None, - group_results=options.group_results, - grouping_threshold=options.grouping_threshold, - inject_feature_anchors=options.inject_feature_anchors, - ) - lexical_result = self.search(query, source_path, options=lexical_options) - return ChainSearchResult( - query=query, - results=lexical_result.results, - related_results=lexical_result.related_results, - symbols=[], - stats=lexical_result.stats, - ) - - if not NUMPY_AVAILABLE: - self.logger.warning( - "NumPy not available, falling back to standard search" - ) - return self.search(query, 
source_path, options=options) - start_time = time.time() - stats = SearchStats() - - # Use config defaults if available - if self._config is not None: - if hasattr(self._config, "cascade_coarse_k"): - coarse_k = coarse_k or self._config.cascade_coarse_k - if hasattr(self._config, "cascade_fine_k"): - k = k or self._config.cascade_fine_k - - # Step 1: Find starting index - start_index = self._find_start_index(source_path) - if not start_index: - self.logger.warning(f"No index found for {source_path}") - stats.time_ms = (time.time() - start_time) * 1000 - return ChainSearchResult( - query=query, - results=[], - symbols=[], - stats=stats - ) - - # Step 2: Collect all index paths - index_paths = self._collect_index_paths(start_index, options.depth) - stats.dirs_searched = len(index_paths) - - if not index_paths: - self.logger.warning(f"No indexes collected from {start_index}") - stats.time_ms = (time.time() - start_time) * 1000 - return ChainSearchResult( - query=query, - results=[], - symbols=[], - stats=stats - ) - - # Step 3-5: Group child indexes by centralized dense vector root and search each root. 
- dense_coarse_time = time.time() - coarse_candidates: List[Tuple[int, float, Path]] = [] # (chunk_id, distance, index_path) - central_index_roots: Dict[Path, Path] = {} - dense_root_groups, dense_fallback_index_paths = self._group_index_paths_by_dense_root(index_paths) - dense_query_cache: Dict[Tuple[str, str, bool], "np.ndarray"] = {} - try: - from codexlens.semantic.ann_index import ANNIndex - - dense_candidate_groups: List[List[Tuple[int, float, Path]]] = [] - dense_roots_by_settings = self._group_dense_roots_by_embedding_settings( - dense_root_groups - ) - if len(dense_roots_by_settings) > 1: - self.logger.debug( - "Dense coarse search detected %d embedding setting groups; interleaving candidates across groups", - len(dense_roots_by_settings), - ) - - for dense_roots in dense_roots_by_settings.values(): - group_candidates: List[Tuple[int, float, Path]] = [] - for dense_root in dense_roots: - try: - query_dense = self._embed_dense_query( - query, - index_root=dense_root, - query_cache=dense_query_cache, - ) - ann_index = self._get_cached_centralized_dense_index( - dense_root, - int(query_dense.shape[0]), - ) - if ann_index is None: - continue - - ids, distances = ann_index.search(query_dense, top_k=coarse_k) - central_index_db = dense_root / "_index.db" - central_index_roots[central_index_db] = dense_root - for chunk_id, dist in zip(ids, distances): - group_candidates.append((chunk_id, dist, central_index_db)) - if ids: - self.logger.debug( - "Centralized dense search: %d candidates from %s", - len(ids), - dense_root / VECTORS_HNSW_NAME, - ) - except Exception as exc: - self.logger.debug( - "Centralized dense search failed for %s: %s", - dense_root, - exc, - ) - if group_candidates: - dense_candidate_groups.append(group_candidates) - - coarse_candidates = self._interleave_dense_candidate_groups( - dense_candidate_groups, - coarse_k, - ) - - if not coarse_candidates: - fallback_index_paths = dense_fallback_index_paths if dense_root_groups else index_paths - 
fallback_candidate_groups: List[List[Tuple[int, float, Path]]] = [] - fallback_index_groups = self._group_dense_index_paths_by_embedding_settings( - fallback_index_paths - ) - if len(fallback_index_groups) > 1: - self.logger.debug( - "Legacy dense fallback detected %d embedding setting groups; interleaving candidates across groups", - len(fallback_index_groups), - ) - for grouped_index_paths in fallback_index_groups.values(): - group_candidates: List[Tuple[int, float, Path]] = [] - for index_path in grouped_index_paths: - try: - query_dense = self._embed_dense_query( - query, - index_root=index_path.parent, - query_cache=dense_query_cache, - ) - ann_index = self._get_cached_legacy_dense_index( - index_path, - int(query_dense.shape[0]), - ) - if ann_index is None: - continue - - ids, distances = ann_index.search(query_dense, top_k=coarse_k) - for chunk_id, dist in zip(ids, distances): - group_candidates.append((chunk_id, dist, index_path)) - except Exception as exc: - self.logger.debug( - "Dense search failed for %s: %s", index_path, exc - ) - if group_candidates: - fallback_candidate_groups.append(group_candidates) - - coarse_candidates = self._interleave_dense_candidate_groups( - fallback_candidate_groups, - coarse_k, - ) - except Exception as exc: - self.logger.warning(f"Failed to prepare dense coarse search: {exc}") - return self.search(query, source_path, options=options) - - if not coarse_candidates: - self.logger.info("No dense candidates found, falling back to standard search") - return self.search(query, source_path, options=options) - - self.logger.debug( - "Dense coarse search: %d candidates in %.2fms", - len(coarse_candidates), (time.time() - dense_coarse_time) * 1000 - ) - - # Step 6: Build SearchResult objects for cross-encoder reranking - candidates_by_index: Dict[Path, List[int]] = {} - for chunk_id, distance, index_path in coarse_candidates: - if index_path not in candidates_by_index: - candidates_by_index[index_path] = [] - 
candidates_by_index[index_path].append(chunk_id) - - # Retrieve chunk content for reranking - import sqlite3 - coarse_results: List[SearchResult] = [] - - for index_path, chunk_ids in candidates_by_index.items(): - try: - central_root = central_index_roots.get(index_path) - if central_root is not None: - # Use centralized metadata from _vectors_meta.db - meta_db_path = central_root / "_vectors_meta.db" - if meta_db_path.exists(): - conn = sqlite3.connect(str(meta_db_path)) - conn.row_factory = sqlite3.Row - placeholders = ",".join("?" * len(chunk_ids)) - cursor = conn.execute( - f""" - SELECT chunk_id, file_path, content, start_line, end_line - FROM chunk_metadata - WHERE chunk_id IN ({placeholders}) - """, - chunk_ids - ) - chunks_data = [ - { - "id": row["chunk_id"], - "file_path": row["file_path"], - "content": row["content"], - "metadata": json.dumps({ - "start_line": row["start_line"], - "end_line": row["end_line"] - }), - "category": "code" if row["file_path"].endswith(('.py', '.ts', '.js', '.java', '.go', '.rs', '.cpp', '.c')) else "doc", - } - for row in cursor.fetchall() - ] - conn.close() - else: - chunks_data = [] - else: - # Fall back to per-directory semantic_chunks table - conn = sqlite3.connect(str(index_path)) - conn.row_factory = sqlite3.Row - placeholders = ",".join("?" 
* len(chunk_ids)) - cursor = conn.execute( - f""" - SELECT id, file_path, content, metadata, category - FROM semantic_chunks - WHERE id IN ({placeholders}) - """, - chunk_ids - ) - chunks_data = [ - { - "id": row["id"], - "file_path": row["file_path"], - "content": row["content"], - "metadata": row["metadata"], - "category": row["category"], - } - for row in cursor.fetchall() - ] - conn.close() - - for chunk in chunks_data: - chunk_id = chunk.get("id") - distance = next( - ( - d - for cid, d, candidate_index_path in coarse_candidates - if cid == chunk_id and candidate_index_path == index_path - ), - 1.0 - ) - # Convert cosine distance to score (clamp to [0, 1] for Pydantic validation) - # Cosine distance can be > 1 for anti-correlated vectors, causing negative scores - score = max(0.0, 1.0 - distance) - - content = chunk.get("content", "") - result = SearchResult( - path=chunk.get("file_path", ""), - score=float(score), - excerpt=content[:500] if content else "", - content=content, - ) - coarse_results.append(result) - except Exception as exc: - self.logger.debug( - "Failed to retrieve chunks from %s: %s", index_path, exc - ) - - if not coarse_results: - stats.time_ms = (time.time() - start_time) * 1000 - return ChainSearchResult( - query=query, results=[], symbols=[], stats=stats - ) - - coarse_results = self._inject_query_feature_anchors( - query, - source_path, - options, - coarse_results, - limit=min(6, max(2, k)), - ) - - self.logger.debug( - "Retrieved %d chunks for cross-encoder reranking", len(coarse_results) - ) - - # Step 6: Cross-encoder reranking - rerank_time = time.time() - rerank_limit = self._resolve_rerank_candidate_limit(k, len(coarse_results)) - reranked_results = self._cross_encoder_rerank( - query, - coarse_results, - top_k=rerank_limit, - ) - - self.logger.debug( - "Cross-encoder reranking: %d results in %.2fms", - len(reranked_results), (time.time() - rerank_time) * 1000 - ) - - # Deduplicate by path (keep highest score) - path_to_result: 
Dict[str, SearchResult] = {} - for result in reranked_results: - if result.path not in path_to_result or result.score > path_to_result[result.path].score: - path_to_result[result.path] = result - - final_results = self._apply_default_path_penalties( - query, - list(path_to_result.values()), - )[:k] - - stats.files_matched = len(final_results) - stats.time_ms = (time.time() - start_time) * 1000 - - self.logger.debug( - "Dense+Rerank cascade search complete: %d results in %.2fms", - len(final_results), - stats.time_ms, - ) - - return ChainSearchResult( - query=query, - results=final_results, - symbols=[], - stats=stats, - ) - - def _get_or_create_binary_index(self, index_path: Path) -> Optional[Any]: - """Get or create a BinaryANNIndex for the given index path. - - .. deprecated:: - This method uses the deprecated BinaryANNIndex. For centralized indexes, - use _get_centralized_binary_searcher() instead. - - Attempts to load an existing binary index from disk. If not found, - returns None (binary index should be built during indexing). - - Args: - index_path: Path to the _index.db file - - Returns: - BinaryANNIndex instance or None if not available - """ - try: - import warnings - # Suppress deprecation warning since we're using it intentionally for legacy support - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=DeprecationWarning) - from codexlens.semantic.ann_index import BinaryANNIndex - - binary_index = BinaryANNIndex(index_path, dim=256) - if binary_index.load(): - return binary_index - return None - except Exception as exc: - self.logger.debug("Failed to load binary index for %s: %s", index_path, exc) - return None - - def _get_centralized_binary_searcher(self, index_root: Path) -> Optional[Any]: - """Get centralized BinarySearcher for memory-mapped binary vectors. - - This is the preferred method for centralized indexes, providing faster - search via memory-mapped files. 
- - Args: - index_root: Root directory containing centralized index files - - Returns: - BinarySearcher instance or None if not available - """ - try: - from codexlens.search.binary_searcher import BinarySearcher - - binary_searcher = BinarySearcher(index_root) - if binary_searcher.load(): - self.logger.debug( - "Using centralized BinarySearcher with %d vectors (mmap=%s)", - binary_searcher.vector_count, - binary_searcher.is_memmap - ) - return binary_searcher - return None - except Exception as exc: - self.logger.debug("Failed to load centralized binary searcher: %s", exc) - return None - - def _find_nearest_binary_mmap_root(self, index_root: Path, *, max_levels: int = 10) -> Path: - """Walk up index_root parents to find the nearest centralized binary mmap. - - Centralized staged-binary artifacts are stored at a project index root - (e.g. `.../project/src/_binary_vectors.mmap`), but staged search often starts - from the nearest ancestor `_index.db` path, which can be nested deeper. - - This helper makes Stage 1 robust by locating the nearest ancestor directory - that contains the centralized `_binary_vectors.mmap`. 
- """ - current_dir = Path(index_root).resolve() - for _ in range(max(0, int(max_levels)) + 1): - try: - if (current_dir / BINARY_VECTORS_MMAP_NAME).exists(): - return current_dir - except Exception: - return Path(index_root).resolve() - - parent = current_dir.parent - if parent == current_dir: - break - current_dir = parent - - return Path(index_root).resolve() - - def _find_nearest_dense_hnsw_root( - self, - index_root: Path, - *, - max_levels: int = 10, - ) -> Optional[Path]: - """Walk up index_root parents to find the nearest centralized dense HNSW root.""" - - current_dir = Path(index_root).resolve() - for _ in range(max(0, int(max_levels)) + 1): - try: - if (current_dir / VECTORS_HNSW_NAME).exists(): - return current_dir - except Exception: - return None - - parent = current_dir.parent - if parent == current_dir: - break - current_dir = parent - - return None - - def _group_index_paths_by_binary_root( - self, - index_paths: List[Path], - *, - preferred_root: Optional[Path] = None, - ) -> Tuple[List[Path], List[Path]]: - """Group collected indexes by centralized binary mmap root.""" - - grouped: Dict[Path, List[Path]] = {} - ungrouped: List[Path] = [] - preferred_root = ( - Path(preferred_root).resolve() - if preferred_root is not None - else None - ) - - for index_path in index_paths: - candidate_roots: List[Path] = [index_path.parent] - if preferred_root is not None and preferred_root != index_path.parent: - candidate_roots.append(preferred_root) - - resolved_root: Optional[Path] = None - for candidate_root in candidate_roots: - found_root = self._find_nearest_binary_mmap_root(candidate_root) - if (found_root / BINARY_VECTORS_MMAP_NAME).exists(): - resolved_root = found_root - break - - if resolved_root is None: - ungrouped.append(index_path) - continue - - grouped.setdefault(resolved_root, []).append(index_path) - - return [root for root in grouped if grouped[root]], ungrouped - - def _group_index_paths_by_dense_root( - self, - index_paths: List[Path], - ) 
-> Tuple[List[Path], List[Path]]: - """Group collected indexes by centralized dense HNSW root.""" - - grouped: Dict[Path, List[Path]] = {} - ungrouped: List[Path] = [] - - for index_path in index_paths: - dense_root = self._find_nearest_dense_hnsw_root(index_path.parent) - if dense_root is None: - ungrouped.append(index_path) - continue - grouped.setdefault(dense_root, []).append(index_path) - - return [root for root in grouped if grouped[root]], ungrouped - - def _group_dense_roots_by_embedding_settings( - self, - dense_roots: List[Path], - ) -> Dict[Tuple[str, str, bool], List[Path]]: - """Group dense roots by the embedding settings used to build them.""" - grouped: Dict[Tuple[str, str, bool], List[Path]] = {} - for dense_root in dense_roots: - settings = self._resolve_dense_embedding_settings(index_root=dense_root) - grouped.setdefault(settings, []).append(dense_root) - return grouped - - def _group_dense_index_paths_by_embedding_settings( - self, - index_paths: List[Path], - ) -> Dict[Tuple[str, str, bool], List[Path]]: - """Group legacy dense ANN indexes by the embedding settings used to query them.""" - grouped: Dict[Tuple[str, str, bool], List[Path]] = {} - for index_path in index_paths: - settings = self._resolve_dense_embedding_settings( - index_root=index_path.parent, - ) - grouped.setdefault(settings, []).append(index_path) - return grouped - - @staticmethod - def _interleave_dense_candidate_groups( - candidate_groups: List[List[Tuple[int, float, Path]]], - limit: int, - ) -> List[Tuple[int, float, Path]]: - """Interleave locally ranked dense candidates from mixed embedding groups.""" - if limit <= 0: - return [] - - ordered_groups = [ - sorted(group, key=lambda item: item[1]) - for group in candidate_groups - if group - ] - if not ordered_groups: - return [] - if len(ordered_groups) == 1: - return ordered_groups[0][:limit] - - merged: List[Tuple[int, float, Path]] = [] - offsets = [0 for _ in ordered_groups] - while len(merged) < limit: - made_progress 
= False - for group_index, group in enumerate(ordered_groups): - offset = offsets[group_index] - if offset >= len(group): - continue - merged.append(group[offset]) - offsets[group_index] += 1 - made_progress = True - if len(merged) >= limit: - break - if not made_progress: - break - return merged - - def _resolve_dense_embedding_settings( - self, - *, - index_root: Optional[Path], - ) -> Tuple[str, str, bool]: - """Resolve embedding backend/profile for a dense vector root.""" - - embedding_backend = "litellm" - embedding_model = "qwen3-embedding-sf" - use_gpu = True - loaded_from_root = False - - if index_root is not None: - central_index_db = index_root / "_index.db" - if central_index_db.exists(): - try: - from codexlens.semantic.vector_store import VectorStore - - with VectorStore(central_index_db) as vs: - model_config = vs.get_model_config() - if model_config: - embedding_backend = str( - model_config.get("backend", embedding_backend) - ) - if embedding_backend == "litellm": - embedding_model = str( - model_config.get("model_name", embedding_model) - ) - else: - embedding_model = str( - model_config.get( - "model_profile", - model_config.get("model_name", embedding_model), - ) - ) - loaded_from_root = True - except Exception as exc: - self.logger.debug( - "Failed to read dense embedding config from %s: %s", - central_index_db, - exc, - ) - - if self._config is not None: - if not loaded_from_root: - config_backend = getattr(self._config, "embedding_backend", None) - config_model = getattr(self._config, "embedding_model", None) - if config_backend: - embedding_backend = str(config_backend) - if config_model: - embedding_model = str(config_model) - use_gpu = bool(getattr(self._config, "embedding_use_gpu", True)) - - return embedding_backend, embedding_model, use_gpu - - def _embed_dense_query( - self, - query: str, - *, - index_root: Optional[Path], - query_cache: Optional[Dict[Tuple[str, str, bool], "np.ndarray"]] = None, - ) -> "np.ndarray": - """Embed a query 
using the model configuration associated with a dense root.""" - - from codexlens.semantic.factory import get_embedder - - embedding_backend, embedding_model, use_gpu = self._resolve_dense_embedding_settings( - index_root=index_root, - ) - cache_key = (embedding_backend, embedding_model, use_gpu) - if query_cache is not None and cache_key in query_cache: - return query_cache[cache_key] - - if embedding_backend == "litellm": - embedder = get_embedder(backend="litellm", model=embedding_model) - else: - embedder = get_embedder( - backend="fastembed", - profile=embedding_model, - use_gpu=use_gpu, - ) - - query_dense = embedder.embed_to_numpy([query])[0] - if query_cache is not None: - query_cache[cache_key] = query_dense - - self.logger.debug( - "Dense query embedding: %d-dim via %s/%s", - int(query_dense.shape[0]), - embedding_backend, - embedding_model, - ) - return query_dense - - def _embed_query_for_binary_searcher( - self, - query: str, - *, - binary_searcher: Any, - query_cache: Optional[Dict[Tuple[str, str, bool], "np.ndarray"]] = None, - ) -> "np.ndarray": - """Embed a query using the model configuration exposed by BinarySearcher.""" - - use_gpu = True - if self._config is not None: - use_gpu = getattr(self._config, "embedding_use_gpu", True) - - query_dense = None - backend = getattr(binary_searcher, "backend", None) - model = getattr(binary_searcher, "model", None) - profile = getattr(binary_searcher, "model_profile", None) or "code" - cache_key = ( - str(backend or "fastembed"), - str(model or profile), - bool(use_gpu), - ) - - if query_cache is not None and cache_key in query_cache: - return query_cache[cache_key] - - if backend == "litellm": - try: - from codexlens.semantic.factory import get_embedder as get_factory_embedder - - embedder = get_factory_embedder( - backend="litellm", - model=model or "code", - ) - query_dense = embedder.embed_to_numpy([query])[0] - except Exception: - query_dense = None - - if query_dense is None: - from 
codexlens.semantic.embedder import get_embedder - - embedder = get_embedder(profile=str(profile), use_gpu=use_gpu) - query_dense = embedder.embed_to_numpy([query])[0] - - if query_cache is not None: - query_cache[cache_key] = query_dense - - return query_dense - - def _collect_binary_coarse_candidates( - self, - query: str, - index_paths: List[Path], - coarse_k: int, - stats: SearchStats, - *, - index_root: Optional[Path] = None, - allow_dense_fallback: bool = False, - ) -> Tuple[List[Tuple[int, float, Path]], bool, bool, Optional[Path]]: - """Collect coarse candidates from centralized/legacy binary indexes.""" - - try: - from codexlens.indexing.embedding import BinaryEmbeddingBackend - except ImportError as exc: - self.logger.warning( - "BinaryEmbeddingBackend not available: %s", exc - ) - return [], False, False, None - - requested_index_root = ( - Path(index_root).resolve() - if index_root is not None - else (index_paths[0].parent if index_paths else None) - ) - coarse_candidates: List[Tuple[int, float, Path]] = [] - used_centralized = False - using_dense_fallback = False - dense_query_cache: Dict[Tuple[str, str, bool], "np.ndarray"] = {} - binary_roots_with_hits: set[Path] = set() - stage2_index_root: Optional[Path] = None - - binary_root_groups, _ = self._group_index_paths_by_binary_root( - index_paths, - preferred_root=requested_index_root, - ) - for binary_root in binary_root_groups: - binary_searcher = self._get_centralized_binary_searcher(binary_root) - if binary_searcher is None: - continue - try: - query_dense = self._embed_query_for_binary_searcher( - query, - binary_searcher=binary_searcher, - query_cache=dense_query_cache, - ) - results = binary_searcher.search(query_dense, top_k=coarse_k) - for chunk_id, distance in results: - coarse_candidates.append((chunk_id, float(distance), binary_root)) - if results: - used_centralized = True - binary_roots_with_hits.add(binary_root) - self.logger.debug( - "Centralized binary search found %d candidates from 
%s", - len(results), - binary_root, - ) - except Exception as exc: - self.logger.debug( - "Centralized binary search failed for %s: %s", - binary_root, - exc, - ) - - if len(binary_roots_with_hits) == 1: - stage2_index_root = next(iter(binary_roots_with_hits)) - - if not used_centralized: - has_legacy_binary_vectors = any( - (p.parent / f"{p.stem}_binary_vectors.bin").exists() for p in index_paths - ) - if has_legacy_binary_vectors: - use_gpu = True - if self._config is not None: - use_gpu = getattr(self._config, "embedding_use_gpu", True) - - query_binary = None - try: - binary_backend = BinaryEmbeddingBackend(use_gpu=use_gpu) - query_binary = binary_backend.embed_packed([query])[0] - except Exception as exc: - self.logger.warning(f"Failed to generate binary query embedding: {exc}") - query_binary = None - - if query_binary is not None: - for index_path in index_paths: - try: - binary_index = self._get_or_create_binary_index(index_path) - if binary_index is None or binary_index.count() == 0: - continue - ids, distances = binary_index.search(query_binary, coarse_k) - for chunk_id, dist in zip(ids, distances): - coarse_candidates.append((chunk_id, float(dist), index_path)) - except Exception as exc: - self.logger.debug( - "Binary search failed for %s: %s", index_path, exc - ) - stats.errors.append( - f"Binary search failed for {index_path}: {exc}" - ) - else: - self.logger.debug( - "No legacy binary vector files found; skipping legacy binary search fallback" - ) - - if not coarse_candidates and allow_dense_fallback: - dense_candidates: List[Tuple[int, float, Path]] = [] - dense_roots_with_hits: set[Path] = set() - try: - from codexlens.semantic.ann_index import ANNIndex - - dense_root_groups, dense_fallback_index_paths = self._group_index_paths_by_dense_root(index_paths) - dense_candidate_groups: List[List[Tuple[int, float, Path]]] = [] - dense_roots_by_settings = self._group_dense_roots_by_embedding_settings( - dense_root_groups - ) - if 
len(dense_roots_by_settings) > 1: - self.logger.debug( - "Stage 1 dense fallback detected %d embedding setting groups; interleaving candidates across groups", - len(dense_roots_by_settings), - ) - for dense_roots in dense_roots_by_settings.values(): - group_candidates: List[Tuple[int, float, Path]] = [] - for dense_root in dense_roots: - try: - query_dense = self._embed_dense_query( - query, - index_root=dense_root, - query_cache=dense_query_cache, - ) - ann_index = self._get_cached_centralized_dense_index( - dense_root, - int(query_dense.shape[0]), - ) - if ann_index is None: - continue - ids, distances = ann_index.search(query_dense, top_k=coarse_k) - for chunk_id, dist in zip(ids, distances): - group_candidates.append((chunk_id, float(dist), dense_root)) - if ids: - dense_roots_with_hits.add(dense_root) - self.logger.debug( - "Stage 1 centralized dense fallback: %d candidates from %s", - len(ids), - dense_root, - ) - except Exception as exc: - self.logger.debug( - "Dense coarse search failed for %s: %s", - dense_root, - exc, - ) - if group_candidates: - dense_candidate_groups.append(group_candidates) - - dense_candidates = self._interleave_dense_candidate_groups( - dense_candidate_groups, - coarse_k, - ) - - fallback_index_paths = dense_fallback_index_paths if dense_root_groups else index_paths - if not dense_candidates: - fallback_candidate_groups: List[List[Tuple[int, float, Path]]] = [] - fallback_index_groups = self._group_dense_index_paths_by_embedding_settings( - fallback_index_paths - ) - if len(fallback_index_groups) > 1: - self.logger.debug( - "Stage 1 legacy dense fallback detected %d embedding setting groups; interleaving candidates across groups", - len(fallback_index_groups), - ) - for grouped_index_paths in fallback_index_groups.values(): - group_candidates = [] - for index_path in grouped_index_paths: - try: - query_dense = self._embed_dense_query( - query, - index_root=index_path.parent, - query_cache=dense_query_cache, - ) - ann_index = 
self._get_cached_legacy_dense_index( - index_path, - int(query_dense.shape[0]), - ) - if ann_index is None: - continue - ids, distances = ann_index.search(query_dense, top_k=coarse_k) - for chunk_id, dist in zip(ids, distances): - group_candidates.append((chunk_id, float(dist), index_path)) - except Exception as exc: - self.logger.debug( - "Dense coarse search failed for %s: %s", index_path, exc - ) - if group_candidates: - fallback_candidate_groups.append(group_candidates) - - dense_candidates = self._interleave_dense_candidate_groups( - fallback_candidate_groups, - coarse_k, - ) - except Exception as exc: - self.logger.debug("Dense coarse search fallback unavailable: %s", exc) - dense_candidates = [] - - if dense_candidates: - if stage2_index_root is None and len(dense_roots_with_hits) == 1: - stage2_index_root = next(iter(dense_roots_with_hits)) - coarse_candidates = dense_candidates - using_dense_fallback = True - - if coarse_candidates: - if using_dense_fallback: - coarse_candidates = coarse_candidates[:coarse_k] - else: - coarse_candidates.sort(key=lambda x: x[1]) - coarse_candidates = coarse_candidates[:coarse_k] - - return coarse_candidates, used_centralized, using_dense_fallback, stage2_index_root - - def _materialize_binary_candidates( - self, - coarse_candidates: List[Tuple[int, float, Path]], - stats: SearchStats, - *, - stage2_index_root: Optional[Path] = None, - using_dense_fallback: bool = False, - ) -> List[SearchResult]: - """Fetch chunk payloads for coarse binary/dense-fallback candidates.""" - - if not coarse_candidates: - return [] - - coarse_results: List[Tuple[int, SearchResult]] = [] - candidates_by_index: Dict[Path, List[int]] = {} - candidate_order: Dict[Tuple[Path, int], int] = {} - for chunk_id, _, idx_path in coarse_candidates: - if idx_path not in candidates_by_index: - candidates_by_index[idx_path] = [] - candidates_by_index[idx_path].append(chunk_id) - candidate_order.setdefault((idx_path, int(chunk_id)), len(candidate_order)) - - 
import sqlite3 - - central_meta_store = None - central_meta_path = stage2_index_root / VECTORS_META_DB_NAME if stage2_index_root else None - if central_meta_path and central_meta_path.exists(): - central_meta_store = VectorMetadataStore(central_meta_path) - - for idx_path, chunk_ids in candidates_by_index.items(): - try: - chunks_data = [] - if central_meta_store is not None and stage2_index_root is not None and idx_path == stage2_index_root: - chunks_data = central_meta_store.get_chunks_by_ids(chunk_ids) - - if not chunks_data and idx_path.name != "_index.db": - meta_db_path = idx_path / VECTORS_META_DB_NAME - if meta_db_path.exists(): - meta_store = VectorMetadataStore(meta_db_path) - chunks_data = meta_store.get_chunks_by_ids(chunk_ids) - - if not chunks_data: - try: - conn = sqlite3.connect(str(idx_path)) - conn.row_factory = sqlite3.Row - placeholders = ",".join("?" * len(chunk_ids)) - cursor = conn.execute( - f""" - SELECT id, file_path, content, metadata, category - FROM semantic_chunks - WHERE id IN ({placeholders}) - """, - chunk_ids, - ) - chunks_data = [ - { - "id": row["id"], - "file_path": row["file_path"], - "content": row["content"], - "metadata": row["metadata"], - "category": row["category"], - } - for row in cursor.fetchall() - ] - conn.close() - except Exception: - chunks_data = [] - - for chunk in chunks_data: - chunk_id = chunk.get("id") or chunk.get("chunk_id") - distance = next( - ( - d - for cid, d, candidate_idx_path in coarse_candidates - if cid == chunk_id and candidate_idx_path == idx_path - ), - 256, - ) - if using_dense_fallback: - score = max(0.0, 1.0 - float(distance)) - else: - score = 1.0 - (float(distance) / 256.0) - - content = chunk.get("content", "") - metadata = chunk.get("metadata") - symbol_name = None - symbol_kind = None - start_line = chunk.get("start_line") - end_line = chunk.get("end_line") - if metadata: - try: - meta_dict = json.loads(metadata) if isinstance(metadata, str) else metadata - symbol_name = 
meta_dict.get("symbol_name") - symbol_kind = meta_dict.get("symbol_kind") - start_line = meta_dict.get("start_line", start_line) - end_line = meta_dict.get("end_line", end_line) - except Exception: - pass - - coarse_results.append( - ( - candidate_order.get((idx_path, int(chunk_id)), len(candidate_order)), - SearchResult( - path=chunk.get("file_path", ""), - score=float(score), - excerpt=content[:500] if content else "", - content=content, - symbol_name=symbol_name, - symbol_kind=symbol_kind, - start_line=start_line, - end_line=end_line, - ), - ) - ) - except Exception as exc: - self.logger.debug( - "Failed to retrieve chunks from %s: %s", idx_path, exc - ) - stats.errors.append( - f"Stage 1 chunk retrieval failed for {idx_path}: {exc}" - ) - - coarse_results.sort(key=lambda item: item[0]) - return [result for _, result in coarse_results] - - def _compute_cosine_similarity( - self, - query_vec: "np.ndarray", - doc_vec: "np.ndarray", - ) -> float: - """Compute cosine similarity between query and document vectors. - - Args: - query_vec: Query embedding vector - doc_vec: Document embedding vector - - Returns: - Cosine similarity score in range [-1, 1] - """ - if not NUMPY_AVAILABLE: - return 0.0 - - # Ensure same shape - min_len = min(len(query_vec), len(doc_vec)) - q = query_vec[:min_len] - d = doc_vec[:min_len] - - # Compute cosine similarity - dot_product = np.dot(q, d) - norm_q = np.linalg.norm(q) - norm_d = np.linalg.norm(d) - - if norm_q == 0 or norm_d == 0: - return 0.0 - - return float(dot_product / (norm_q * norm_d)) - - def _compute_cosine_similarity_batch( - self, - query_vec: "np.ndarray", - doc_matrix: "np.ndarray", - ) -> "np.ndarray": - """Compute cosine similarity between query and multiple document vectors. - - Uses vectorized matrix operations for efficient batch computation. 
- - Args: - query_vec: Query embedding vector of shape (dim,) - doc_matrix: Document embeddings matrix of shape (n_docs, dim) - - Returns: - Array of cosine similarity scores of shape (n_docs,) - """ - if not NUMPY_AVAILABLE: - return np.zeros(doc_matrix.shape[0]) - - # Ensure query is 1D - if query_vec.ndim > 1: - query_vec = query_vec.flatten() - - # Handle dimension mismatch by truncating to smaller dimension - min_dim = min(len(query_vec), doc_matrix.shape[1]) - q = query_vec[:min_dim] - docs = doc_matrix[:, :min_dim] - - # Compute query norm once - norm_q = np.linalg.norm(q) - if norm_q == 0: - return np.zeros(docs.shape[0]) - - # Normalize query - q_normalized = q / norm_q - - # Compute document norms (vectorized) - doc_norms = np.linalg.norm(docs, axis=1) - - # Avoid division by zero - nonzero_mask = doc_norms > 0 - scores = np.zeros(docs.shape[0], dtype=np.float32) - - if np.any(nonzero_mask): - # Normalize documents with non-zero norms - docs_normalized = docs[nonzero_mask] / doc_norms[nonzero_mask, np.newaxis] - - # Batch dot product: (n_docs, dim) @ (dim,) = (n_docs,) - scores[nonzero_mask] = docs_normalized @ q_normalized - - return scores - - def _build_results_from_candidates( - self, - candidates: List[Tuple[int, int, Path]], - index_paths: List[Path], - stats: SearchStats, - query: str, - start_time: float, - use_centralized: bool = False, - ) -> ChainSearchResult: - """Build ChainSearchResult from binary candidates using Hamming distance scores. - - Used as fallback when dense embeddings are not available. 
- - Args: - candidates: List of (chunk_id, hamming_distance, index_path) tuples - index_paths: List of all searched index paths - stats: SearchStats to update - query: Original query string - start_time: Search start time for timing - use_centralized: If True, index_path is the index_root directory - and VectorMetadataStore should be used instead of SQLiteStore - - Returns: - ChainSearchResult with results scored by Hamming distance - """ - results: List[SearchResult] = [] - - # Group by index path - candidates_by_index: Dict[Path, List[Tuple[int, int]]] = {} - for chunk_id, distance, index_path in candidates: - if index_path not in candidates_by_index: - candidates_by_index[index_path] = [] - candidates_by_index[index_path].append((chunk_id, distance)) - - for index_path, chunk_tuples in candidates_by_index.items(): - try: - chunk_ids = [c[0] for c in chunk_tuples] - - # Use VectorMetadataStore for centralized search, SQLiteStore for per-directory - if use_centralized: - # index_path is actually index_root directory for centralized search - meta_db_path = index_path / VECTORS_META_DB_NAME - if not meta_db_path.exists(): - self.logger.debug( - "VectorMetadataStore not found at %s, skipping", meta_db_path - ) - continue - meta_store = VectorMetadataStore(meta_db_path) - chunks_data = meta_store.get_chunks_by_ids(chunk_ids) - else: - store = SQLiteStore(index_path) - chunks_data = store.get_chunks_by_ids(chunk_ids) - - chunk_content: Dict[int, Dict[str, Any]] = { - c["id"]: c for c in chunks_data - } - - for chunk_id, distance in chunk_tuples: - chunk_info = chunk_content.get(chunk_id) - if chunk_info is None: - continue - - # Convert Hamming distance to score (lower distance = higher score) - # Max Hamming distance for 256-bit is 256 - score = 1.0 - (distance / 256.0) - - excerpt = chunk_info.get("content", "")[:500] - result = SearchResult( - path=chunk_info.get("file_path", ""), - score=float(score), - excerpt=excerpt, - ) - results.append(result) - - except 
Exception as exc: - self.logger.debug( - "Failed to build results from %s: %s", index_path, exc - ) - - # Deduplicate by path - path_to_result: Dict[str, SearchResult] = {} - for result in results: - if result.path not in path_to_result or result.score > path_to_result[result.path].score: - path_to_result[result.path] = result - - final_results = sorted( - path_to_result.values(), - key=lambda r: r.score, - reverse=True, - ) - - stats.files_matched = len(final_results) - stats.time_ms = (time.time() - start_time) * 1000 - - return ChainSearchResult( - query=query, - results=final_results, - symbols=[], - stats=stats, - ) - - def _cross_encoder_rerank( - self, - query: str, - results: List[SearchResult], - top_k: int, - ) -> List[SearchResult]: - """Rerank results using cross-encoder model. - - Args: - query: Search query string - results: Candidate results to rerank - top_k: Number of top results to return - - Returns: - Reranked results sorted by cross-encoder score - """ - if not results: - return [] - - # Collapse duplicate chunks from the same file before reranking. - # Otherwise, untouched tail chunks can overwrite reranked chunks for the - # same path during the later path-level deduplication step. 
- path_to_result: Dict[str, SearchResult] = {} - for result in results: - path = result.path - if path not in path_to_result or result.score > path_to_result[path].score: - path_to_result[path] = result - if len(path_to_result) != len(results): - self.logger.debug( - "Deduplicated rerank candidates by path: %d -> %d", - len(results), - len(path_to_result), - ) - results = sorted( - path_to_result.values(), - key=lambda item: float(item.score), - reverse=True, - ) - - reranker = self._get_cached_reranker() - if reranker is None: - return results[:top_k] - - # Use cross_encoder_rerank from ranking module - from codexlens.search.ranking import cross_encoder_rerank - - # Get chunk_type weights and test_file_penalty from config - chunk_type_weights = None - test_file_penalty = 0.0 - - if self._config is not None: - chunk_type_weights = getattr(self._config, "reranker_chunk_type_weights", None) - test_file_penalty = getattr(self._config, "reranker_test_file_penalty", 0.0) - - return cross_encoder_rerank( - query=query, - results=results, - reranker=reranker, - top_k=top_k, - batch_size=32, - chunk_type_weights=chunk_type_weights, - test_file_penalty=test_file_penalty, - ) - - def search_files_only(self, query: str, - source_path: Path, - options: Optional[SearchOptions] = None) -> List[str]: - """Search and return only matching file paths. - - Faster than full search when excerpts are not needed. 
- - Args: - query: FTS5 search query string - source_path: Starting directory path - options: Search configuration (uses defaults if None) - - Returns: - List of file paths as strings - - Examples: - >>> engine = ChainSearchEngine(registry, mapper) - >>> paths = engine.search_files_only("TODO", Path("D:/project")) - >>> print(f"Found {len(paths)} files with TODOs") - """ - options = options or SearchOptions() - options.files_only = True - - result = self.search(query, source_path, options) - return [r.path for r in result.results] - - def search_symbols(self, name: str, - source_path: Path, - kind: Optional[str] = None, - options: Optional[SearchOptions] = None) -> List[Symbol]: - """Chain symbol search across directory hierarchy. - - Args: - name: Symbol name pattern (partial match supported) - source_path: Starting directory path - kind: Optional symbol kind filter (e.g., 'function', 'class') - options: Search configuration (uses defaults if None) - - Returns: - List of Symbol objects sorted by name - - Examples: - >>> engine = ChainSearchEngine(registry, mapper) - >>> funcs = engine.search_symbols("init", Path("D:/project"), kind="function") - >>> for sym in funcs[:10]: - ... print(f"{sym.name} ({sym.kind}): lines {sym.range}") - """ - options = options or SearchOptions() - - start_index = self._find_start_index(source_path) - if not start_index: - self.logger.warning(f"No index found for {source_path}") - return [] - - # Fast path: project-wide global symbol index (avoids chain traversal). - if self._config is None or getattr(self._config, "global_symbol_index_enabled", True): - try: - # Avoid relying on index_to_source() here; use the same logic as _find_start_index - # to determine the effective search root directory. 
- search_root = source_path.resolve() - exact_index = self.mapper.source_to_index_db(search_root) - if not exact_index.exists(): - nearest = self.registry.find_nearest_index(search_root) - if nearest: - search_root = nearest.source_path - - project = self.registry.find_by_source_path(str(search_root)) - if project: - global_db_path = Path(project["index_root"]) / GlobalSymbolIndex.DEFAULT_DB_NAME - if global_db_path.exists(): - query_limit = max(int(options.total_limit) * 10, int(options.total_limit)) - with GlobalSymbolIndex(global_db_path, project_id=int(project["id"])) as global_index: - candidates = global_index.search(name=name, kind=kind, limit=query_limit) - - # Apply depth constraint relative to the start index directory. - filtered: List[Symbol] = [] - for sym in candidates: - if not sym.file: - continue - try: - root_str = str(search_root) - file_dir_str = str(Path(sym.file).parent) - - # Normalize Windows long-path prefix (\\?\) if present. - if root_str.startswith("\\\\?\\"): - root_str = root_str[4:] - if file_dir_str.startswith("\\\\?\\"): - file_dir_str = file_dir_str[4:] - - root_cmp = root_str.lower().rstrip("\\/") - dir_cmp = file_dir_str.lower().rstrip("\\/") - - # Guard against Windows cross-drive comparisons (ValueError). - if os.name == "nt": - root_drive, _ = os.path.splitdrive(root_cmp) - dir_drive, _ = os.path.splitdrive(dir_cmp) - if root_drive and dir_drive and root_drive != dir_drive: - self.logger.debug( - "Skipping symbol due to cross-drive path (root=%s file=%s name=%s)", - root_cmp, - sym.file, - sym.name, - ) - continue - - if os.path.commonpath([root_cmp, dir_cmp]) != root_cmp: - continue - - rel = os.path.relpath(dir_cmp, root_cmp) - rel_depth = 0 if rel == "." 
else len(rel.split(os.sep)) - except ValueError as exc: - self.logger.debug( - "Skipping symbol due to path operation failure (root=%s file=%s name=%s): %s", - str(search_root), - sym.file, - sym.name, - exc, - ) - continue - except Exception as exc: - self.logger.debug( - "Skipping symbol due to unexpected path error (root=%s file=%s name=%s): %s", - str(search_root), - sym.file, - sym.name, - exc, - ) - continue - - if options.depth >= 0 and rel_depth > options.depth: - continue - filtered.append(sym) - - if filtered: - # Match existing semantics: dedupe by (name, kind, range), sort by name. - seen = set() - unique_symbols: List[Symbol] = [] - for sym in filtered: - key = (sym.name, sym.kind, sym.range) - if key in seen: - continue - seen.add(key) - unique_symbols.append(sym) - unique_symbols.sort(key=lambda s: s.name) - return unique_symbols[: options.total_limit] - except Exception as exc: - self.logger.debug("Global symbol index fast path failed: %s", exc) - - index_paths = self._collect_index_paths(start_index, options.depth) - if not index_paths: - return [] - - return self._search_symbols_parallel( - index_paths, name, kind, options.total_limit - ) - - def search_references( - self, - symbol_name: str, - source_path: Optional[Path] = None, - depth: int = -1, - limit: int = 100, - ) -> List[ReferenceResult]: - """Find all references to a symbol across the project. - - Searches the code_relationships table in all index databases to find - where the given symbol is referenced (called, imported, inherited, etc.). 
- - Args: - symbol_name: Fully qualified or simple name of the symbol to find references to - source_path: Starting path for search (default: workspace root from registry) - depth: Search depth (-1 = unlimited, 0 = current dir only) - limit: Maximum results to return (default 100) - - Returns: - List of ReferenceResult objects sorted by file path and line number - - Examples: - >>> engine = ChainSearchEngine(registry, mapper) - >>> refs = engine.search_references("authenticate", Path("D:/project/src")) - >>> for ref in refs[:10]: - ... print(f"{ref.file_path}:{ref.line} ({ref.relationship_type})") - """ - import sqlite3 - from concurrent.futures import as_completed - - # Determine starting path - if source_path is None: - # Try to get workspace root from registry - mappings = self.registry.list_mappings() - if mappings: - source_path = Path(mappings[0].source_path) - else: - self.logger.warning("No source path provided and no mappings in registry") - return [] - - # Find starting index - start_index = self._find_start_index(source_path) - if not start_index: - self.logger.warning(f"No index found for {source_path}") - return [] - - # Collect all index paths - index_paths = self._collect_index_paths(start_index, depth) - if not index_paths: - self.logger.debug(f"No indexes collected from {start_index}") - return [] - - self.logger.debug( - "Searching %d indexes for references to '%s'", - len(index_paths), symbol_name - ) - - # Search in parallel - all_results: List[ReferenceResult] = [] - executor = self._get_executor() - - def search_single_index(index_path: Path) -> List[ReferenceResult]: - """Search a single index for references.""" - results: List[ReferenceResult] = [] - try: - conn = sqlite3.connect(str(index_path), check_same_thread=False) - conn.row_factory = sqlite3.Row - - # Query code_relationships for references to this symbol - # Match either target_qualified_name containing the symbol name - # or an exact match on the last component - # Try full_path 
first (new schema), fallback to path (old schema) - try: - rows = conn.execute( - """ - SELECT DISTINCT - f.full_path as source_file, - cr.source_line, - cr.relationship_type, - f.content - FROM code_relationships cr - JOIN symbols s ON s.id = cr.source_symbol_id - JOIN files f ON f.id = s.file_id - WHERE cr.target_qualified_name LIKE ? - OR cr.target_qualified_name LIKE ? - OR cr.target_qualified_name = ? - ORDER BY f.full_path, cr.source_line - LIMIT ? - """, - ( - f"%{symbol_name}", # Ends with symbol name - f"%.{symbol_name}", # Qualified name ending with .symbol_name - symbol_name, # Exact match - limit, - ) - ).fetchall() - except sqlite3.OperationalError: - # Fallback for old schema with 'path' column - rows = conn.execute( - """ - SELECT DISTINCT - f.path as source_file, - cr.source_line, - cr.relationship_type, - f.content - FROM code_relationships cr - JOIN symbols s ON s.id = cr.source_symbol_id - JOIN files f ON f.id = s.file_id - WHERE cr.target_qualified_name LIKE ? - OR cr.target_qualified_name LIKE ? - OR cr.target_qualified_name = ? - ORDER BY f.path, cr.source_line - LIMIT ? 
- """, - ( - f"%{symbol_name}", # Ends with symbol name - f"%.{symbol_name}", # Qualified name ending with .symbol_name - symbol_name, # Exact match - limit, - ) - ).fetchall() - - for row in rows: - file_path = row["source_file"] - line = row["source_line"] or 1 - rel_type = row["relationship_type"] - content = row["content"] or "" - - # Extract context (3 lines around reference) - context = self._extract_context(content, line, context_lines=3) - - results.append(ReferenceResult( - file_path=file_path, - line=line, - column=0, # Column info not stored in code_relationships - context=context, - relationship_type=rel_type, - )) - - conn.close() - except sqlite3.DatabaseError as exc: - self.logger.debug( - "Failed to search references in %s: %s", index_path, exc - ) - except Exception as exc: - self.logger.debug( - "Unexpected error searching references in %s: %s", index_path, exc - ) - - return results - - # Submit parallel searches - futures = { - executor.submit(search_single_index, idx_path): idx_path - for idx_path in index_paths - } - - for future in as_completed(futures): - try: - results = future.result() - all_results.extend(results) - except Exception as exc: - idx_path = futures[future] - self.logger.debug( - "Reference search failed for %s: %s", idx_path, exc - ) - - # Deduplicate by (file_path, line) - seen: set = set() - unique_results: List[ReferenceResult] = [] - for ref in all_results: - key = (ref.file_path, ref.line) - if key not in seen: - seen.add(key) - unique_results.append(ref) - - # Sort by file path and line - unique_results.sort(key=lambda r: (r.file_path, r.line)) - - # Apply limit - return unique_results[:limit] - - def _extract_context( - self, - content: str, - line: int, - context_lines: int = 3 - ) -> str: - """Extract lines around a given line number from file content. 
- - Args: - content: Full file content - line: Target line number (1-based) - context_lines: Number of lines to include before and after - - Returns: - Context snippet as a string - """ - if not content: - return "" - - lines = content.splitlines() - total_lines = len(lines) - - if line < 1 or line > total_lines: - return "" - - # Calculate range (0-indexed internally) - start = max(0, line - 1 - context_lines) - end = min(total_lines, line + context_lines) - - context = lines[start:end] - return "\n".join(context) - - # === Internal Methods === - - def _find_start_index(self, source_path: Path) -> Optional[Path]: - """Find index database path for source directory. - - Attempts exact match first, then searches for nearest ancestor index. - - Args: - source_path: Source directory path - - Returns: - Path to _index.db file, or None if not found - """ - source_path = source_path.resolve() - - # Try exact match first - exact_index = self.mapper.source_to_index_db(source_path) - if exact_index.exists(): - self.logger.debug(f"Found exact index: {exact_index}") - return exact_index - - # Try nearest ancestor via registry - nearest = self.registry.find_nearest_index(source_path) - if nearest: - self.logger.debug(f"Found nearest index: {nearest.index_path}") - return nearest.index_path - - self.logger.warning(f"No index found for {source_path}") - return None - - def _collect_index_paths(self, start_index: Path, - depth: int) -> List[Path]: - """Recursively collect all subdirectory index paths. - - Traverses directory tree via subdirs table in each _index.db, - respecting depth limit. 
- - Args: - start_index: Starting _index.db path - depth: Maximum depth (-1 = unlimited, 0 = current only) - - Returns: - List of _index.db paths to search - """ - collected = [] - visited = set() - scan_root = start_index.parent.resolve() - try: - scan_source_root = self.mapper.index_to_source(start_index) - except Exception: - scan_source_root = None - - def _collect_recursive(index_path: Path, current_depth: int): - # Normalize path to avoid duplicates - normalized = index_path.resolve() - if normalized in visited: - return - visited.add(normalized) - - if is_ignored_index_path(normalized, scan_root): - self.logger.debug("Skipping ignored artifact index subtree: %s", normalized) - return - - # Add current index - if normalized.exists(): - collected.append(normalized) - else: - self.logger.debug(f"Index does not exist: {normalized}") - return - - # Check depth limit - if depth >= 0 and current_depth >= depth: - return - - # Read subdirs and recurse - try: - with DirIndexStore(normalized) as store: - subdirs = store.get_subdirs() - for subdir in subdirs: - _collect_recursive(subdir.index_path, current_depth + 1) - except Exception as exc: - self.logger.warning(f"Failed to read subdirs from {normalized}: {exc}") - - _collect_recursive(start_index, 0) - - if scan_source_root is not None: - try: - descendant_roots = self.registry.find_descendant_project_roots( - scan_source_root - ) - except Exception as exc: - descendant_roots = [] - self.logger.debug( - "Failed to query descendant project roots for %s: %s", - scan_source_root, - exc, - ) - - for mapping in descendant_roots: - try: - relative_depth = len( - mapping.source_path.resolve().relative_to( - scan_source_root.resolve() - ).parts - ) - except ValueError: - continue - if depth >= 0 and relative_depth > depth: - continue - _collect_recursive(mapping.index_path, relative_depth) - - self.logger.info(f"Collected {len(collected)} indexes (depth={depth})") - return collected - - def _search_parallel(self, 
index_paths: List[Path], - query: str, - options: SearchOptions) -> tuple[List[SearchResult], SearchStats]: - """Search multiple indexes in parallel using shared ThreadPoolExecutor. - - Args: - index_paths: List of _index.db paths to search - query: FTS5 query string - options: Search configuration - - Returns: - Tuple of (all results, search statistics) - """ - all_results = [] - stats = SearchStats() - - # Force single-threaded execution for vector/hybrid search to avoid GPU crashes - # DirectML/ONNX have threading issues when multiple threads access GPU resources - effective_workers = options.max_workers - if options.enable_vector or options.hybrid_mode: - effective_workers = 1 - self.logger.debug("Using single-threaded mode for vector search (GPU safety)") - # Pre-load embedder to avoid initialization overhead per-search - try: - from codexlens.semantic.factory import get_embedder as get_embedder_factory - - embedding_backend = "fastembed" - embedding_model = "code" - use_gpu = True - if self._config is not None: - embedding_backend = getattr(self._config, "embedding_backend", embedding_backend) or embedding_backend - embedding_model = getattr(self._config, "embedding_model", embedding_model) or embedding_model - use_gpu = bool(getattr(self._config, "embedding_use_gpu", use_gpu)) - - if embedding_backend == "litellm": - get_embedder_factory(backend="litellm", model=embedding_model) - else: - get_embedder_factory(backend="fastembed", profile=embedding_model, use_gpu=use_gpu) - except Exception: - pass # Ignore pre-load failures - - shared_hybrid_engine = None - if options.hybrid_mode: - shared_hybrid_engine = HybridSearchEngine( - weights=options.hybrid_weights, - config=self._config, - ) - - executor = self._get_executor(effective_workers) - # Submit all search tasks - future_to_path = { - executor.submit( - self._search_single_index, - idx_path, - query, - options.limit_per_dir, - options.files_only, - options.include_semantic, - options.hybrid_mode, - 
options.enable_fuzzy, - options.enable_vector, - options.pure_vector, - options.hybrid_weights, - shared_hybrid_engine, - ): idx_path - for idx_path in index_paths - } - - # Collect results as they complete - for future in as_completed(future_to_path): - idx_path = future_to_path[future] - try: - results = future.result() - all_results.extend(results) - self.logger.debug(f"Got {len(results)} results from {idx_path.parent.name}") - except Exception as exc: - error_msg = f"Search failed for {idx_path}: {exc}" - self.logger.error(error_msg) - stats.errors.append(error_msg) - - return all_results, stats - - def _search_single_index(self, index_path: Path, - query: str, - limit: int, - files_only: bool = False, - include_semantic: bool = False, - hybrid_mode: bool = False, - enable_fuzzy: bool = True, - enable_vector: bool = False, - pure_vector: bool = False, - hybrid_weights: Optional[Dict[str, float]] = None, - hybrid_engine: Optional[HybridSearchEngine] = None) -> List[SearchResult]: - """Search a single index database. - - Handles exceptions gracefully, returning empty list on failure. 
- - Args: - index_path: Path to _index.db file - query: FTS5 query string (for FTS) or natural language query (for vector) - limit: Maximum results from this index - files_only: If True, skip snippet generation for faster search - include_semantic: If True, also search semantic keywords and merge results - hybrid_mode: If True, use hybrid search with RRF fusion - enable_fuzzy: Enable fuzzy FTS in hybrid mode - enable_vector: Enable vector semantic search - pure_vector: If True, only use vector search without FTS fallback - hybrid_weights: Custom RRF weights for hybrid search - - Returns: - List of SearchResult objects (empty on error) - """ - try: - # Use hybrid search if enabled - if hybrid_mode: - engine = hybrid_engine or HybridSearchEngine( - weights=hybrid_weights, - config=self._config, - ) - fts_results = engine.search( - index_path, - query, - limit=limit, - enable_fuzzy=enable_fuzzy, - enable_vector=enable_vector, - pure_vector=pure_vector, - ) - else: - # Single-FTS search (exact or fuzzy mode) - with DirIndexStore(index_path) as store: - # Get FTS results - if files_only: - # Fast path: return paths only without snippets - paths = store.search_files_only(query, limit=limit) - fts_results = [SearchResult(path=p, score=0.0, excerpt="") for p in paths] - else: - # Use fuzzy FTS if enable_fuzzy=True (mode="fuzzy"), otherwise exact FTS - if enable_fuzzy: - fts_results = store.search_fts_fuzzy( - query, limit=limit, return_full_content=True - ) - else: - fts_results = store.search_fts_exact( - query, limit=limit, return_full_content=True - ) - - # Optionally add semantic keyword results - if include_semantic: - try: - semantic_matches = store.search_semantic_keywords(query) - # Convert semantic matches to SearchResult with 0.8x weight - for file_entry, keywords in semantic_matches: - # Create excerpt from keywords - excerpt = f"Keywords: {', '.join(keywords[:5])}" - # Use a base score of 10.0 for semantic matches, weighted by 0.8 - semantic_result = 
SearchResult( - path=str(file_entry.full_path), - score=10.0 * 0.8, - excerpt=excerpt - ) - fts_results.append(semantic_result) - except Exception as sem_exc: - self.logger.debug(f"Semantic search error in {index_path}: {sem_exc}") - - return fts_results - except Exception as exc: - self.logger.debug(f"Search error in {index_path}: {exc}") - return [] - - def _filter_by_extension(self, results: List[SearchResult], - code_only: bool = False, - exclude_extensions: Optional[List[str]] = None) -> List[SearchResult]: - """Filter search results by file extension. - - Args: - results: Search results to filter - code_only: If True, exclude non-code files (md, txt, json, yaml, xml, etc.) - exclude_extensions: List of extensions to exclude (e.g., ["md", "txt"]) - - Returns: - Filtered results - """ - # Non-code file extensions (same as MCP tool smart-search.ts) - NON_CODE_EXTENSIONS = { - 'md', 'txt', 'json', 'yaml', 'yml', 'xml', 'csv', 'log', - 'ini', 'cfg', 'conf', 'toml', 'env', 'properties', - 'html', 'htm', 'svg', 'png', 'jpg', 'jpeg', 'gif', 'ico', 'webp', - 'pdf', 'doc', 'docx', 'xls', 'xlsx', 'ppt', 'pptx', - 'lock', 'sum', 'mod', - } - - # Build exclusion set - excluded_exts = set() - if exclude_extensions: - # Normalize extensions (remove leading dots, lowercase) - excluded_exts = {ext.lower().lstrip('.') for ext in exclude_extensions} - if code_only: - excluded_exts.update(NON_CODE_EXTENSIONS) - - if not excluded_exts: - return results - - # Filter results - filtered = [] - for result in results: - path_str = result.path - if not path_str: - continue - - # Extract extension from path - if '.' in path_str: - ext = path_str.rsplit('.', 1)[-1].lower() - if ext in excluded_exts: - continue # Skip this result - - filtered.append(result) - - return filtered - - def _merge_and_rank(self, results: List[SearchResult], - limit: int, offset: int = 0, query: Optional[str] = None) -> List[SearchResult]: - """Aggregate, deduplicate, and rank results. - - Process: - 1. 
Deduplicate by path (keep highest score) - 2. Sort by score descending - 3. Apply offset and limit for pagination - - Args: - results: Raw results from all indexes - limit: Maximum results to return - offset: Number of results to skip (pagination offset) - - Returns: - Deduplicated and ranked results with pagination - """ - # Deduplicate by path, keeping best score - path_to_result: Dict[str, SearchResult] = {} - for result in results: - path = result.path - if path not in path_to_result or result.score > path_to_result[path].score: - path_to_result[path] = result - - unique_results = list(path_to_result.values()) - if query: - unique_results = self._apply_default_path_penalties(query, unique_results) - else: - unique_results.sort(key=lambda r: r.score, reverse=True) - - # Apply offset and limit for pagination - return unique_results[offset:offset + limit] - - def _apply_default_path_penalties( - self, - query: str, - results: List[SearchResult], - ) -> List[SearchResult]: - """Apply default path penalties for noisy test and generated artifact results.""" - if not results: - return results - - test_penalty = 0.15 - generated_penalty = 0.35 - if self._config is not None: - test_penalty = float( - getattr(self._config, "test_file_penalty", test_penalty) or 0.0 - ) - generated_penalty = float( - getattr( - self._config, - "generated_file_penalty", - generated_penalty, - ) - or 0.0 - ) - if test_penalty <= 0 and generated_penalty <= 0: - return sorted(results, key=lambda r: r.score, reverse=True) - - from codexlens.search.ranking import ( - apply_path_penalties, - rebalance_noisy_results, - ) - - penalized = apply_path_penalties( - results, - query, - test_file_penalty=test_penalty, - generated_file_penalty=generated_penalty, - ) - return rebalance_noisy_results(penalized, query) - - def _resolve_rerank_candidate_limit( - self, - requested_k: int, - candidate_count: int, - ) -> int: - """Return the cross-encoder rerank budget before final trimming.""" - if 
candidate_count <= 0: - return max(1, int(requested_k or 1)) - - rerank_limit = max(1, int(requested_k or 1)) - if self._config is not None: - for attr_name in ("reranker_top_k", "reranking_top_k"): - configured_value = getattr(self._config, attr_name, None) - if isinstance(configured_value, bool): - continue - if isinstance(configured_value, (int, float)): - rerank_limit = max(rerank_limit, int(configured_value)) - - return max(1, min(candidate_count, rerank_limit)) - - def _resolve_stage3_target_count( - self, - requested_k: int, - candidate_count: int, - ) -> int: - """Return the number of Stage 3 representatives to preserve.""" - base_target = max(1, int(requested_k or 1)) * 2 - target_count = base_target - if self._config is not None and getattr( - self._config, - "enable_staged_rerank", - False, - ): - target_count = max( - target_count, - self._resolve_rerank_candidate_limit(requested_k, candidate_count), - ) - - return max(1, min(candidate_count, target_count)) - - def _search_symbols_parallel(self, index_paths: List[Path], - name: str, - kind: Optional[str], - limit: int) -> List[Symbol]: - """Search symbols across multiple indexes in parallel. 
- - Args: - index_paths: List of _index.db paths to search - name: Symbol name pattern - kind: Optional symbol kind filter - limit: Total symbol limit - - Returns: - Deduplicated and sorted symbols - """ - all_symbols = [] - - executor = self._get_executor() - # Submit all symbol search tasks - future_to_path = { - executor.submit( - self._search_symbols_single, - idx_path, - name, - kind - ): idx_path - for idx_path in index_paths - } - - # Collect results - for future in as_completed(future_to_path): - try: - symbols = future.result() - all_symbols.extend(symbols) - except Exception as exc: - self.logger.error(f"Symbol search failed: {exc}") - - # Deduplicate by (name, kind, range) - seen = set() - unique_symbols = [] - for sym in all_symbols: - key = (sym.name, sym.kind, sym.range) - if key not in seen: - seen.add(key) - unique_symbols.append(sym) - - # Sort by name - unique_symbols.sort(key=lambda s: s.name) - - return unique_symbols[:limit] - - def _search_symbols_single(self, index_path: Path, - name: str, - kind: Optional[str]) -> List[Symbol]: - """Search symbols in a single index. - - Args: - index_path: Path to _index.db file - name: Symbol name pattern - kind: Optional symbol kind filter - - Returns: - List of Symbol objects (empty on error) - """ - try: - with DirIndexStore(index_path) as store: - return store.search_symbols(name, kind=kind) - except Exception as exc: - self.logger.debug(f"Symbol search error in {index_path}: {exc}") - return [] - - -# === Convenience Functions === - -def quick_search(query: str, - source_path: Path, - depth: int = -1) -> List[SearchResult]: - """Quick search convenience function with automatic initialization. - - Creates temporary registry and mapper instances for one-off searches. - For repeated searches, create a ChainSearchEngine instance directly. 
- - Args: - query: FTS5 search query string - source_path: Starting directory path - depth: Maximum search depth (-1 = unlimited) - - Returns: - List of SearchResult objects sorted by relevance - - Examples: - >>> from pathlib import Path - >>> results = quick_search("authentication", Path("D:/project/src")) - >>> print(f"Found {len(results)} matches") - """ - registry = RegistryStore() - registry.initialize() - - mapper = PathMapper() - - with ChainSearchEngine(registry, mapper) as engine: - options = SearchOptions(depth=depth) - result = engine.search(query, source_path, options) - - registry.close() - - return result.results diff --git a/codex-lens/src/codexlens/search/clustering/__init__.py b/codex-lens/src/codexlens/search/clustering/__init__.py deleted file mode 100644 index d8161c98..00000000 --- a/codex-lens/src/codexlens/search/clustering/__init__.py +++ /dev/null @@ -1,124 +0,0 @@ -"""Clustering strategies for the staged hybrid search pipeline. - -This module provides extensible clustering infrastructure for grouping -similar search results and selecting representative results. - -Install with: pip install codexlens[clustering] - -Example: - >>> from codexlens.search.clustering import ( - ... CLUSTERING_AVAILABLE, - ... ClusteringConfig, - ... get_strategy, - ... ) - >>> config = ClusteringConfig(min_cluster_size=3) - >>> # Auto-select best available strategy with fallback - >>> strategy = get_strategy("auto", config) - >>> representatives = strategy.fit_predict(embeddings, results) - >>> - >>> # Or explicitly use a specific strategy - >>> if CLUSTERING_AVAILABLE: - ... from codexlens.search.clustering import HDBSCANStrategy - ... strategy = HDBSCANStrategy(config) - ... 
representatives = strategy.fit_predict(embeddings, results) -""" - -from __future__ import annotations - -# Always export base classes and factory (no heavy dependencies) -from .base import BaseClusteringStrategy, ClusteringConfig -from .factory import ( - ClusteringStrategyFactory, - check_clustering_strategy_available, - get_strategy, -) -from .noop_strategy import NoOpStrategy -from .frequency_strategy import FrequencyStrategy, FrequencyConfig - -# Feature flag for clustering availability (hdbscan + sklearn) -CLUSTERING_AVAILABLE = False -HDBSCAN_AVAILABLE = False -DBSCAN_AVAILABLE = False -_import_error: str | None = None - - -def _detect_clustering_available() -> tuple[bool, bool, bool, str | None]: - """Detect if clustering dependencies are available. - - Returns: - Tuple of (all_available, hdbscan_available, dbscan_available, error_message). - """ - hdbscan_ok = False - dbscan_ok = False - - try: - import hdbscan # noqa: F401 - hdbscan_ok = True - except ImportError: - pass - - try: - from sklearn.cluster import DBSCAN # noqa: F401 - dbscan_ok = True - except ImportError: - pass - - all_ok = hdbscan_ok and dbscan_ok - error = None - if not all_ok: - missing = [] - if not hdbscan_ok: - missing.append("hdbscan") - if not dbscan_ok: - missing.append("scikit-learn") - error = f"{', '.join(missing)} not available. Install with: pip install codexlens[clustering]" - - return all_ok, hdbscan_ok, dbscan_ok, error - - -# Initialize on module load -CLUSTERING_AVAILABLE, HDBSCAN_AVAILABLE, DBSCAN_AVAILABLE, _import_error = ( - _detect_clustering_available() -) - - -def check_clustering_available() -> tuple[bool, str | None]: - """Check if all clustering dependencies are available. - - Returns: - Tuple of (is_available, error_message). - error_message is None if available, otherwise contains install instructions. 
- """ - return CLUSTERING_AVAILABLE, _import_error - - -# Conditionally export strategy implementations -__all__ = [ - # Feature flags - "CLUSTERING_AVAILABLE", - "HDBSCAN_AVAILABLE", - "DBSCAN_AVAILABLE", - "check_clustering_available", - # Base classes - "BaseClusteringStrategy", - "ClusteringConfig", - # Factory - "ClusteringStrategyFactory", - "get_strategy", - "check_clustering_strategy_available", - # Always-available strategies - "NoOpStrategy", - "FrequencyStrategy", - "FrequencyConfig", -] - -# Conditionally add strategy classes to __all__ and module namespace -if HDBSCAN_AVAILABLE: - from .hdbscan_strategy import HDBSCANStrategy - - __all__.append("HDBSCANStrategy") - -if DBSCAN_AVAILABLE: - from .dbscan_strategy import DBSCANStrategy - - __all__.append("DBSCANStrategy") diff --git a/codex-lens/src/codexlens/search/clustering/base.py b/codex-lens/src/codexlens/search/clustering/base.py deleted file mode 100644 index 912a4fc6..00000000 --- a/codex-lens/src/codexlens/search/clustering/base.py +++ /dev/null @@ -1,153 +0,0 @@ -"""Base classes for clustering strategies in the hybrid search pipeline. - -This module defines the abstract base class for clustering strategies used -in the staged hybrid search pipeline. Strategies cluster search results -based on their embeddings and select representative results from each cluster. -""" - -from __future__ import annotations - -from abc import ABC, abstractmethod -from dataclasses import dataclass, field -from typing import TYPE_CHECKING, List, Optional - -if TYPE_CHECKING: - import numpy as np - from codexlens.entities import SearchResult - - -@dataclass -class ClusteringConfig: - """Configuration parameters for clustering strategies. - - Attributes: - min_cluster_size: Minimum number of results to form a cluster. - HDBSCAN default is 5, but for search results 2-3 is often better. - min_samples: Number of samples in a neighborhood for a point to be - considered a core point. Lower values allow more clusters. 
- metric: Distance metric for clustering. Common options: - - 'euclidean': Standard L2 distance - - 'cosine': Cosine distance (1 - cosine_similarity) - - 'manhattan': L1 distance - cluster_selection_epsilon: Distance threshold for cluster selection. - Results within this distance may be merged into the same cluster. - allow_single_cluster: If True, allow all results to form one cluster. - Useful when results are very similar. - prediction_data: If True, generate prediction data for new points. - """ - - min_cluster_size: int = 3 - min_samples: int = 2 - metric: str = "cosine" - cluster_selection_epsilon: float = 0.0 - allow_single_cluster: bool = True - prediction_data: bool = False - - def __post_init__(self) -> None: - """Validate configuration parameters.""" - if self.min_cluster_size < 2: - raise ValueError("min_cluster_size must be >= 2") - if self.min_samples < 1: - raise ValueError("min_samples must be >= 1") - if self.metric not in ("euclidean", "cosine", "manhattan"): - raise ValueError(f"metric must be one of: euclidean, cosine, manhattan; got {self.metric}") - if self.cluster_selection_epsilon < 0: - raise ValueError("cluster_selection_epsilon must be >= 0") - - -class BaseClusteringStrategy(ABC): - """Abstract base class for clustering strategies. - - Clustering strategies are used in the staged hybrid search pipeline to - group similar search results and select representative results from each - cluster, reducing redundancy while maintaining diversity. - - Subclasses must implement: - - cluster(): Group results into clusters based on embeddings - - select_representatives(): Choose best result(s) from each cluster - """ - - def __init__(self, config: Optional[ClusteringConfig] = None) -> None: - """Initialize the clustering strategy. - - Args: - config: Clustering configuration. Uses defaults if not provided. 
- """ - self.config = config or ClusteringConfig() - - @abstractmethod - def cluster( - self, - embeddings: "np.ndarray", - results: List["SearchResult"], - ) -> List[List[int]]: - """Cluster search results based on their embeddings. - - Args: - embeddings: NumPy array of shape (n_results, embedding_dim) - containing the embedding vectors for each result. - results: List of SearchResult objects corresponding to embeddings. - Used for additional metadata during clustering. - - Returns: - List of clusters, where each cluster is a list of indices - into the results list. Results not assigned to any cluster - (noise points) should be returned as single-element clusters. - - Example: - >>> strategy = HDBSCANStrategy() - >>> clusters = strategy.cluster(embeddings, results) - >>> # clusters = [[0, 2, 5], [1, 3], [4], [6, 7, 8]] - >>> # Result indices 0, 2, 5 are in cluster 0 - >>> # Result indices 1, 3 are in cluster 1 - >>> # Result index 4 is a noise point (singleton cluster) - >>> # Result indices 6, 7, 8 are in cluster 2 - """ - ... - - @abstractmethod - def select_representatives( - self, - clusters: List[List[int]], - results: List["SearchResult"], - embeddings: Optional["np.ndarray"] = None, - ) -> List["SearchResult"]: - """Select representative results from each cluster. - - This method chooses the best result(s) from each cluster to include - in the final search results. The selection can be based on: - - Highest score within cluster - - Closest to cluster centroid - - Custom selection logic - - Args: - clusters: List of clusters from cluster() method. - results: Original list of SearchResult objects. - embeddings: Optional embeddings array for centroid-based selection. - - Returns: - List of representative SearchResult objects, one or more per cluster, - ordered by relevance (highest score first). - - Example: - >>> representatives = strategy.select_representatives(clusters, results) - >>> # Returns best result from each cluster - """ - ... 
- - def fit_predict( - self, - embeddings: "np.ndarray", - results: List["SearchResult"], - ) -> List["SearchResult"]: - """Convenience method to cluster and select representatives in one call. - - Args: - embeddings: NumPy array of shape (n_results, embedding_dim). - results: List of SearchResult objects. - - Returns: - List of representative SearchResult objects. - """ - clusters = self.cluster(embeddings, results) - return self.select_representatives(clusters, results, embeddings) diff --git a/codex-lens/src/codexlens/search/clustering/dbscan_strategy.py b/codex-lens/src/codexlens/search/clustering/dbscan_strategy.py deleted file mode 100644 index 90588a91..00000000 --- a/codex-lens/src/codexlens/search/clustering/dbscan_strategy.py +++ /dev/null @@ -1,197 +0,0 @@ -"""DBSCAN-based clustering strategy for search results. - -DBSCAN (Density-Based Spatial Clustering of Applications with Noise) -is the fallback clustering strategy when HDBSCAN is not available. -""" - -from __future__ import annotations - -from typing import TYPE_CHECKING, List, Optional - -from .base import BaseClusteringStrategy, ClusteringConfig - -if TYPE_CHECKING: - import numpy as np - from codexlens.entities import SearchResult - - -class DBSCANStrategy(BaseClusteringStrategy): - """DBSCAN-based clustering strategy. - - Uses sklearn's DBSCAN algorithm as a fallback when HDBSCAN is not available. - DBSCAN requires an explicit eps parameter, which is auto-computed from the - distance distribution if not provided. 
- - Example: - >>> from codexlens.search.clustering import DBSCANStrategy, ClusteringConfig - >>> config = ClusteringConfig(min_cluster_size=3, metric='cosine') - >>> strategy = DBSCANStrategy(config) - >>> clusters = strategy.cluster(embeddings, results) - >>> representatives = strategy.select_representatives(clusters, results) - """ - - # Default eps percentile for auto-computation - DEFAULT_EPS_PERCENTILE: float = 15.0 - - def __init__( - self, - config: Optional[ClusteringConfig] = None, - eps: Optional[float] = None, - eps_percentile: float = DEFAULT_EPS_PERCENTILE, - ) -> None: - """Initialize DBSCAN clustering strategy. - - Args: - config: Clustering configuration. Uses defaults if not provided. - eps: Explicit eps parameter for DBSCAN. If None, auto-computed - from the distance distribution. - eps_percentile: Percentile of pairwise distances to use for - auto-computing eps. Default is 15th percentile. - - Raises: - ImportError: If sklearn is not installed. - """ - super().__init__(config) - self.eps = eps - self.eps_percentile = eps_percentile - - # Validate sklearn is available - try: - from sklearn.cluster import DBSCAN # noqa: F401 - except ImportError as exc: - raise ImportError( - "scikit-learn package is required for DBSCANStrategy. " - "Install with: pip install codexlens[clustering]" - ) from exc - - def _compute_eps(self, embeddings: "np.ndarray") -> float: - """Auto-compute eps from pairwise distance distribution. - - Uses the specified percentile of pairwise distances as eps, - which typically captures local density well. - - Args: - embeddings: NumPy array of shape (n_results, embedding_dim). - - Returns: - Computed eps value. 
- """ - import numpy as np - from sklearn.metrics import pairwise_distances - - # Compute pairwise distances - distances = pairwise_distances(embeddings, metric=self.config.metric) - - # Get upper triangle (excluding diagonal) - upper_tri = distances[np.triu_indices_from(distances, k=1)] - - if len(upper_tri) == 0: - # Only one point, return a default small eps - return 0.1 - - # Use percentile of distances as eps - eps = float(np.percentile(upper_tri, self.eps_percentile)) - - # Ensure eps is positive - return max(eps, 1e-6) - - def cluster( - self, - embeddings: "np.ndarray", - results: List["SearchResult"], - ) -> List[List[int]]: - """Cluster search results using DBSCAN algorithm. - - Args: - embeddings: NumPy array of shape (n_results, embedding_dim) - containing the embedding vectors for each result. - results: List of SearchResult objects corresponding to embeddings. - - Returns: - List of clusters, where each cluster is a list of indices - into the results list. Noise points are returned as singleton clusters. 
- """ - from sklearn.cluster import DBSCAN - import numpy as np - - n_results = len(results) - if n_results == 0: - return [] - - # Handle edge case: single result - if n_results == 1: - return [[0]] - - # Determine eps value - eps = self.eps if self.eps is not None else self._compute_eps(embeddings) - - # Configure DBSCAN clusterer - # Note: DBSCAN min_samples corresponds to min_cluster_size concept - clusterer = DBSCAN( - eps=eps, - min_samples=self.config.min_samples, - metric=self.config.metric, - ) - - # Fit and get cluster labels - # Labels: -1 = noise, 0+ = cluster index - labels = clusterer.fit_predict(embeddings) - - # Group indices by cluster label - cluster_map: dict[int, list[int]] = {} - for idx, label in enumerate(labels): - if label not in cluster_map: - cluster_map[label] = [] - cluster_map[label].append(idx) - - # Build result: non-noise clusters first, then noise as singletons - clusters: List[List[int]] = [] - - # Add proper clusters (label >= 0) - for label in sorted(cluster_map.keys()): - if label >= 0: - clusters.append(cluster_map[label]) - - # Add noise points as singleton clusters (label == -1) - if -1 in cluster_map: - for idx in cluster_map[-1]: - clusters.append([idx]) - - return clusters - - def select_representatives( - self, - clusters: List[List[int]], - results: List["SearchResult"], - embeddings: Optional["np.ndarray"] = None, - ) -> List["SearchResult"]: - """Select representative results from each cluster. - - Selects the result with the highest score from each cluster. - - Args: - clusters: List of clusters from cluster() method. - results: Original list of SearchResult objects. - embeddings: Optional embeddings (not used in score-based selection). - - Returns: - List of representative SearchResult objects, one per cluster, - ordered by score (highest first). 
- """ - if not clusters or not results: - return [] - - representatives: List["SearchResult"] = [] - - for cluster_indices in clusters: - if not cluster_indices: - continue - - # Find the result with the highest score in this cluster - best_idx = max(cluster_indices, key=lambda i: results[i].score) - representatives.append(results[best_idx]) - - # Sort by score descending - representatives.sort(key=lambda r: r.score, reverse=True) - - return representatives diff --git a/codex-lens/src/codexlens/search/clustering/factory.py b/codex-lens/src/codexlens/search/clustering/factory.py deleted file mode 100644 index 6c7f5b6e..00000000 --- a/codex-lens/src/codexlens/search/clustering/factory.py +++ /dev/null @@ -1,202 +0,0 @@ -"""Factory for creating clustering strategies. - -Provides a unified interface for instantiating different clustering backends -with automatic fallback chain: hdbscan -> dbscan -> noop. -""" - -from __future__ import annotations - -from typing import Any, Optional - -from .base import BaseClusteringStrategy, ClusteringConfig -from .noop_strategy import NoOpStrategy - - -def check_clustering_strategy_available(strategy: str) -> tuple[bool, str | None]: - """Check whether a specific clustering strategy can be used. - - Args: - strategy: Strategy name to check. Options: - - "hdbscan": HDBSCAN clustering (requires hdbscan package) - - "dbscan": DBSCAN clustering (requires sklearn) - - "frequency": Frequency-based clustering (always available) - - "noop": No-op strategy (always available) - - Returns: - Tuple of (is_available, error_message). - error_message is None if available, otherwise contains install instructions. - """ - strategy = (strategy or "").strip().lower() - - if strategy == "hdbscan": - try: - import hdbscan # noqa: F401 - except ImportError: - return False, ( - "hdbscan package not available. 
" - "Install with: pip install codexlens[clustering]" - ) - return True, None - - if strategy == "dbscan": - try: - from sklearn.cluster import DBSCAN # noqa: F401 - except ImportError: - return False, ( - "scikit-learn package not available. " - "Install with: pip install codexlens[clustering]" - ) - return True, None - - if strategy == "frequency": - # Frequency strategy is always available (no external deps) - return True, None - - if strategy == "noop": - return True, None - - return False, ( - f"Invalid clustering strategy: {strategy}. " - "Must be 'hdbscan', 'dbscan', 'frequency', or 'noop'." - ) - - -def get_strategy( - strategy: str = "hdbscan", - config: Optional[ClusteringConfig] = None, - *, - fallback: bool = True, - **kwargs: Any, -) -> BaseClusteringStrategy: - """Factory function to create clustering strategy with fallback chain. - - The fallback chain is: hdbscan -> dbscan -> frequency -> noop - - Args: - strategy: Clustering strategy to use. Options: - - "hdbscan": HDBSCAN clustering (default, recommended) - - "dbscan": DBSCAN clustering (fallback) - - "frequency": Frequency-based clustering (groups by symbol occurrence) - - "noop": No-op strategy (returns all results ungrouped) - - "auto": Try hdbscan, then dbscan, then noop - config: Clustering configuration. Uses defaults if not provided. - For frequency strategy, pass FrequencyConfig for full control. - fallback: If True (default), automatically fall back to next strategy - in the chain when primary is unavailable. If False, raise ImportError - when requested strategy is unavailable. - **kwargs: Additional strategy-specific arguments. - For DBSCANStrategy: eps, eps_percentile - For FrequencyStrategy: group_by, min_frequency, etc. - - Returns: - BaseClusteringStrategy: Configured clustering strategy instance. - - Raises: - ValueError: If strategy is not recognized. - ImportError: If required dependencies are not installed and fallback=False. 
- - Example: - >>> from codexlens.search.clustering import get_strategy, ClusteringConfig - >>> config = ClusteringConfig(min_cluster_size=3) - >>> # Auto-select best available strategy - >>> strategy = get_strategy("auto", config) - >>> # Explicitly use HDBSCAN (will fall back if unavailable) - >>> strategy = get_strategy("hdbscan", config) - >>> # Use frequency-based strategy - >>> from codexlens.search.clustering import FrequencyConfig - >>> freq_config = FrequencyConfig(min_frequency=2, group_by="symbol") - >>> strategy = get_strategy("frequency", freq_config) - """ - strategy = (strategy or "").strip().lower() - - # Handle "auto" - try strategies in order - if strategy == "auto": - return _get_best_available_strategy(config, **kwargs) - - if strategy == "hdbscan": - ok, err = check_clustering_strategy_available("hdbscan") - if ok: - from .hdbscan_strategy import HDBSCANStrategy - return HDBSCANStrategy(config) - - if fallback: - # Try dbscan fallback - ok_dbscan, _ = check_clustering_strategy_available("dbscan") - if ok_dbscan: - from .dbscan_strategy import DBSCANStrategy - return DBSCANStrategy(config, **kwargs) - # Final fallback to noop - return NoOpStrategy(config) - - raise ImportError(err) - - if strategy == "dbscan": - ok, err = check_clustering_strategy_available("dbscan") - if ok: - from .dbscan_strategy import DBSCANStrategy - return DBSCANStrategy(config, **kwargs) - - if fallback: - # Fallback to noop - return NoOpStrategy(config) - - raise ImportError(err) - - if strategy == "frequency": - from .frequency_strategy import FrequencyStrategy, FrequencyConfig - # If config is ClusteringConfig but not FrequencyConfig, create default FrequencyConfig - if config is None or not isinstance(config, FrequencyConfig): - freq_config = FrequencyConfig(**kwargs) if kwargs else FrequencyConfig() - else: - freq_config = config - return FrequencyStrategy(freq_config) - - if strategy == "noop": - return NoOpStrategy(config) - - raise ValueError( - f"Unknown 
clustering strategy: {strategy}. " - "Supported strategies: 'hdbscan', 'dbscan', 'frequency', 'noop', 'auto'" - ) - - -def _get_best_available_strategy( - config: Optional[ClusteringConfig] = None, - **kwargs: Any, -) -> BaseClusteringStrategy: - """Get the best available clustering strategy. - - Tries strategies in order: hdbscan -> dbscan -> noop - - Args: - config: Clustering configuration. - **kwargs: Additional strategy-specific arguments. - - Returns: - Best available clustering strategy instance. - """ - # Try HDBSCAN first - ok, _ = check_clustering_strategy_available("hdbscan") - if ok: - from .hdbscan_strategy import HDBSCANStrategy - return HDBSCANStrategy(config) - - # Try DBSCAN second - ok, _ = check_clustering_strategy_available("dbscan") - if ok: - from .dbscan_strategy import DBSCANStrategy - return DBSCANStrategy(config, **kwargs) - - # Fallback to NoOp - return NoOpStrategy(config) - - -# Alias for backward compatibility -ClusteringStrategyFactory = type( - "ClusteringStrategyFactory", - (), - { - "get_strategy": staticmethod(get_strategy), - "check_available": staticmethod(check_clustering_strategy_available), - }, -) diff --git a/codex-lens/src/codexlens/search/clustering/frequency_strategy.py b/codex-lens/src/codexlens/search/clustering/frequency_strategy.py deleted file mode 100644 index 48ddb00b..00000000 --- a/codex-lens/src/codexlens/search/clustering/frequency_strategy.py +++ /dev/null @@ -1,263 +0,0 @@ -"""Frequency-based clustering strategy for search result deduplication. - -This strategy groups search results by symbol/method name and prunes based on -occurrence frequency. High-frequency symbols (frequently referenced methods) -are considered more important and retained, while low-frequency results -(potentially noise) can be filtered out. 
- -Use cases: -- Prioritize commonly called methods/functions -- Filter out one-off results that may be less relevant -- Deduplicate results pointing to the same symbol from different locations -""" - -from __future__ import annotations - -from collections import defaultdict -from dataclasses import dataclass -from typing import TYPE_CHECKING, Dict, List, Optional, Literal - -from .base import BaseClusteringStrategy, ClusteringConfig - -if TYPE_CHECKING: - import numpy as np - from codexlens.entities import SearchResult - - -@dataclass -class FrequencyConfig(ClusteringConfig): - """Configuration for frequency-based clustering strategy. - - Attributes: - group_by: Field to group results by for frequency counting. - - 'symbol': Group by symbol_name (default, for method/function dedup) - - 'file': Group by file path - - 'symbol_kind': Group by symbol type (function, class, etc.) - min_frequency: Minimum occurrence count to keep a result. - Results appearing less than this are considered noise and pruned. - max_representatives_per_group: Maximum results to keep per symbol group. - frequency_weight: How much to boost score based on frequency. - Final score = original_score * (1 + frequency_weight * log(frequency)) - keep_mode: How to handle low-frequency results. 
- - 'filter': Remove results below min_frequency - - 'demote': Keep but lower their score ranking - """ - - group_by: Literal["symbol", "file", "symbol_kind"] = "symbol" - min_frequency: int = 1 # 1 means keep all, 2+ filters singletons - max_representatives_per_group: int = 3 - frequency_weight: float = 0.1 # Boost factor for frequency - keep_mode: Literal["filter", "demote"] = "demote" - - def __post_init__(self) -> None: - """Validate configuration parameters.""" - # Skip parent validation since we don't use HDBSCAN params - if self.min_frequency < 1: - raise ValueError("min_frequency must be >= 1") - if self.max_representatives_per_group < 1: - raise ValueError("max_representatives_per_group must be >= 1") - if self.frequency_weight < 0: - raise ValueError("frequency_weight must be >= 0") - if self.group_by not in ("symbol", "file", "symbol_kind"): - raise ValueError(f"group_by must be one of: symbol, file, symbol_kind; got {self.group_by}") - if self.keep_mode not in ("filter", "demote"): - raise ValueError(f"keep_mode must be one of: filter, demote; got {self.keep_mode}") - - -class FrequencyStrategy(BaseClusteringStrategy): - """Frequency-based clustering strategy for search result deduplication. - - This strategy groups search results by symbol name (or file/kind) and: - 1. Counts how many times each symbol appears in results - 2. Higher frequency = more important (frequently referenced method) - 3. Filters or demotes low-frequency results - 4. 
Selects top representatives from each frequency group - - Unlike embedding-based strategies (HDBSCAN, DBSCAN), this strategy: - - Does NOT require embeddings (works with metadata only) - - Is very fast (O(n) complexity) - - Is deterministic (no random initialization) - - Works well for symbol-level deduplication - - Example: - >>> config = FrequencyConfig(min_frequency=2, group_by="symbol") - >>> strategy = FrequencyStrategy(config) - >>> # Results with symbol "authenticate" appearing 5 times - >>> # will be prioritized over "helper_func" appearing once - >>> representatives = strategy.fit_predict(embeddings, results) - """ - - def __init__(self, config: Optional[FrequencyConfig] = None) -> None: - """Initialize the frequency strategy. - - Args: - config: Frequency configuration. Uses defaults if not provided. - """ - self.config: FrequencyConfig = config or FrequencyConfig() - - def _get_group_key(self, result: "SearchResult") -> str: - """Extract grouping key from a search result. - - Args: - result: SearchResult to extract key from. - - Returns: - String key for grouping (symbol name, file path, or kind). - """ - if self.config.group_by == "symbol": - # Use symbol_name if available, otherwise fall back to file:line - symbol = getattr(result, "symbol_name", None) - if symbol: - return str(symbol) - # Fallback: use file path + start_line as pseudo-symbol - start_line = getattr(result, "start_line", 0) or 0 - return f"{result.path}:{start_line}" - - elif self.config.group_by == "file": - return str(result.path) - - elif self.config.group_by == "symbol_kind": - kind = getattr(result, "symbol_kind", None) - return str(kind) if kind else "unknown" - - return str(result.path) # Default fallback - - def cluster( - self, - embeddings: "np.ndarray", - results: List["SearchResult"], - ) -> List[List[int]]: - """Group search results by frequency of occurrence. - - Note: This method ignores embeddings and groups by metadata only. 
- The embeddings parameter is kept for interface compatibility. - - Args: - embeddings: Ignored (kept for interface compatibility). - results: List of SearchResult objects to cluster. - - Returns: - List of clusters (groups), where each cluster contains indices - of results with the same grouping key. Clusters are ordered by - frequency (highest frequency first). - """ - if not results: - return [] - - # Group results by key - groups: Dict[str, List[int]] = defaultdict(list) - for idx, result in enumerate(results): - key = self._get_group_key(result) - groups[key].append(idx) - - # Sort groups by frequency (descending) then by key (for stability) - sorted_groups = sorted( - groups.items(), - key=lambda x: (-len(x[1]), x[0]) # -frequency, then alphabetical - ) - - # Convert to list of clusters - clusters = [indices for _, indices in sorted_groups] - - return clusters - - def select_representatives( - self, - clusters: List[List[int]], - results: List["SearchResult"], - embeddings: Optional["np.ndarray"] = None, - ) -> List["SearchResult"]: - """Select representative results based on frequency and score. - - For each frequency group: - 1. If frequency < min_frequency: filter or demote based on keep_mode - 2. Sort by score within group - 3. Apply frequency boost to scores - 4. Select top N representatives - - Args: - clusters: List of clusters from cluster() method. - results: Original list of SearchResult objects. - embeddings: Optional embeddings (used for tie-breaking if provided). - - Returns: - List of representative SearchResult objects, ordered by - frequency-adjusted score (highest first). 
- """ - import math - - if not clusters or not results: - return [] - - representatives: List["SearchResult"] = [] - demoted: List["SearchResult"] = [] - - for cluster_indices in clusters: - if not cluster_indices: - continue - - frequency = len(cluster_indices) - - # Get results in this cluster, sorted by score - cluster_results = [results[i] for i in cluster_indices] - cluster_results.sort(key=lambda r: getattr(r, "score", 0.0), reverse=True) - - # Check frequency threshold - if frequency < self.config.min_frequency: - if self.config.keep_mode == "filter": - # Skip low-frequency results entirely - continue - else: # demote mode - # Keep but add to demoted list (lower priority) - for result in cluster_results[: self.config.max_representatives_per_group]: - demoted.append(result) - continue - - # Apply frequency boost and select top representatives - for result in cluster_results[: self.config.max_representatives_per_group]: - # Calculate frequency-boosted score - original_score = getattr(result, "score", 0.0) - # log(frequency + 1) to handle frequency=1 case smoothly - frequency_boost = 1.0 + self.config.frequency_weight * math.log(frequency + 1) - boosted_score = original_score * frequency_boost - - # Create new result with boosted score and frequency metadata - # Note: SearchResult might be immutable, so we preserve original - # and track boosted score in metadata - if hasattr(result, "metadata") and isinstance(result.metadata, dict): - result.metadata["frequency"] = frequency - result.metadata["frequency_boosted_score"] = boosted_score - - representatives.append(result) - - # Sort representatives by boosted score (or original score as fallback) - def get_sort_score(r: "SearchResult") -> float: - if hasattr(r, "metadata") and isinstance(r.metadata, dict): - return r.metadata.get("frequency_boosted_score", getattr(r, "score", 0.0)) - return getattr(r, "score", 0.0) - - representatives.sort(key=get_sort_score, reverse=True) - - # Add demoted results at the end - 
if demoted: - demoted.sort(key=lambda r: getattr(r, "score", 0.0), reverse=True) - representatives.extend(demoted) - - return representatives - - def fit_predict( - self, - embeddings: "np.ndarray", - results: List["SearchResult"], - ) -> List["SearchResult"]: - """Convenience method to cluster and select representatives in one call. - - Args: - embeddings: NumPy array (may be ignored for frequency-based clustering). - results: List of SearchResult objects. - - Returns: - List of representative SearchResult objects. - """ - clusters = self.cluster(embeddings, results) - return self.select_representatives(clusters, results, embeddings) diff --git a/codex-lens/src/codexlens/search/clustering/hdbscan_strategy.py b/codex-lens/src/codexlens/search/clustering/hdbscan_strategy.py deleted file mode 100644 index 3bd2e1c0..00000000 --- a/codex-lens/src/codexlens/search/clustering/hdbscan_strategy.py +++ /dev/null @@ -1,175 +0,0 @@ -"""HDBSCAN-based clustering strategy for search results. - -HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise) -is the primary clustering strategy for grouping similar search results. -""" - -from __future__ import annotations - -from typing import TYPE_CHECKING, List, Optional - -from .base import BaseClusteringStrategy, ClusteringConfig - -if TYPE_CHECKING: - import numpy as np - from codexlens.entities import SearchResult - - -class HDBSCANStrategy(BaseClusteringStrategy): - """HDBSCAN-based clustering strategy. - - Uses HDBSCAN algorithm to cluster search results based on embedding similarity. 
- HDBSCAN is preferred over DBSCAN because it: - - Automatically determines the number of clusters - - Handles varying density clusters well - - Identifies noise points (outliers) effectively - - Example: - >>> from codexlens.search.clustering import HDBSCANStrategy, ClusteringConfig - >>> config = ClusteringConfig(min_cluster_size=3, metric='cosine') - >>> strategy = HDBSCANStrategy(config) - >>> clusters = strategy.cluster(embeddings, results) - >>> representatives = strategy.select_representatives(clusters, results) - """ - - def __init__(self, config: Optional[ClusteringConfig] = None) -> None: - """Initialize HDBSCAN clustering strategy. - - Args: - config: Clustering configuration. Uses defaults if not provided. - - Raises: - ImportError: If hdbscan package is not installed. - """ - super().__init__(config) - # Validate hdbscan is available - try: - import hdbscan # noqa: F401 - except ImportError as exc: - raise ImportError( - "hdbscan package is required for HDBSCANStrategy. " - "Install with: pip install codexlens[clustering]" - ) from exc - - def cluster( - self, - embeddings: "np.ndarray", - results: List["SearchResult"], - ) -> List[List[int]]: - """Cluster search results using HDBSCAN algorithm. - - Args: - embeddings: NumPy array of shape (n_results, embedding_dim) - containing the embedding vectors for each result. - results: List of SearchResult objects corresponding to embeddings. - - Returns: - List of clusters, where each cluster is a list of indices - into the results list. Noise points are returned as singleton clusters. 
- """ - import hdbscan - import numpy as np - - n_results = len(results) - if n_results == 0: - return [] - - # Handle edge case: fewer results than min_cluster_size - if n_results < self.config.min_cluster_size: - # Return each result as its own singleton cluster - return [[i] for i in range(n_results)] - - metric = self.config.metric - data = embeddings - - # Some hdbscan builds do not recognize metric="cosine" even though it's a - # common need for embedding clustering. In that case, compute a precomputed - # cosine distance matrix and run HDBSCAN with metric="precomputed". - if metric == "cosine": - try: - from sklearn.metrics import pairwise_distances - - data = pairwise_distances(embeddings, metric="cosine") - # Some hdbscan builds are strict about dtype for precomputed distances. - # Ensure float64 to avoid Buffer dtype mismatch errors. - try: - data = data.astype("float64", copy=False) - except Exception: - pass - metric = "precomputed" - except Exception: - # If we cannot compute distances, fall back to euclidean over raw vectors. 
- metric = "euclidean" - - # Configure HDBSCAN clusterer - clusterer = hdbscan.HDBSCAN( - min_cluster_size=self.config.min_cluster_size, - min_samples=self.config.min_samples, - metric=metric, - cluster_selection_epsilon=self.config.cluster_selection_epsilon, - allow_single_cluster=self.config.allow_single_cluster, - prediction_data=self.config.prediction_data, - ) - - # Fit and get cluster labels - # Labels: -1 = noise, 0+ = cluster index - labels = clusterer.fit_predict(data) - - # Group indices by cluster label - cluster_map: dict[int, list[int]] = {} - for idx, label in enumerate(labels): - if label not in cluster_map: - cluster_map[label] = [] - cluster_map[label].append(idx) - - # Build result: non-noise clusters first, then noise as singletons - clusters: List[List[int]] = [] - - # Add proper clusters (label >= 0) - for label in sorted(cluster_map.keys()): - if label >= 0: - clusters.append(cluster_map[label]) - - # Add noise points as singleton clusters (label == -1) - if -1 in cluster_map: - for idx in cluster_map[-1]: - clusters.append([idx]) - - return clusters - - def select_representatives( - self, - clusters: List[List[int]], - results: List["SearchResult"], - embeddings: Optional["np.ndarray"] = None, - ) -> List["SearchResult"]: - """Select representative results from each cluster. - - Selects the result with the highest score from each cluster. - - Args: - clusters: List of clusters from cluster() method. - results: Original list of SearchResult objects. - embeddings: Optional embeddings (not used in score-based selection). - - Returns: - List of representative SearchResult objects, one per cluster, - ordered by score (highest first). 
- """ - if not clusters or not results: - return [] - - representatives: List["SearchResult"] = [] - - for cluster_indices in clusters: - if not cluster_indices: - continue - - # Find the result with the highest score in this cluster - best_idx = max(cluster_indices, key=lambda i: results[i].score) - representatives.append(results[best_idx]) - - # Sort by score descending - representatives.sort(key=lambda r: r.score, reverse=True) - - return representatives diff --git a/codex-lens/src/codexlens/search/clustering/noop_strategy.py b/codex-lens/src/codexlens/search/clustering/noop_strategy.py deleted file mode 100644 index eda36098..00000000 --- a/codex-lens/src/codexlens/search/clustering/noop_strategy.py +++ /dev/null @@ -1,83 +0,0 @@ -"""No-op clustering strategy for search results. - -NoOpStrategy returns all results ungrouped when clustering dependencies -are not available or clustering is disabled. -""" - -from __future__ import annotations - -from typing import TYPE_CHECKING, List, Optional - -from .base import BaseClusteringStrategy, ClusteringConfig - -if TYPE_CHECKING: - import numpy as np - from codexlens.entities import SearchResult - - -class NoOpStrategy(BaseClusteringStrategy): - """No-op clustering strategy that returns all results ungrouped. - - This strategy is used as a final fallback when no clustering dependencies - are available, or when clustering is explicitly disabled. Each result - is treated as its own singleton cluster. - - Example: - >>> from codexlens.search.clustering import NoOpStrategy - >>> strategy = NoOpStrategy() - >>> clusters = strategy.cluster(embeddings, results) - >>> # Returns [[0], [1], [2], ...] - each result in its own cluster - >>> representatives = strategy.select_representatives(clusters, results) - >>> # Returns all results sorted by score - """ - - def __init__(self, config: Optional[ClusteringConfig] = None) -> None: - """Initialize NoOp clustering strategy. - - Args: - config: Clustering configuration. 
Ignored for NoOpStrategy - but accepted for interface compatibility. - """ - super().__init__(config) - - def cluster( - self, - embeddings: "np.ndarray", - results: List["SearchResult"], - ) -> List[List[int]]: - """Return each result as its own singleton cluster. - - Args: - embeddings: NumPy array of shape (n_results, embedding_dim). - Not used but accepted for interface compatibility. - results: List of SearchResult objects. - - Returns: - List of singleton clusters, one per result. - """ - return [[i] for i in range(len(results))] - - def select_representatives( - self, - clusters: List[List[int]], - results: List["SearchResult"], - embeddings: Optional["np.ndarray"] = None, - ) -> List["SearchResult"]: - """Return all results sorted by score. - - Since each cluster is a singleton, this effectively returns all - results sorted by score descending. - - Args: - clusters: List of singleton clusters. - results: Original list of SearchResult objects. - embeddings: Optional embeddings (not used). - - Returns: - All SearchResult objects sorted by score (highest first). 
- """ - if not results: - return [] - - # Return all results sorted by score - return sorted(results, key=lambda r: r.score, reverse=True) diff --git a/codex-lens/src/codexlens/search/enrichment.py b/codex-lens/src/codexlens/search/enrichment.py deleted file mode 100644 index 110f56b7..00000000 --- a/codex-lens/src/codexlens/search/enrichment.py +++ /dev/null @@ -1,171 +0,0 @@ -# codex-lens/src/codexlens/search/enrichment.py -"""Relationship enrichment for search results.""" -import sqlite3 -from pathlib import Path -from typing import List, Dict, Any, Optional - -from codexlens.config import Config -from codexlens.entities import SearchResult -from codexlens.search.graph_expander import GraphExpander -from codexlens.storage.path_mapper import PathMapper - - -class RelationshipEnricher: - """Enriches search results with code graph relationships.""" - - def __init__(self, index_path: Path): - """Initialize with path to index database. - - Args: - index_path: Path to _index.db SQLite database - """ - self.index_path = index_path - self.db_conn: Optional[sqlite3.Connection] = None - self._connect() - - def _connect(self) -> None: - """Establish read-only database connection.""" - if self.index_path.exists(): - self.db_conn = sqlite3.connect( - f"file:{self.index_path}?mode=ro", - uri=True, - check_same_thread=False - ) - self.db_conn.row_factory = sqlite3.Row - - def enrich(self, results: List[Dict[str, Any]], limit: int = 10) -> List[Dict[str, Any]]: - """Add relationship data to search results. 
- - Args: - results: List of search result dictionaries - limit: Maximum number of results to enrich - - Returns: - Results with relationships field added - """ - if not self.db_conn: - return results - - for result in results[:limit]: - file_path = result.get('file') or result.get('path') - symbol_name = result.get('symbol') - result['relationships'] = self._find_relationships(file_path, symbol_name) - return results - - def _find_relationships(self, file_path: Optional[str], symbol_name: Optional[str]) -> List[Dict[str, Any]]: - """Query relationships for a symbol. - - Args: - file_path: Path to file containing the symbol - symbol_name: Name of the symbol - - Returns: - List of relationship dictionaries with type, direction, target/source, file, line - """ - if not self.db_conn or not symbol_name: - return [] - - relationships = [] - cursor = self.db_conn.cursor() - - try: - # Find symbol ID(s) by name and optionally file - if file_path: - cursor.execute( - 'SELECT id FROM symbols WHERE name = ? AND file_path = ?', - (symbol_name, file_path) - ) - else: - cursor.execute('SELECT id FROM symbols WHERE name = ?', (symbol_name,)) - - symbol_ids = [row[0] for row in cursor.fetchall()] - - if not symbol_ids: - return [] - - # Query outgoing relationships (symbol is source) - placeholders = ','.join('?' 
* len(symbol_ids)) - cursor.execute(f''' - SELECT sr.relationship_type, sr.target_symbol_fqn, sr.file_path, sr.line - FROM symbol_relationships sr - WHERE sr.source_symbol_id IN ({placeholders}) - ''', symbol_ids) - - for row in cursor.fetchall(): - relationships.append({ - 'type': row[0], - 'direction': 'outgoing', - 'target': row[1], - 'file': row[2], - 'line': row[3], - }) - - # Query incoming relationships (symbol is target) - # Match against symbol name or qualified name patterns - cursor.execute(''' - SELECT sr.relationship_type, s.name AS source_name, sr.file_path, sr.line - FROM symbol_relationships sr - JOIN symbols s ON sr.source_symbol_id = s.id - WHERE sr.target_symbol_fqn = ? OR sr.target_symbol_fqn LIKE ? - ''', (symbol_name, f'%.{symbol_name}')) - - for row in cursor.fetchall(): - rel_type = row[0] - # Convert to incoming type - incoming_type = self._to_incoming_type(rel_type) - relationships.append({ - 'type': incoming_type, - 'direction': 'incoming', - 'source': row[1], - 'file': row[2], - 'line': row[3], - }) - - except sqlite3.Error: - return [] - - return relationships - - def _to_incoming_type(self, outgoing_type: str) -> str: - """Convert outgoing relationship type to incoming type. 
- - Args: - outgoing_type: The outgoing relationship type (e.g., 'calls', 'imports') - - Returns: - Corresponding incoming type (e.g., 'called_by', 'imported_by') - """ - type_map = { - 'calls': 'called_by', - 'imports': 'imported_by', - 'extends': 'extended_by', - } - return type_map.get(outgoing_type, f'{outgoing_type}_by') - - def close(self) -> None: - """Close database connection.""" - if self.db_conn: - self.db_conn.close() - self.db_conn = None - - def __enter__(self) -> 'RelationshipEnricher': - return self - - def __exit__(self, exc_type, exc_val, exc_tb) -> None: - self.close() - - -class SearchEnrichmentPipeline: - """Search post-processing pipeline (optional enrichments).""" - - def __init__(self, mapper: PathMapper, *, config: Optional[Config] = None) -> None: - self._config = config - self._graph_expander = GraphExpander(mapper, config=config) - - def expand_related_results(self, results: List[SearchResult]) -> List[SearchResult]: - """Expand base results with related symbols when enabled in config.""" - if self._config is None or not getattr(self._config, "enable_graph_expansion", False): - return [] - - depth = int(getattr(self._config, "graph_expansion_depth", 2) or 2) - return self._graph_expander.expand(results, depth=depth) diff --git a/codex-lens/src/codexlens/search/global_graph_expander.py b/codex-lens/src/codexlens/search/global_graph_expander.py deleted file mode 100644 index b6aa682e..00000000 --- a/codex-lens/src/codexlens/search/global_graph_expander.py +++ /dev/null @@ -1,250 +0,0 @@ -"""Global graph expansion for search results using cross-directory relationships. - -Expands top search results with related symbols by querying the global_relationships -table in GlobalSymbolIndex, enabling project-wide code graph traversal. 
-""" - -from __future__ import annotations - -import logging -import sqlite3 -from typing import Dict, List, Optional, Sequence, Tuple - -from codexlens.config import Config -from codexlens.entities import SearchResult -from codexlens.storage.global_index import GlobalSymbolIndex - -logger = logging.getLogger(__name__) - -# Score decay factors by relationship type. -# INHERITS has highest factor (strongest semantic link), -# IMPORTS next (explicit dependency), CALLS lowest (may be indirect). -DECAY_FACTORS: Dict[str, float] = { - "imports": 0.4, - "inherits": 0.5, - "calls": 0.3, -} -DEFAULT_DECAY = 0.3 - - -class GlobalGraphExpander: - """Expands search results with cross-directory related symbols from the global graph.""" - - def __init__( - self, - global_index: GlobalSymbolIndex, - *, - config: Optional[Config] = None, - ) -> None: - self._global_index = global_index - self._config = config - self._logger = logging.getLogger(__name__) - - def expand( - self, - results: Sequence[SearchResult], - *, - top_n: int = 10, - max_related: int = 50, - ) -> List[SearchResult]: - """Expand top-N results with related symbols from global relationships. - - Args: - results: Base ranked results from Stage 1. - top_n: Only expand the top-N base results. - max_related: Maximum related results to return. - - Returns: - List of related SearchResult objects (does NOT include the input results). - """ - if not results: - return [] - - # 1. Extract symbol names from top results - symbols_with_scores = self._resolve_symbols(results, top_n) - if not symbols_with_scores: - return [] - - symbol_names = [s[0] for s in symbols_with_scores] - base_scores = {s[0]: s[1] for s in symbols_with_scores} - - # 2. Query global relationships - relationships = self._query_relationships(symbol_names, limit=max_related * 3) - if not relationships: - return [] - - # 3. 
Build expanded results with score decay - expanded = self._build_expanded_results( - relationships, base_scores, max_related - ) - - # 4. Deduplicate against input results - input_keys: set[Tuple[str, Optional[str], Optional[int]]] = set() - for r in results: - input_keys.add((r.path, r.symbol_name, r.start_line)) - - deduped: List[SearchResult] = [] - seen: set[Tuple[str, Optional[str], Optional[int]]] = set() - for r in expanded: - key = (r.path, r.symbol_name, r.start_line) - if key not in input_keys and key not in seen: - seen.add(key) - deduped.append(r) - - return deduped[:max_related] - - def _resolve_symbols( - self, - results: Sequence[SearchResult], - top_n: int, - ) -> List[Tuple[str, float]]: - """Extract (symbol_name, score) pairs from top results.""" - symbols: List[Tuple[str, float]] = [] - seen: set[str] = set() - for r in list(results)[:top_n]: - name = r.symbol_name - if not name or name in seen: - continue - seen.add(name) - symbols.append((name, float(r.score))) - return symbols - - def _query_relationships( - self, - symbol_names: List[str], - limit: int = 150, - ) -> List[sqlite3.Row]: - """Query global_relationships for symbols.""" - try: - return self._global_index.query_relationships_for_symbols( - symbol_names, limit=limit - ) - except Exception as exc: - self._logger.debug("Global graph query failed: %s", exc) - return [] - - def _resolve_target_to_file( - self, - target_qualified_name: str, - ) -> Optional[Tuple[str, int, int]]: - """Resolve target_qualified_name to (file_path, start_line, end_line). - - Tries ``file_path::symbol_name`` format first, then falls back to - symbol name search in the global index. 
- """ - # Format: "file_path::symbol_name" - if "::" in target_qualified_name: - parts = target_qualified_name.split("::", 1) - target_file = parts[0] - target_symbol = parts[1] - try: - symbols = self._global_index.search(target_symbol, limit=5) - for sym in symbols: - if sym.file and str(sym.file) == target_file: - return ( - target_file, - sym.range[0] if sym.range else 1, - sym.range[1] if sym.range else 1, - ) - # File path known but line info unavailable - return (target_file, 1, 1) - except Exception: - return (target_file, 1, 1) - - # Plain symbol name (possibly dot-qualified like "mod.ClassName") - try: - leaf_name = target_qualified_name.rsplit(".", 1)[-1] - symbols = self._global_index.search(leaf_name, limit=5) - if symbols: - sym = symbols[0] - file_path = str(sym.file) if sym.file else None - if file_path: - return ( - file_path, - sym.range[0] if sym.range else 1, - sym.range[1] if sym.range else 1, - ) - except Exception: - pass - - return None - - def _build_expanded_results( - self, - relationships: List[sqlite3.Row], - base_scores: Dict[str, float], - max_related: int, - ) -> List[SearchResult]: - """Build SearchResult list from relationships with score decay.""" - results: List[SearchResult] = [] - - for rel in relationships: - source_file = rel["source_file"] - source_symbol = rel["source_symbol"] - target_qname = rel["target_qualified_name"] - rel_type = rel["relationship_type"] - source_line = rel["source_line"] - - # Determine base score from the matched symbol - base_score = base_scores.get(source_symbol, 0.0) - if base_score == 0.0: - # Try matching against the target leaf name - leaf = target_qname.rsplit(".", 1)[-1] if "." 
in target_qname else target_qname - if "::" in leaf: - leaf = leaf.split("::")[-1] - base_score = base_scores.get(leaf, 0.0) - if base_score == 0.0: - base_score = 0.5 # Default when no match found - - # Apply decay factor - decay = DECAY_FACTORS.get(rel_type, DEFAULT_DECAY) - score = base_score * decay - - # Try to resolve target to file for a richer result - target_info = self._resolve_target_to_file(target_qname) - if target_info: - t_file, t_start, t_end = target_info - results.append(SearchResult( - path=t_file, - score=score, - excerpt=None, - content=None, - start_line=t_start, - end_line=t_end, - symbol_name=( - target_qname.split("::")[-1] - if "::" in target_qname - else target_qname.rsplit(".", 1)[-1] - ), - symbol_kind=None, - metadata={ - "source": "static_graph", - "relationship_type": rel_type, - "from_symbol": source_symbol, - "from_file": source_file, - }, - )) - else: - # Use source file as fallback (we know the source exists) - results.append(SearchResult( - path=source_file, - score=score * 0.8, # Slight penalty for unresolved target - excerpt=None, - content=None, - start_line=source_line, - end_line=source_line, - symbol_name=source_symbol, - symbol_kind=None, - metadata={ - "source": "static_graph", - "relationship_type": rel_type, - "target_qualified_name": target_qname, - }, - )) - - if len(results) >= max_related: - break - - # Sort by score descending - results.sort(key=lambda r: r.score, reverse=True) - return results diff --git a/codex-lens/src/codexlens/search/graph_expander.py b/codex-lens/src/codexlens/search/graph_expander.py deleted file mode 100644 index 73261d53..00000000 --- a/codex-lens/src/codexlens/search/graph_expander.py +++ /dev/null @@ -1,264 +0,0 @@ -"""Graph expansion for search results using precomputed neighbors. - -Expands top search results with related symbol definitions by traversing -precomputed N-hop neighbors stored in the per-directory index databases. 
-""" - -from __future__ import annotations - -import logging -import sqlite3 -from pathlib import Path -from typing import Dict, List, Optional, Sequence, Tuple - -from codexlens.config import Config -from codexlens.entities import SearchResult -from codexlens.storage.path_mapper import PathMapper - -logger = logging.getLogger(__name__) - - -def _result_key(result: SearchResult) -> Tuple[str, Optional[str], Optional[int], Optional[int]]: - return (result.path, result.symbol_name, result.start_line, result.end_line) - - -def _slice_content_block(content: str, start_line: Optional[int], end_line: Optional[int]) -> Optional[str]: - if content is None: - return None - if start_line is None or end_line is None: - return None - if start_line < 1 or end_line < start_line: - return None - - lines = content.splitlines() - start_idx = max(0, start_line - 1) - end_idx = min(len(lines), end_line) - if start_idx >= len(lines): - return None - return "\n".join(lines[start_idx:end_idx]) - - -class GraphExpander: - """Expands SearchResult lists with related symbols from the code graph.""" - - def __init__(self, mapper: PathMapper, *, config: Optional[Config] = None) -> None: - self._mapper = mapper - self._config = config - self._logger = logging.getLogger(__name__) - - def expand( - self, - results: Sequence[SearchResult], - *, - depth: Optional[int] = None, - max_expand: int = 10, - max_related: int = 50, - ) -> List[SearchResult]: - """Expand top results with related symbols. - - Args: - results: Base ranked results. - depth: Maximum relationship depth to include (defaults to Config or 2). - max_expand: Only expand the top-N base results to bound cost. - max_related: Maximum related results to return. - - Returns: - A list of related SearchResult objects with relationship_depth metadata. 
- """ - if not results: - return [] - - configured_depth = getattr(self._config, "graph_expansion_depth", 2) if self._config else 2 - max_depth = int(depth if depth is not None else configured_depth) - if max_depth <= 0: - return [] - max_depth = min(max_depth, 2) - - expand_count = max(0, int(max_expand)) - related_limit = max(0, int(max_related)) - if expand_count == 0 or related_limit == 0: - return [] - - seen = {_result_key(r) for r in results} - related_results: List[SearchResult] = [] - conn_cache: Dict[Path, sqlite3.Connection] = {} - - try: - for base in list(results)[:expand_count]: - if len(related_results) >= related_limit: - break - - if not base.symbol_name or not base.path: - continue - - index_path = self._mapper.source_to_index_db(Path(base.path).parent) - conn = conn_cache.get(index_path) - if conn is None: - conn = self._connect_readonly(index_path) - if conn is None: - continue - conn_cache[index_path] = conn - - source_ids = self._resolve_source_symbol_ids( - conn, - file_path=base.path, - symbol_name=base.symbol_name, - symbol_kind=base.symbol_kind, - ) - if not source_ids: - continue - - for source_id in source_ids: - neighbors = self._get_neighbors(conn, source_id, max_depth=max_depth, limit=related_limit) - for neighbor_id, rel_depth in neighbors: - if len(related_results) >= related_limit: - break - row = self._get_symbol_details(conn, neighbor_id) - if row is None: - continue - - path = str(row["full_path"]) - symbol_name = str(row["name"]) - symbol_kind = str(row["kind"]) - start_line = int(row["start_line"]) if row["start_line"] is not None else None - end_line = int(row["end_line"]) if row["end_line"] is not None else None - content_block = _slice_content_block( - str(row["content"]) if row["content"] is not None else "", - start_line, - end_line, - ) - - score = float(base.score) * (0.5 ** int(rel_depth)) - candidate = SearchResult( - path=path, - score=max(0.0, score), - excerpt=None, - content=content_block, - start_line=start_line, 
- end_line=end_line, - symbol_name=symbol_name, - symbol_kind=symbol_kind, - metadata={"relationship_depth": int(rel_depth)}, - ) - - key = _result_key(candidate) - if key in seen: - continue - seen.add(key) - related_results.append(candidate) - - finally: - for conn in conn_cache.values(): - try: - conn.close() - except Exception: - pass - - return related_results - - def _connect_readonly(self, index_path: Path) -> Optional[sqlite3.Connection]: - try: - if not index_path.exists() or index_path.stat().st_size == 0: - return None - except OSError: - return None - - try: - conn = sqlite3.connect(f"file:{index_path}?mode=ro", uri=True, check_same_thread=False) - conn.row_factory = sqlite3.Row - return conn - except Exception as exc: - self._logger.debug("GraphExpander failed to open %s: %s", index_path, exc) - return None - - def _resolve_source_symbol_ids( - self, - conn: sqlite3.Connection, - *, - file_path: str, - symbol_name: str, - symbol_kind: Optional[str], - ) -> List[int]: - try: - if symbol_kind: - rows = conn.execute( - """ - SELECT s.id - FROM symbols s - JOIN files f ON f.id = s.file_id - WHERE f.full_path = ? AND s.name = ? AND s.kind = ? - """, - (file_path, symbol_name, symbol_kind), - ).fetchall() - else: - rows = conn.execute( - """ - SELECT s.id - FROM symbols s - JOIN files f ON f.id = s.file_id - WHERE f.full_path = ? AND s.name = ? - """, - (file_path, symbol_name), - ).fetchall() - except sqlite3.Error: - return [] - - ids: List[int] = [] - for row in rows: - try: - ids.append(int(row["id"])) - except Exception: - continue - return ids - - def _get_neighbors( - self, - conn: sqlite3.Connection, - source_symbol_id: int, - *, - max_depth: int, - limit: int, - ) -> List[Tuple[int, int]]: - try: - rows = conn.execute( - """ - SELECT neighbor_symbol_id, relationship_depth - FROM graph_neighbors - WHERE source_symbol_id = ? AND relationship_depth <= ? - ORDER BY relationship_depth ASC, neighbor_symbol_id ASC - LIMIT ? 
- """, - (int(source_symbol_id), int(max_depth), int(limit)), - ).fetchall() - except sqlite3.Error: - return [] - - neighbors: List[Tuple[int, int]] = [] - for row in rows: - try: - neighbors.append((int(row["neighbor_symbol_id"]), int(row["relationship_depth"]))) - except Exception: - continue - return neighbors - - def _get_symbol_details(self, conn: sqlite3.Connection, symbol_id: int) -> Optional[sqlite3.Row]: - try: - return conn.execute( - """ - SELECT - s.id, - s.name, - s.kind, - s.start_line, - s.end_line, - f.full_path, - f.content - FROM symbols s - JOIN files f ON f.id = s.file_id - WHERE s.id = ? - """, - (int(symbol_id),), - ).fetchone() - except sqlite3.Error: - return None - diff --git a/codex-lens/src/codexlens/search/hybrid_search.py b/codex-lens/src/codexlens/search/hybrid_search.py deleted file mode 100644 index 9a300069..00000000 --- a/codex-lens/src/codexlens/search/hybrid_search.py +++ /dev/null @@ -1,1359 +0,0 @@ -"""Hybrid search engine orchestrating parallel exact/fuzzy/vector searches with RRF fusion. - -Coordinates multiple search backends in parallel using ThreadPoolExecutor and combines -results via Reciprocal Rank Fusion (RRF) algorithm. -""" - -from __future__ import annotations - -import logging -import threading -import time -from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError, as_completed -from contextlib import contextmanager -from pathlib import Path -from typing import Any, Dict, List, Optional - - -@contextmanager -def timer(name: str, logger: logging.Logger, level: int = logging.DEBUG): - """Context manager for timing code blocks. 
- - Args: - name: Name of the operation being timed - logger: Logger instance to use - level: Logging level (default DEBUG) - """ - start = time.perf_counter() - try: - yield - finally: - elapsed_ms = (time.perf_counter() - start) * 1000 - logger.log(level, "[TIMING] %s: %.2fms", name, elapsed_ms) - -from codexlens.config import Config -from codexlens.config import VECTORS_HNSW_NAME -from codexlens.entities import SearchResult -from codexlens.search.ranking import ( - DEFAULT_WEIGHTS as RANKING_DEFAULT_WEIGHTS, - QueryIntent, - apply_symbol_boost, - cross_encoder_rerank, - detect_query_intent, - filter_results_by_category, - get_rrf_weights, - query_prefers_lexical_search, - reciprocal_rank_fusion, - rerank_results, - simple_weighted_fusion, - tag_search_source, -) -from codexlens.storage.dir_index import DirIndexStore -from codexlens.storage.index_filters import filter_index_paths - -# Optional LSP imports (for real-time graph expansion) -try: - from codexlens.lsp import LspBridge, LspGraphBuilder - HAS_LSP = True -except ImportError: - HAS_LSP = False - - -class HybridSearchEngine: - """Hybrid search engine with parallel execution and RRF fusion. - - Orchestrates searches across exact FTS, fuzzy FTS, and optional vector backends, - executing them in parallel and fusing results via Reciprocal Rank Fusion. - - Attributes: - logger: Python logger instance - default_weights: Default RRF weights for each source - """ - - # Public compatibility contract for callers/tests that expect the legacy - # three-backend defaults on the engine instance. - DEFAULT_WEIGHTS = { - "exact": 0.3, - "fuzzy": 0.1, - "vector": 0.6, - } - - def __init__( - self, - weights: Optional[Dict[str, float]] = None, - config: Optional[Config] = None, - embedder: Any = None, - ): - """Initialize hybrid search engine. 
- - Args: - weights: Optional custom RRF weights (default: DEFAULT_WEIGHTS) - config: Optional runtime config (enables optional reranking features) - embedder: Optional embedder instance for embedding-based reranking - - Raises: - TypeError: If weights is not a dict (e.g., if a Path is passed) - """ - self.logger = logging.getLogger(__name__) - - # Validate weights type to catch common usage errors - if weights is not None and not isinstance(weights, dict): - raise TypeError( - f"weights must be a dict, got {type(weights).__name__}. " - f"Did you mean to pass index_path to search() instead of __init__()?" - ) - - self.weights = weights - self._config = config - self.embedder = embedder - self.reranker: Any = None - self._use_gpu = config.embedding_use_gpu if config else True - self._centralized_cache_lock = threading.RLock() - self._centralized_model_config_cache: Dict[str, Any] = {} - self._centralized_embedder_cache: Dict[tuple[Any, ...], Any] = {} - self._centralized_ann_cache: Dict[tuple[str, int], Any] = {} - self._centralized_query_embedding_cache: Dict[tuple[Any, ...], Any] = {} - - @property - def weights(self) -> Dict[str, float]: - """Public/default weights exposed for backwards compatibility.""" - return dict(self._weights) - - @weights.setter - def weights(self, value: Optional[Dict[str, float]]) -> None: - """Update public and internal fusion weights together.""" - if value is None: - public_weights = self.DEFAULT_WEIGHTS.copy() - fusion_weights = dict(RANKING_DEFAULT_WEIGHTS) - fusion_weights.update(public_weights) - else: - if not isinstance(value, dict): - raise TypeError(f"weights must be a dict, got {type(value).__name__}") - public_weights = dict(value) - fusion_weights = dict(value) - - self._weights = public_weights - self._fusion_weights = fusion_weights - - @staticmethod - def _clamp_search_score(score: float) -> float: - """Keep ANN-derived similarity scores within SearchResult's valid domain.""" - - return max(0.0, float(score)) - - def 
_get_centralized_model_config(self, index_root: Path) -> Optional[Dict[str, Any]]: - """Load and cache the centralized embedding model config for an index root.""" - root_key = str(Path(index_root).resolve()) - - with self._centralized_cache_lock: - if root_key in self._centralized_model_config_cache: - cached = self._centralized_model_config_cache[root_key] - return dict(cached) if isinstance(cached, dict) else None - - model_config: Optional[Dict[str, Any]] = None - try: - from codexlens.semantic.vector_store import VectorStore - - central_index_path = Path(root_key) / "_index.db" - if central_index_path.exists(): - with VectorStore(central_index_path) as vs: - loaded = vs.get_model_config() - if isinstance(loaded, dict): - model_config = dict(loaded) - self.logger.debug( - "Loaded model config from centralized index: %s", - model_config, - ) - except Exception as exc: - self.logger.debug( - "Failed to load model config from centralized index: %s", - exc, - ) - - with self._centralized_cache_lock: - self._centralized_model_config_cache[root_key] = ( - dict(model_config) if isinstance(model_config, dict) else None - ) - - return dict(model_config) if isinstance(model_config, dict) else None - - def _get_centralized_embedder( - self, - model_config: Optional[Dict[str, Any]], - ) -> tuple[Any, int, tuple[Any, ...]]: - """Resolve and cache the embedder used for centralized vector search.""" - from codexlens.semantic.factory import get_embedder - - backend = "fastembed" - model_name: Optional[str] = None - model_profile = "code" - use_gpu = bool(self._use_gpu) - embedding_dim: Optional[int] = None - - if model_config: - backend = str(model_config.get("backend", "fastembed") or "fastembed") - model_name = model_config.get("model_name") - model_profile = str(model_config.get("model_profile", "code") or "code") - raw_dim = model_config.get("embedding_dim") - embedding_dim = int(raw_dim) if raw_dim else None - - if backend == "litellm": - embedder_key: tuple[Any, ...] 
= ("litellm", model_name or "", None) - else: - embedder_key = ("fastembed", model_profile, use_gpu) - - with self._centralized_cache_lock: - cached = self._centralized_embedder_cache.get(embedder_key) - if cached is None: - if backend == "litellm": - cached = get_embedder(backend="litellm", model=model_name) - else: - cached = get_embedder( - backend="fastembed", - profile=model_profile, - use_gpu=use_gpu, - ) - with self._centralized_cache_lock: - existing = self._centralized_embedder_cache.get(embedder_key) - if existing is None: - self._centralized_embedder_cache[embedder_key] = cached - else: - cached = existing - - if embedding_dim is None: - embedding_dim = int(getattr(cached, "embedding_dim", 0) or 0) - - return cached, embedding_dim, embedder_key - - def _get_centralized_ann_index(self, index_root: Path, dim: int) -> Any: - """Load and cache a centralized ANN index for repeated searches.""" - from codexlens.semantic.ann_index import ANNIndex - - resolved_root = Path(index_root).resolve() - cache_key = (str(resolved_root), int(dim)) - - with self._centralized_cache_lock: - cached = self._centralized_ann_cache.get(cache_key) - if cached is not None: - return cached - - ann_index = ANNIndex.create_central(index_root=resolved_root, dim=int(dim)) - if not ann_index.load(): - return None - - with self._centralized_cache_lock: - existing = self._centralized_ann_cache.get(cache_key) - if existing is None: - self._centralized_ann_cache[cache_key] = ann_index - return ann_index - return existing - - def _get_cached_query_embedding( - self, - query: str, - embedder: Any, - embedder_key: tuple[Any, ...], - ) -> Any: - """Cache repeated query embeddings for the same embedder settings.""" - cache_key = embedder_key + (query,) - - with self._centralized_cache_lock: - cached = self._centralized_query_embedding_cache.get(cache_key) - if cached is not None: - return cached - - query_embedding = embedder.embed_single(query) - with self._centralized_cache_lock: - existing = 
self._centralized_query_embedding_cache.get(cache_key) - if existing is None: - self._centralized_query_embedding_cache[cache_key] = query_embedding - return query_embedding - return existing - - def search( - self, - index_path: Path, - query: str, - limit: int = 20, - enable_fuzzy: bool = True, - enable_vector: bool = False, - pure_vector: bool = False, - enable_lsp_graph: bool = False, - lsp_max_depth: int = 1, - lsp_max_nodes: int = 20, - ) -> List[SearchResult]: - """Execute hybrid search with parallel retrieval and RRF fusion. - - Args: - index_path: Path to _index.db file - query: FTS5 query string (for FTS) or natural language query (for vector) - limit: Maximum results to return after fusion - enable_fuzzy: Enable fuzzy FTS search (default True) - enable_vector: Enable vector search (default False) - pure_vector: If True, only use vector search without FTS fallback (default False) - enable_lsp_graph: If True, enable real-time LSP graph expansion (default False) - lsp_max_depth: Maximum depth for LSP graph BFS expansion (default 1) - lsp_max_nodes: Maximum nodes to collect in LSP graph (default 20) - - Returns: - List of SearchResult objects sorted by fusion score - - Examples: - >>> engine = HybridSearchEngine() - >>> # Hybrid search (exact + fuzzy + vector) - >>> results = engine.search(Path("project/_index.db"), "authentication", - ... enable_vector=True) - >>> # Pure vector search (semantic only) - >>> results = engine.search(Path("project/_index.db"), - ... "how to authenticate users", - ... enable_vector=True, pure_vector=True) - >>> # With LSP graph expansion (real-time) - >>> results = engine.search(Path("project/_index.db"), "auth flow", - ... enable_vector=True, enable_lsp_graph=True) - >>> for r in results[:5]: - ... print(f"{r.path}: {r.score:.3f}") - """ - # Defensive: avoid creating/locking an index database when callers pass - # an empty placeholder file (common in tests and misconfigured callers). 
- try: - if index_path.exists() and index_path.stat().st_size == 0: - return [] - except OSError: - return [] - - # Detect query intent early for category filtering at index level - query_intent = detect_query_intent(query) - lexical_priority_query = query_prefers_lexical_search(query) - # Map intent to category for vector search: - # - KEYWORD (code intent) -> filter to 'code' only - # - SEMANTIC (doc intent) -> no filter (allow docs to surface) - # - MIXED -> no filter (allow all) - vector_category: Optional[str] = None - if query_intent == QueryIntent.KEYWORD: - vector_category = "code" - - # Determine which backends to use - backends = {} - - if pure_vector: - # Pure vector mode: only use vector search, no FTS fallback - if enable_vector: - backends["vector"] = True - else: - # Invalid configuration: pure_vector=True but enable_vector=False - self.logger.warning( - "pure_vector=True requires enable_vector=True. " - "Falling back to exact search. " - "To use pure vector search, enable vector search mode." - ) - backends["exact"] = True - else: - # Standard hybrid mode: FTS + optional vector - backends["exact"] = True - if enable_fuzzy: - backends["fuzzy"] = True - if enable_vector and not lexical_priority_query: - backends["vector"] = True - - # Add LSP graph expansion if requested and available - if enable_lsp_graph and HAS_LSP and not lexical_priority_query: - backends["lsp_graph"] = True - elif enable_lsp_graph and not HAS_LSP: - self.logger.warning( - "LSP graph search requested but dependencies not available. 
" - "Install: pip install aiohttp" - ) - - # Execute parallel searches - with timer("parallel_search_total", self.logger): - results_map = self._search_parallel( - index_path, query, backends, limit, vector_category, - lsp_max_depth, lsp_max_nodes - ) - - # Provide helpful message if pure-vector mode returns no results - if pure_vector and enable_vector and len(results_map.get("vector", [])) == 0: - self.logger.warning( - "Pure vector search returned no results. " - "This usually means embeddings haven't been generated. " - "Run: codexlens embeddings-generate %s", - index_path.parent if index_path.name == "_index.db" else index_path - ) - - # Apply RRF fusion - # Filter weights to only active backends - active_weights = { - source: weight - for source, weight in self._fusion_weights.items() - if source in results_map - } - - # Determine fusion method from config (default: rrf) - fusion_method = "rrf" - rrf_k = 60 - if self._config is not None: - fusion_method = getattr(self._config, "fusion_method", "rrf") or "rrf" - rrf_k = getattr(self._config, "rrf_k", 60) or 60 - - with timer("fusion", self.logger): - adaptive_weights = get_rrf_weights(query, active_weights) - if fusion_method == "simple": - fused_results = simple_weighted_fusion(results_map, adaptive_weights) - else: - # Default to RRF - fused_results = reciprocal_rank_fusion( - results_map, adaptive_weights, k=rrf_k - ) - - # Optional: boost results that include explicit symbol matches - boost_factor = ( - self._config.symbol_boost_factor - if self._config is not None - else 1.5 - ) - with timer("symbol_boost", self.logger): - fused_results = apply_symbol_boost( - fused_results, boost_factor=boost_factor - ) - - # Optional: embedding-based reranking on top results - if ( - self._config is not None - and self._config.enable_reranking - and not lexical_priority_query - ): - with timer("reranking", self.logger): - if self.embedder is None: - with self._centralized_cache_lock: - if self.embedder is None: - 
self.embedder = self._get_reranking_embedder() - fused_results = rerank_results( - query, - fused_results[:100], - self.embedder, - top_k=( - 100 - if self._config.enable_cross_encoder_rerank - else self._config.reranking_top_k - ), - ) - - # Optional: cross-encoder reranking as a second stage - if ( - self._config is not None - and self._config.enable_reranking - and self._config.enable_cross_encoder_rerank - and not lexical_priority_query - ): - with timer("cross_encoder_rerank", self.logger): - if self.reranker is None: - with self._centralized_cache_lock: - if self.reranker is None: - self.reranker = self._get_cross_encoder_reranker() - if self.reranker is not None: - fused_results = cross_encoder_rerank( - query, - fused_results, - self.reranker, - top_k=self._config.reranker_top_k, - ) - - # Apply category filtering to avoid code/doc pollution - # This ensures KEYWORD queries return code files, SEMANTIC queries prefer docs - enable_category_filter = ( - self._config is None - or getattr(self._config, 'enable_category_filter', True) - ) - if enable_category_filter and not pure_vector: - with timer("category_filter", self.logger): - query_intent = detect_query_intent(query) - fused_results = filter_results_by_category( - fused_results, query_intent, allow_mixed=True - ) - - # Apply final limit - return fused_results[:limit] - - def _get_reranking_embedder(self) -> Any: - """Create an embedder for reranking based on Config embedding settings.""" - if self._config is None: - return None - - try: - from codexlens.semantic.factory import get_embedder - except Exception as exc: - self.logger.debug("Reranking embedder unavailable: %s", exc) - return None - - try: - if self._config.embedding_backend == "fastembed": - return get_embedder( - backend="fastembed", - profile=self._config.embedding_model, - use_gpu=self._config.embedding_use_gpu, - ) - if self._config.embedding_backend == "litellm": - return get_embedder( - backend="litellm", - 
model=self._config.embedding_model, - endpoints=self._config.embedding_endpoints, - strategy=self._config.embedding_strategy, - cooldown=self._config.embedding_cooldown, - ) - except Exception as exc: - self.logger.debug("Failed to initialize reranking embedder: %s", exc) - return None - - self.logger.debug( - "Unknown embedding backend for reranking: %s", - self._config.embedding_backend, - ) - return None - - def _get_cross_encoder_reranker(self) -> Any: - if self._config is None: - return None - - try: - from codexlens.semantic.reranker import ( - check_reranker_available, - get_reranker, - ) - except Exception as exc: - self.logger.debug("Reranker factory unavailable: %s", exc) - return None - - backend = (getattr(self._config, "reranker_backend", "") or "").strip().lower() or "onnx" - - ok, err = check_reranker_available(backend) - if not ok: - self.logger.debug( - "Reranker backend unavailable (backend=%s): %s", - backend, - err, - ) - return None - - try: - model_name = (getattr(self._config, "reranker_model", "") or "").strip() or None - - if backend != "legacy" and model_name == "cross-encoder/ms-marco-MiniLM-L-6-v2": - model_name = None - - device: str | None = None - kwargs: dict[str, Any] = {} - reranker_use_gpu = bool( - getattr( - self._config, - "reranker_use_gpu", - getattr(self._config, "embedding_use_gpu", True), - ) - ) - - if backend == "onnx": - kwargs["use_gpu"] = reranker_use_gpu - elif backend == "legacy": - if not reranker_use_gpu: - device = "cpu" - elif backend == "api": - # Pass max_input_tokens for adaptive batching - max_tokens = getattr(self._config, "reranker_max_input_tokens", None) - if max_tokens: - kwargs["max_input_tokens"] = max_tokens - - return get_reranker( - backend=backend, - model_name=model_name, - device=device, - **kwargs, - ) - except Exception as exc: - self.logger.debug( - "Failed to initialize reranker (backend=%s): %s", - backend, - exc, - ) - return None - - def _search_parallel( - self, - index_path: Path, - 
query: str, - backends: Dict[str, bool], - limit: int, - category: Optional[str] = None, - lsp_max_depth: int = 1, - lsp_max_nodes: int = 20, - ) -> Dict[str, List[SearchResult]]: - """Execute parallel searches across enabled backends. - - Args: - index_path: Path to _index.db file - query: FTS5 query string - backends: Dictionary of backend name to enabled flag - limit: Results limit per backend - category: Optional category filter for vector search ('code' or 'doc') - lsp_max_depth: Maximum depth for LSP graph BFS expansion (default 1) - lsp_max_nodes: Maximum nodes to collect in LSP graph (default 20) - - Returns: - Dictionary mapping source name to results list - """ - results_map: Dict[str, List[SearchResult]] = {} - timing_data: Dict[str, float] = {} - - # Use ThreadPoolExecutor for parallel I/O-bound searches - with ThreadPoolExecutor(max_workers=len(backends)) as executor: - # Submit search tasks with timing - future_to_source = {} - submit_times = {} - - if backends.get("exact"): - submit_times["exact"] = time.perf_counter() - future = executor.submit( - self._search_exact, index_path, query, limit - ) - future_to_source[future] = "exact" - - if backends.get("fuzzy"): - submit_times["fuzzy"] = time.perf_counter() - future = executor.submit( - self._search_fuzzy, index_path, query, limit - ) - future_to_source[future] = "fuzzy" - - if backends.get("vector"): - submit_times["vector"] = time.perf_counter() - future = executor.submit( - self._search_vector, index_path, query, limit, category - ) - future_to_source[future] = "vector" - - if backends.get("lsp_graph"): - submit_times["lsp_graph"] = time.perf_counter() - future = executor.submit( - self._search_lsp_graph, index_path, query, limit, - lsp_max_depth, lsp_max_nodes - ) - future_to_source[future] = "lsp_graph" - - # Collect results as they complete with timeout protection - try: - for future in as_completed(future_to_source, timeout=30.0): - source = future_to_source[future] - elapsed_ms = 
(time.perf_counter() - submit_times[source]) * 1000 - timing_data[source] = elapsed_ms - try: - results = future.result(timeout=10.0) - # Tag results with source for debugging - tagged_results = tag_search_source(results, source) - results_map[source] = tagged_results - self.logger.debug( - "[TIMING] %s_search: %.2fms (%d results)", - source, elapsed_ms, len(results) - ) - except (Exception, FuturesTimeoutError) as exc: - self.logger.error("Search failed for %s: %s", source, exc) - results_map[source] = [] - except FuturesTimeoutError: - self.logger.warning("Search timeout: some backends did not respond in time") - # Cancel remaining futures - for future in future_to_source: - future.cancel() - # Set empty results for sources that didn't complete - for source in backends: - if source not in results_map: - results_map[source] = [] - - # Log timing summary - if timing_data: - timing_str = ", ".join(f"{k}={v:.1f}ms" for k, v in timing_data.items()) - self.logger.debug("[TIMING] search_backends: {%s}", timing_str) - - return results_map - - def _search_exact( - self, index_path: Path, query: str, limit: int - ) -> List[SearchResult]: - """Execute exact FTS search using unicode61 tokenizer. - - Args: - index_path: Path to _index.db file - query: FTS5 query string - limit: Maximum results - - Returns: - List of SearchResult objects - """ - try: - with DirIndexStore(index_path) as store: - return store.search_fts_exact( - query, limit=limit, return_full_content=True - ) - except Exception as exc: - self.logger.debug("Exact search error: %s", exc) - return [] - - def _search_fuzzy( - self, index_path: Path, query: str, limit: int - ) -> List[SearchResult]: - """Execute fuzzy FTS search using trigram/extended unicode61 tokenizer. 
- - Args: - index_path: Path to _index.db file - query: FTS5 query string - limit: Maximum results - - Returns: - List of SearchResult objects - """ - try: - with DirIndexStore(index_path) as store: - return store.search_fts_fuzzy( - query, limit=limit, return_full_content=True - ) - except Exception as exc: - self.logger.debug("Fuzzy search error: %s", exc) - return [] - - def _find_vectors_hnsw(self, index_path: Path) -> Optional[Path]: - """Find the centralized _vectors.hnsw file by traversing up from index_path. - - Searches for the centralized dense vector index file in parent directories. - - Args: - index_path: Path to the current _index.db file - - Returns: - Path to _vectors.hnsw if found, None otherwise - """ - current_dir = index_path.parent - for _ in range(10): # Limit search depth - candidate = current_dir / VECTORS_HNSW_NAME - if candidate.exists(): - return candidate - parent = current_dir.parent - if parent == current_dir: # Reached root - break - current_dir = parent - return None - - def _search_vector_centralized( - self, - index_path: Path, - hnsw_path: Path, - query: str, - limit: int, - category: Optional[str] = None, - ) -> List[SearchResult]: - """Search using centralized vector index. 
- - Args: - index_path: Path to _index.db file (for metadata lookup) - hnsw_path: Path to centralized _vectors.hnsw file - query: Natural language query string - limit: Maximum results - category: Optional category filter ('code' or 'doc') - - Returns: - List of SearchResult objects ordered by semantic similarity - """ - try: - index_root = hnsw_path.parent - model_config = self._get_centralized_model_config(index_root) - if model_config is None: - self.logger.debug("Model config not found, will detect from cached embedder") - embedder, embedding_dim, embedder_key = self._get_centralized_embedder(model_config) - - # Load centralized ANN index - start_load = time.perf_counter() - ann_index = self._get_centralized_ann_index(index_root=index_root, dim=embedding_dim) - if ann_index is None: - self.logger.warning("Failed to load centralized vector index from %s", hnsw_path) - return [] - self.logger.debug( - "[TIMING] central_ann_load: %.2fms (%d vectors)", - (time.perf_counter() - start_load) * 1000, - ann_index.count() - ) - - # Generate query embedding - start_embed = time.perf_counter() - query_embedding = self._get_cached_query_embedding(query, embedder, embedder_key) - self.logger.debug( - "[TIMING] query_embedding: %.2fms", - (time.perf_counter() - start_embed) * 1000 - ) - - # Search ANN index - start_search = time.perf_counter() - import numpy as np - query_vec = np.array(query_embedding, dtype=np.float32) - ids, distances = ann_index.search(query_vec, top_k=limit * 2) # Fetch extra for filtering - self.logger.debug( - "[TIMING] central_ann_search: %.2fms (%d results)", - (time.perf_counter() - start_search) * 1000, - len(ids) if ids else 0 - ) - - if not ids: - return [] - - # Convert distances to similarity scores (for cosine: score = 1 - distance) - scores = [self._clamp_search_score(1.0 - d) for d in distances] - - # Fetch chunk metadata from semantic_chunks tables - # We need to search across all _index.db files in the project - results = 
self._fetch_chunks_by_ids_centralized( - index_root, ids, scores, category - ) - - return results[:limit] - - except ImportError as exc: - self.logger.debug("Semantic dependencies not available: %s", exc) - return [] - except Exception as exc: - self.logger.error("Centralized vector search error: %s", exc) - return [] - - def _fetch_chunks_by_ids_centralized( - self, - index_root: Path, - chunk_ids: List[int], - scores: List[float], - category: Optional[str] = None, - ) -> List[SearchResult]: - """Fetch chunk metadata from centralized _vectors_meta.db for fast lookup. - - This method uses the centralized VectorMetadataStore for O(1) lookup - instead of traversing all _index.db files (O(n) where n = number of indexes). - - Falls back to the legacy per-index lookup if centralized metadata is unavailable. - - Args: - index_root: Root directory containing _vectors_meta.db - chunk_ids: List of chunk IDs from ANN search - scores: Corresponding similarity scores - category: Optional category filter - - Returns: - List of SearchResult objects - """ - from codexlens.config import VECTORS_META_DB_NAME - - # Build score map - score_map = {cid: score for cid, score in zip(chunk_ids, scores)} - - # Try centralized metadata store first (fast path) - vectors_meta_path = index_root / VECTORS_META_DB_NAME - if vectors_meta_path.exists(): - try: - return self._fetch_from_vector_meta_store( - vectors_meta_path, chunk_ids, score_map, category - ) - except Exception as e: - self.logger.warning( - "Centralized metadata lookup failed, falling back to legacy traversal: %s. 
" - "Consider regenerating embeddings with: codexlens embeddings-generate --centralized", - e - ) - - # Fallback: traverse _index.db files (legacy path) - return self._fetch_chunks_by_ids_legacy( - index_root, chunk_ids, score_map, category - ) - - def _fetch_from_vector_meta_store( - self, - meta_db_path: Path, - chunk_ids: List[int], - score_map: Dict[int, float], - category: Optional[str] = None, - ) -> List[SearchResult]: - """Fetch chunks from centralized VectorMetadataStore. - - Args: - meta_db_path: Path to _vectors_meta.db - chunk_ids: List of chunk IDs to fetch - score_map: Mapping of chunk_id to score - category: Optional category filter - - Returns: - List of SearchResult objects - """ - from codexlens.storage.vector_meta_store import VectorMetadataStore - - results = [] - - with VectorMetadataStore(meta_db_path) as meta_store: - rows = meta_store.get_chunks_by_ids(chunk_ids, category=category) - - for row in rows: - chunk_id = row["chunk_id"] - file_path = row["file_path"] - content = row["content"] or "" - metadata = row.get("metadata") or {} - start_line = row.get("start_line") - end_line = row.get("end_line") - - score = self._clamp_search_score(score_map.get(chunk_id, 0.0)) - - # Build excerpt - excerpt = content[:200] + "..." 
if len(content) > 200 else content - - # Extract symbol information - symbol_name = metadata.get("symbol_name") - symbol_kind = metadata.get("symbol_kind") - - # Build Symbol object if available - symbol = None - if symbol_name and symbol_kind and start_line and end_line: - try: - from codexlens.entities import Symbol - symbol = Symbol( - name=symbol_name, - kind=symbol_kind, - range=(start_line, end_line) - ) - except Exception: - pass - - results.append(SearchResult( - path=file_path, - score=score, - excerpt=excerpt, - content=content, - symbol=symbol, - metadata=metadata, - start_line=start_line, - end_line=end_line, - symbol_name=symbol_name, - symbol_kind=symbol_kind, - )) - - # Sort by score descending - results.sort(key=lambda r: r.score, reverse=True) - return results - - def _fetch_chunks_by_ids_legacy( - self, - index_root: Path, - chunk_ids: List[int], - score_map: Dict[int, float], - category: Optional[str] = None, - ) -> List[SearchResult]: - """Legacy fallback: fetch chunk metadata by traversing all _index.db files. - - This is the O(n) fallback path used when centralized metadata is unavailable. 
- - Args: - index_root: Root directory containing _index.db files - chunk_ids: List of chunk IDs from ANN search - score_map: Mapping of chunk_id to score - category: Optional category filter - - Returns: - List of SearchResult objects - """ - import sqlite3 - import json - - # Find all _index.db files - index_files = filter_index_paths(index_root.rglob("_index.db"), index_root) - - results = [] - found_ids = set() - - for index_path in index_files: - try: - with sqlite3.connect(index_path) as conn: - conn.row_factory = sqlite3.Row - - # Check if semantic_chunks table exists - cursor = conn.execute( - "SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_chunks'" - ) - if cursor.fetchone() is None: - continue - - # Build query for chunk IDs we haven't found yet - remaining_ids = [cid for cid in chunk_ids if cid not in found_ids] - if not remaining_ids: - break - - placeholders = ",".join("?" * len(remaining_ids)) - - if category: - query = f""" - SELECT id, file_path, content, metadata - FROM semantic_chunks - WHERE id IN ({placeholders}) AND category = ? - """ - params = remaining_ids + [category] - else: - query = f""" - SELECT id, file_path, content, metadata - FROM semantic_chunks - WHERE id IN ({placeholders}) - """ - params = remaining_ids - - rows = conn.execute(query, params).fetchall() - - for row in rows: - chunk_id = row["id"] - if chunk_id in found_ids: - continue - found_ids.add(chunk_id) - - file_path = row["file_path"] - content = row["content"] - metadata_json = row["metadata"] - metadata = json.loads(metadata_json) if metadata_json else {} - - score = self._clamp_search_score(score_map.get(chunk_id, 0.0)) - - # Build excerpt - excerpt = content[:200] + "..." 
if len(content) > 200 else content - - # Extract symbol information - symbol_name = metadata.get("symbol_name") - symbol_kind = metadata.get("symbol_kind") - start_line = metadata.get("start_line") - end_line = metadata.get("end_line") - - # Build Symbol object if available - symbol = None - if symbol_name and symbol_kind and start_line and end_line: - try: - from codexlens.entities import Symbol - symbol = Symbol( - name=symbol_name, - kind=symbol_kind, - range=(start_line, end_line) - ) - except Exception: - pass - - results.append(SearchResult( - path=file_path, - score=score, - excerpt=excerpt, - content=content, - symbol=symbol, - metadata=metadata, - start_line=start_line, - end_line=end_line, - symbol_name=symbol_name, - symbol_kind=symbol_kind, - )) - - except Exception as e: - self.logger.debug("Failed to fetch chunks from %s: %s", index_path, e) - continue - - # Sort by score descending - results.sort(key=lambda r: r.score, reverse=True) - return results - - def _search_vector( - self, index_path: Path, query: str, limit: int, category: Optional[str] = None - ) -> List[SearchResult]: - """Execute vector similarity search using semantic embeddings. - - Supports both centralized vector storage (single _vectors.hnsw at project root) - and distributed storage (per-directory .hnsw files). 
- - Args: - index_path: Path to _index.db file - query: Natural language query string - limit: Maximum results - category: Optional category filter ('code' or 'doc') - - Returns: - List of SearchResult objects ordered by semantic similarity - """ - try: - # First, check for centralized vector index - central_hnsw_path = self._find_vectors_hnsw(index_path) - if central_hnsw_path is not None: - self.logger.debug("Found centralized vector index at %s", central_hnsw_path) - return self._search_vector_centralized( - index_path, central_hnsw_path, query, limit, category - ) - - # Fallback to distributed (per-index) vector storage - # Check if semantic chunks table exists - import sqlite3 - - start_check = time.perf_counter() - try: - with sqlite3.connect(index_path) as conn: - cursor = conn.execute( - "SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_chunks'" - ) - has_semantic_table = cursor.fetchone() is not None - except sqlite3.Error as e: - self.logger.error("Database check failed in vector search: %s", e) - return [] - self.logger.debug( - "[TIMING] vector_table_check: %.2fms", - (time.perf_counter() - start_check) * 1000 - ) - - if not has_semantic_table: - self.logger.info( - "No embeddings found in index. " - "Generate embeddings with: codexlens embeddings-generate %s", - index_path.parent if index_path.name == "_index.db" else index_path - ) - return [] - - # Initialize embedder and vector store - from codexlens.semantic.factory import get_embedder - from codexlens.semantic.vector_store import VectorStore - - start_init = time.perf_counter() - vector_store = VectorStore(index_path) - self.logger.debug( - "[TIMING] vector_store_init: %.2fms", - (time.perf_counter() - start_init) * 1000 - ) - - # Check if vector store has data - if vector_store.count_chunks() == 0: - self.logger.info( - "Vector store is empty (0 chunks). 
" - "Generate embeddings with: codexlens embeddings-generate %s", - index_path.parent if index_path.name == "_index.db" else index_path - ) - return [] - - # Get stored model configuration (preferred) or auto-detect from dimension - start_embedder = time.perf_counter() - model_config = vector_store.get_model_config() - if model_config: - backend = model_config.get("backend", "fastembed") - model_name = model_config["model_name"] - model_profile = model_config["model_profile"] - self.logger.debug( - "Using stored model config: %s backend, %s (%s, %dd)", - backend, model_profile, model_name, model_config["embedding_dim"] - ) - - # Get embedder based on backend - if backend == "litellm": - embedder = get_embedder(backend="litellm", model=model_name) - else: - embedder = get_embedder(backend="fastembed", profile=model_profile) - else: - # Fallback: auto-detect from embedding dimension - detected_dim = vector_store.dimension - if detected_dim is None: - self.logger.info("Vector store dimension unknown, using default profile") - embedder = get_embedder(backend="fastembed", profile="code") - elif detected_dim == 384: - embedder = get_embedder(backend="fastembed", profile="fast") - elif detected_dim == 768: - embedder = get_embedder(backend="fastembed", profile="code") - elif detected_dim == 1024: - embedder = get_embedder(backend="fastembed", profile="multilingual") - elif detected_dim == 1536: - # Likely OpenAI text-embedding-3-small or ada-002 - self.logger.info( - "Detected 1536-dim embeddings (likely OpenAI), using litellm backend with text-embedding-3-small" - ) - embedder = get_embedder(backend="litellm", model="text-embedding-3-small") - elif detected_dim == 3072: - # Likely OpenAI text-embedding-3-large - self.logger.info( - "Detected 3072-dim embeddings (likely OpenAI), using litellm backend with text-embedding-3-large" - ) - embedder = get_embedder(backend="litellm", model="text-embedding-3-large") - else: - self.logger.debug( - "Unknown dimension %s, using 
default fastembed profile 'code'", - detected_dim - ) - embedder = get_embedder(backend="fastembed", profile="code") - self.logger.debug( - "[TIMING] embedder_init: %.2fms", - (time.perf_counter() - start_embedder) * 1000 - ) - - # Generate query embedding - start_embed = time.perf_counter() - query_embedding = embedder.embed_single(query) - self.logger.debug( - "[TIMING] query_embedding: %.2fms", - (time.perf_counter() - start_embed) * 1000 - ) - - # Search for similar chunks - start_search = time.perf_counter() - results = vector_store.search_similar( - query_embedding=query_embedding, - top_k=limit, - min_score=0.0, # Return all results, let RRF handle filtering - return_full_content=True, - category=category, - ) - self.logger.debug( - "[TIMING] vector_similarity_search: %.2fms (%d results)", - (time.perf_counter() - start_search) * 1000, len(results) - ) - - return results - - except ImportError as exc: - self.logger.debug("Semantic dependencies not available: %s", exc) - return [] - except Exception as exc: - self.logger.error("Vector search error: %s", exc) - return [] - - def _search_lsp_graph( - self, - index_path: Path, - query: str, - limit: int, - max_depth: int = 1, - max_nodes: int = 20, - ) -> List[SearchResult]: - """Execute LSP-based graph expansion search. - - Uses real-time LSP to expand from seed results and find related code. - This provides accurate, up-to-date code relationships. - - Args: - index_path: Path to _index.db file - query: Natural language query string - limit: Maximum results - max_depth: Maximum depth for LSP graph BFS expansion (default 1) - max_nodes: Maximum nodes to collect in LSP graph (default 20) - - Returns: - List of SearchResult from graph expansion - """ - import asyncio - - if not HAS_LSP: - self.logger.debug("LSP dependencies not available") - return [] - - try: - # Try multiple seed sources in priority order - seeds = [] - seed_source = "none" - - # 1. 
Try vector search first (best semantic match) - seeds = self._search_vector(index_path, query, limit=3, category="code") - if seeds: - seed_source = "vector" - - # 2. Fallback to exact FTS if vector returns nothing - if not seeds: - self.logger.debug("Vector search returned no seeds, trying exact FTS") - seeds = self._search_exact(index_path, query, limit=3) - if seeds: - seed_source = "exact_fts" - - # 3. No seeds available from any source - if not seeds: - self.logger.debug("No seed results available for LSP graph expansion") - return [] - - self.logger.debug( - "LSP graph expansion using %d seeds from %s", - len(seeds), - seed_source, - ) - - # Convert SearchResult to CodeSymbolNode for LSP processing - from codexlens.hybrid_search.data_structures import CodeSymbolNode, Range - - seed_nodes = [] - for seed in seeds: - try: - node = CodeSymbolNode( - id=f"{seed.path}:{seed.symbol_name or 'unknown'}:{seed.start_line or 0}", - name=seed.symbol_name or "unknown", - kind=seed.symbol_kind or "unknown", - file_path=seed.path, - range=Range( - start_line=seed.start_line or 1, - start_character=0, - end_line=seed.end_line or seed.start_line or 1, - end_character=0, - ), - raw_code=seed.content or "", - docstring=seed.excerpt or "", - ) - seed_nodes.append(node) - except Exception as e: - self.logger.debug("Failed to create seed node: %s", e) - continue - - if not seed_nodes: - return [] - - # Run async LSP expansion in sync context - async def expand_graph(): - async with LspBridge() as bridge: - builder = LspGraphBuilder(max_depth=max_depth, max_nodes=max_nodes) - graph = await builder.build_from_seeds(seed_nodes, bridge) - return graph - - # Run the async code - try: - loop = asyncio.get_event_loop() - if loop.is_running(): - # Already in async context - use run_coroutine_threadsafe - import concurrent.futures - future = asyncio.run_coroutine_threadsafe(expand_graph(), loop) - graph = future.result(timeout=5.0) - else: - graph = loop.run_until_complete(expand_graph()) 
- except RuntimeError: - # No event loop - create new one - graph = asyncio.run(expand_graph()) - - # Convert graph nodes to SearchResult - # Create set of seed identifiers for fast lookup - seed_ids = set() - for seed in seeds: - seed_id = f"{seed.path}:{seed.symbol_name or 'unknown'}:{seed.start_line or 0}" - seed_ids.add(seed_id) - - results = [] - for node_id, node in graph.nodes.items(): - # Skip seed nodes using ID comparison (already in other results) - if node_id in seed_ids or node.id in seed_ids: - continue - - # Calculate score based on graph position - # Nodes closer to seeds get higher scores - depth = 1 # Simple heuristic, could be improved - score = 0.8 / (1 + depth) # Score decreases with depth - - results.append(SearchResult( - path=node.file_path, - score=score, - excerpt=node.docstring[:200] if node.docstring else node.raw_code[:200] if node.raw_code else "", - content=node.raw_code, - symbol=None, - metadata={"lsp_node_id": node_id, "lsp_kind": node.kind}, - start_line=node.range.start_line, - end_line=node.range.end_line, - symbol_name=node.name, - symbol_kind=node.kind, - )) - - # Sort by score - results.sort(key=lambda r: r.score, reverse=True) - return results[:limit] - - except Exception as exc: - self.logger.debug("LSP graph search error: %s", exc) - return [] diff --git a/codex-lens/src/codexlens/search/query_parser.py b/codex-lens/src/codexlens/search/query_parser.py deleted file mode 100644 index 05b337f5..00000000 --- a/codex-lens/src/codexlens/search/query_parser.py +++ /dev/null @@ -1,242 +0,0 @@ -"""Query preprocessing for CodexLens search. 
- -Provides query expansion for better identifier matching: -- CamelCase splitting: UserAuth → User OR Auth -- snake_case splitting: user_auth → user OR auth -- Preserves original query for exact matching -""" - -from __future__ import annotations - -import logging -import re -from typing import Set, List - -log = logging.getLogger(__name__) - - -class QueryParser: - """Parser for preprocessing search queries before FTS5 execution. - - Expands identifier-style queries (CamelCase, snake_case) into OR queries - to improve recall when searching for code symbols. - - Example transformations: - - 'UserAuth' → 'UserAuth OR User OR Auth' - - 'user_auth' → 'user_auth OR user OR auth' - - 'getUserData' → 'getUserData OR get OR User OR Data' - """ - - # Patterns for identifier splitting - CAMEL_CASE_PATTERN = re.compile(r'([a-z])([A-Z])') - SNAKE_CASE_PATTERN = re.compile(r'_+') - KEBAB_CASE_PATTERN = re.compile(r'-+') - - # Minimum token length to include in expansion (avoid noise from single chars) - MIN_TOKEN_LENGTH = 2 - - # All-caps acronyms pattern (e.g., HTTP, SQL, API) - ALL_CAPS_PATTERN = re.compile(r'^[A-Z]{2,}$') - - def __init__(self, enable: bool = True, min_token_length: int = 2): - """Initialize query parser. - - Args: - enable: Whether to enable query preprocessing - min_token_length: Minimum token length to include in expansion - """ - self.enable = enable - self.min_token_length = min_token_length - - def preprocess_query(self, query: str) -> str: - """Preprocess query with identifier expansion. 
- - Args: - query: Original search query - - Returns: - Expanded query with OR operator connecting original and split tokens - - Example: - >>> parser = QueryParser() - >>> parser.preprocess_query('UserAuth') - 'UserAuth OR User OR Auth' - >>> parser.preprocess_query('get_user_data') - 'get_user_data OR get OR user OR data' - """ - if not self.enable: - return query - - query = query.strip() - if not query: - return query - - # Extract tokens from query (handle multiple words/terms) - # For simple queries, just process the whole thing - # For complex FTS5 queries with operators, preserve structure - if self._is_simple_query(query): - return self._expand_simple_query(query) - else: - # Complex query with FTS5 operators, don't expand - log.debug(f"Skipping expansion for complex FTS5 query: {query}") - return query - - def _is_simple_query(self, query: str) -> bool: - """Check if query is simple (no FTS5 operators). - - Args: - query: Search query - - Returns: - True if query is simple (safe to expand), False otherwise - """ - # Check for FTS5 operators that indicate complex query - fts5_operators = ['OR', 'AND', 'NOT', 'NEAR', '*', '^', '"'] - return not any(op in query for op in fts5_operators) - - def _expand_simple_query(self, query: str) -> str: - """Expand a simple query with identifier splitting. 
- - Args: - query: Simple search query - - Returns: - Expanded query with OR operators - """ - tokens: Set[str] = set() - - # Always include original query - tokens.add(query) - - # Split on whitespace first - words = query.split() - - for word in words: - # Extract tokens from this word - word_tokens = self._extract_tokens(word) - tokens.update(word_tokens) - - # Filter out short tokens and duplicates - filtered_tokens = [ - t for t in tokens - if len(t) >= self.min_token_length - ] - - # Remove duplicates while preserving original query first - unique_tokens: List[str] = [] - seen: Set[str] = set() - - # Always put original query first - if query not in seen and len(query) >= self.min_token_length: - unique_tokens.append(query) - seen.add(query) - - # Add other tokens - for token in filtered_tokens: - if token not in seen: - unique_tokens.append(token) - seen.add(token) - - # Join with OR operator (only if we have multiple tokens) - if len(unique_tokens) > 1: - expanded = ' OR '.join(unique_tokens) - log.debug(f"Expanded query: '{query}' → '{expanded}'") - return expanded - else: - return query - - def _extract_tokens(self, word: str) -> Set[str]: - """Extract tokens from a single word using various splitting strategies. - - Args: - word: Single word/identifier to split - - Returns: - Set of extracted tokens - """ - tokens: Set[str] = set() - - # Add original word - tokens.add(word) - - # Handle all-caps acronyms (don't split) - if self.ALL_CAPS_PATTERN.match(word): - return tokens - - # CamelCase splitting - camel_tokens = self._split_camel_case(word) - tokens.update(camel_tokens) - - # snake_case splitting - snake_tokens = self._split_snake_case(word) - tokens.update(snake_tokens) - - # kebab-case splitting - kebab_tokens = self._split_kebab_case(word) - tokens.update(kebab_tokens) - - return tokens - - def _split_camel_case(self, word: str) -> List[str]: - """Split CamelCase identifier into tokens. 
- - Args: - word: CamelCase identifier (e.g., 'getUserData') - - Returns: - List of tokens (e.g., ['get', 'User', 'Data']) - """ - # Insert space before uppercase letters preceded by lowercase - spaced = self.CAMEL_CASE_PATTERN.sub(r'\1 \2', word) - # Split on spaces and filter empty - return [t for t in spaced.split() if t] - - def _split_snake_case(self, word: str) -> List[str]: - """Split snake_case identifier into tokens. - - Args: - word: snake_case identifier (e.g., 'get_user_data') - - Returns: - List of tokens (e.g., ['get', 'user', 'data']) - """ - # Split on underscores - return [t for t in self.SNAKE_CASE_PATTERN.split(word) if t] - - def _split_kebab_case(self, word: str) -> List[str]: - """Split kebab-case identifier into tokens. - - Args: - word: kebab-case identifier (e.g., 'get-user-data') - - Returns: - List of tokens (e.g., ['get', 'user', 'data']) - """ - # Split on hyphens - return [t for t in self.KEBAB_CASE_PATTERN.split(word) if t] - - -# Global default parser instance -_default_parser = QueryParser(enable=True) - - -def preprocess_query(query: str, enable: bool = True) -> str: - """Convenience function for query preprocessing. - - Args: - query: Original search query - enable: Whether to enable preprocessing - - Returns: - Preprocessed query with identifier expansion - """ - if not enable: - return query - - return _default_parser.preprocess_query(query) - - -__all__ = [ - "QueryParser", - "preprocess_query", -] diff --git a/codex-lens/src/codexlens/search/ranking.py b/codex-lens/src/codexlens/search/ranking.py deleted file mode 100644 index 5c6bf346..00000000 --- a/codex-lens/src/codexlens/search/ranking.py +++ /dev/null @@ -1,1701 +0,0 @@ -"""Ranking algorithms for hybrid search result fusion. - -Implements Reciprocal Rank Fusion (RRF) and score normalization utilities -for combining results from heterogeneous search backends (exact FTS, fuzzy FTS, vector search). 
-""" - -from __future__ import annotations - -import logging -import re -import math -from enum import Enum -from pathlib import Path -from typing import Any, Dict, List, Optional - -from codexlens.entities import SearchResult, AdditionalLocation - -logger = logging.getLogger(__name__) - - -# Default RRF weights for hybrid search -DEFAULT_WEIGHTS = { - "exact": 0.25, - "fuzzy": 0.1, - "vector": 0.5, - "lsp_graph": 0.15, -} - - -class QueryIntent(str, Enum): - """Query intent for adaptive RRF weights (Python/TypeScript parity).""" - - KEYWORD = "keyword" - SEMANTIC = "semantic" - MIXED = "mixed" - - -_TEST_QUERY_RE = re.compile( - r"\b(test|tests|spec|specs|fixture|fixtures|benchmark|benchmarks)\b", - flags=re.IGNORECASE, -) -_AUXILIARY_QUERY_RE = re.compile( - r"\b(example|examples|demo|demos|sample|samples|debug|benchmark|benchmarks|profile|profiling)\b", - flags=re.IGNORECASE, -) -_ARTIFACT_QUERY_RE = re.compile( - r"(? Dict[str, float | None]: - """Normalize weights to sum to 1.0 (best-effort).""" - total = sum(float(v) for v in weights.values() if v is not None) - - # NaN total: do not attempt to normalize (division would propagate NaNs). - if math.isnan(total): - return dict(weights) - - # Infinite total: do not attempt to normalize (division yields 0 or NaN). - if not math.isfinite(total): - return dict(weights) - - # Zero/negative total: do not attempt to normalize (invalid denominator). - if total <= 0: - return dict(weights) - - return {k: (float(v) / total if v is not None else None) for k, v in weights.items()} - - -def detect_query_intent(query: str) -> QueryIntent: - """Detect whether a query is code-like, natural-language, or mixed. - - Heuristic signals kept aligned with `ccw/src/tools/smart-search.ts`. 
- """ - trimmed = (query or "").strip() - if not trimmed: - return QueryIntent.MIXED - - lower = trimmed.lower() - word_count = len([w for w in re.split(r"\s+", trimmed) if w]) - - has_code_signals = bool( - re.search(r"(::|->|\.)", trimmed) - or re.search(r"[A-Z][a-z]+[A-Z]", trimmed) - or re.search(r"\b[a-z]+[A-Z][A-Za-z0-9_]*\b", trimmed) - or re.search(r"\b\w+_\w+\b", trimmed) - or re.search( - r"\b(def|class|function|const|let|var|import|from|return|async|await|interface|type)\b", - lower, - flags=re.IGNORECASE, - ) - ) - has_natural_signals = bool( - word_count > 5 - or "?" in trimmed - or re.search(r"\b(how|what|why|when|where)\b", trimmed, flags=re.IGNORECASE) - or re.search( - r"\b(handle|explain|fix|implement|create|build|use|find|search|convert|parse|generate|support)\b", - trimmed, - flags=re.IGNORECASE, - ) - ) - - if has_code_signals and has_natural_signals: - return QueryIntent.MIXED - if has_code_signals: - return QueryIntent.KEYWORD - if has_natural_signals: - return QueryIntent.SEMANTIC - return QueryIntent.MIXED - - -def adjust_weights_by_intent( - intent: QueryIntent, - base_weights: Dict[str, float], -) -> Dict[str, float]: - """Adjust RRF weights based on query intent.""" - if intent == QueryIntent.KEYWORD: - target = {"exact": 0.5, "fuzzy": 0.1, "vector": 0.4} - elif intent == QueryIntent.SEMANTIC: - target = {"exact": 0.2, "fuzzy": 0.1, "vector": 0.7} - else: - target = dict(base_weights) - - # Filter to active backends - keys = list(base_weights.keys()) - filtered = {k: float(target.get(k, 0.0)) for k in keys} - return normalize_weights(filtered) - - -def get_rrf_weights( - query: str, - base_weights: Dict[str, float], -) -> Dict[str, float]: - """Compute adaptive RRF weights from query intent.""" - return adjust_weights_by_intent(detect_query_intent(query), base_weights) - - -def query_targets_test_files(query: str) -> bool: - """Return True when the query explicitly targets tests/spec fixtures.""" - return 
bool(_TEST_QUERY_RE.search((query or "").strip())) - - -def query_targets_generated_files(query: str) -> bool: - """Return True when the query explicitly targets generated/build artifacts.""" - return bool(_ARTIFACT_QUERY_RE.search((query or "").strip())) - - -def query_targets_auxiliary_files(query: str) -> bool: - """Return True when the query explicitly targets examples, benchmarks, or debug files.""" - return bool(_AUXILIARY_QUERY_RE.search((query or "").strip())) - - -def query_prefers_lexical_search(query: str) -> bool: - """Return True when config/env/factory style queries are safer with lexical-first search.""" - trimmed = (query or "").strip() - if not trimmed: - return False - - if _ENV_STYLE_QUERY_RE.search(trimmed): - return True - - query_tokens = set(_semantic_query_topic_tokens(trimmed)) - if not query_tokens: - return False - - if query_tokens.intersection({"factory", "factories"}): - return True - - if query_tokens.intersection({"environment", "env"}) and query_tokens.intersection({"variable", "variables"}): - return True - - if "backend" in query_tokens and query_tokens.intersection( - {"embedding", "embeddings", "reranker", "rerankers", "onnx", "api", "litellm", "fastembed", "local", "legacy"} - ): - return True - - surface_hits = query_tokens.intersection(_LEXICAL_PRIORITY_SURFACE_TOKENS) - focus_hits = query_tokens.intersection(_LEXICAL_PRIORITY_FOCUS_TOKENS) - return bool(surface_hits and focus_hits) - - -def _normalized_path_parts(path: str) -> List[str]: - """Normalize a path string into casefolded components for heuristics.""" - normalized = (path or "").replace("\\", "/") - return [part.casefold() for part in normalized.split("/") if part and part != "."] - - -# File extensions to category mapping for fast lookup -_EXT_TO_CATEGORY: Dict[str, str] = { - # Code extensions - ".py": "code", ".js": "code", ".jsx": "code", ".ts": "code", ".tsx": "code", - ".java": "code", ".go": "code", ".zig": "code", ".m": "code", ".mm": "code", - ".c": 
"code", ".h": "code", ".cc": "code", ".cpp": "code", ".hpp": "code", ".cxx": "code", - ".rs": "code", - # Doc extensions - ".md": "doc", ".mdx": "doc", ".txt": "doc", ".rst": "doc", -} - - -def get_file_category(path: str) -> Optional[str]: - """Get file category ('code' or 'doc') from path extension. - - Args: - path: File path string - - Returns: - 'code', 'doc', or None if unknown - """ - ext = Path(path).suffix.lower() - return _EXT_TO_CATEGORY.get(ext) - - -def filter_results_by_category( - results: List[SearchResult], - intent: QueryIntent, - allow_mixed: bool = True, -) -> List[SearchResult]: - """Filter results by category based on query intent. - - Strategy: - - KEYWORD (code intent): Only return code files - - SEMANTIC (doc intent): Prefer docs, but allow code if allow_mixed=True - - MIXED: Return all results - - Args: - results: List of SearchResult objects - intent: Query intent from detect_query_intent() - allow_mixed: If True, SEMANTIC intent includes code files with lower priority - - Returns: - Filtered and re-ranked list of SearchResult objects - """ - if not results or intent == QueryIntent.MIXED: - return results - - code_results = [] - doc_results = [] - unknown_results = [] - - for r in results: - category = get_file_category(r.path) - if category == "code": - code_results.append(r) - elif category == "doc": - doc_results.append(r) - else: - unknown_results.append(r) - - if intent == QueryIntent.KEYWORD: - # Code intent: return only code files + unknown (might be code) - filtered = code_results + unknown_results - elif intent == QueryIntent.SEMANTIC: - if allow_mixed: - # Semantic intent with mixed: docs first, then code - filtered = doc_results + code_results + unknown_results - else: - # Semantic intent strict: only docs - filtered = doc_results + unknown_results - else: - filtered = results - - return filtered - - -def is_test_file(path: str) -> bool: - """Return True when a path clearly refers to a test/spec file.""" - parts = 
_normalized_path_parts(path) - if not parts: - return False - basename = parts[-1] - return ( - basename.startswith("test_") - or basename.endswith("_test.py") - or basename.endswith(".test.ts") - or basename.endswith(".test.tsx") - or basename.endswith(".test.js") - or basename.endswith(".test.jsx") - or basename.endswith(".spec.ts") - or basename.endswith(".spec.tsx") - or basename.endswith(".spec.js") - or basename.endswith(".spec.jsx") - or "tests" in parts[:-1] - or "test" in parts[:-1] - or "__fixtures__" in parts[:-1] - or "fixtures" in parts[:-1] - ) - - -def is_generated_artifact_path(path: str) -> bool: - """Return True when a path clearly points at generated/build artifacts.""" - parts = _normalized_path_parts(path) - if not parts: - return False - basename = parts[-1] - return any(part in _GENERATED_DIR_NAMES for part in parts[:-1]) or basename.endswith( - _GENERATED_FILE_SUFFIXES - ) - - -def is_auxiliary_reference_path(path: str) -> bool: - """Return True for examples, benchmarks, demos, and debug helper files.""" - parts = _normalized_path_parts(path) - if not parts: - return False - basename = parts[-1] - if any(part in _AUXILIARY_DIR_NAMES for part in parts[:-1]): - return True - return ( - basename.startswith("debug_") - or basename.startswith("benchmark") - or basename.startswith("profile_") - or "_benchmark" in basename - or "_profile" in basename - ) - - -def _extract_identifier_query(query: str) -> Optional[str]: - """Return a single-token identifier query when definition boosting is safe.""" - trimmed = (query or "").strip() - if not trimmed or " " in trimmed: - return None - if not _IDENTIFIER_QUERY_RE.fullmatch(trimmed): - return None - return trimmed - - -def extract_explicit_path_hints(query: str) -> List[List[str]]: - """Extract explicit path/file hints from separator-style query tokens. 
- - Natural-language queries often contain one or two high-signal feature/file - hints such as ``smart_search`` or ``smart-search.ts`` alongside broader - platform words like ``CodexLens``. These hints should be treated as more - specific than the surrounding prose. - """ - hints: List[List[str]] = [] - seen: set[tuple[str, ...]] = set() - for raw_part in re.split(r"\s+", query or ""): - candidate = raw_part.strip().strip("\"'`()[]{}<>:,;") - if not candidate or not _EXPLICIT_PATH_HINT_MARKER_RE.search(candidate): - continue - tokens = [ - token - for token in _split_identifier_like_tokens(candidate) - if token not in _PATH_TOPIC_STOPWORDS - ] - if len(tokens) < 2: - continue - key = tuple(tokens) - if key in seen: - continue - seen.add(key) - hints.append(list(key)) - return hints - - -def _is_source_implementation_path(path: str) -> bool: - """Return True when a path looks like an implementation file under a source dir.""" - parts = _normalized_path_parts(path) - if not parts: - return False - return any(part in _SOURCE_DIR_NAMES for part in parts[:-1]) - - -def _result_text_candidates(result: SearchResult) -> List[str]: - """Collect short text snippets that may contain a symbol definition.""" - candidates: List[str] = [] - for text in (result.excerpt, result.content): - if not isinstance(text, str) or not text.strip(): - continue - for line in text.splitlines(): - stripped = line.strip() - if stripped: - candidates.append(stripped) - if len(candidates) >= 6: - break - if len(candidates) >= 6: - break - - symbol_name = result.symbol_name - if not symbol_name and result.symbol is not None: - symbol_name = getattr(result.symbol, "name", None) - if isinstance(symbol_name, str) and symbol_name.strip(): - candidates.append(symbol_name.strip()) - return candidates - - -def _result_defines_identifier(result: SearchResult, symbol: str) -> bool: - """Best-effort check for whether a result snippet looks like a symbol definition.""" - escaped_symbol = re.escape(symbol) - 
definition_patterns = ( - rf"^\s*(?:export\s+)?(?:default\s+)?(?:async\s+)?def\s+{escaped_symbol}\b", - rf"^\s*(?:export\s+)?(?:default\s+)?(?:async\s+)?function\s+{escaped_symbol}\b", - rf"^\s*(?:export\s+)?(?:default\s+)?class\s+{escaped_symbol}\b", - rf"^\s*(?:export\s+)?(?:default\s+)?interface\s+{escaped_symbol}\b", - rf"^\s*(?:export\s+)?(?:default\s+)?type\s+{escaped_symbol}\b", - rf"^\s*(?:export\s+)?(?:default\s+)?(?:const|let|var)\s+{escaped_symbol}\b", - rf"^\s*{escaped_symbol}\s*=\s*(?:async\s+)?\(", - rf"^\s*{escaped_symbol}\s*=\s*(?:async\s+)?[^=]*=>", - ) - for candidate in _result_text_candidates(result): - if any(re.search(pattern, candidate) for pattern in definition_patterns): - return True - return False - - -def _split_identifier_like_tokens(text: str) -> List[str]: - """Split identifier-like text into normalized word tokens.""" - if not text: - return [] - - tokens: List[str] = [] - for raw_token in _TOPIC_TOKEN_RE.findall(text): - expanded = re.sub(r"([a-z0-9])([A-Z])", r"\1 \2", raw_token) - expanded = re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1 \2", expanded) - for token in expanded.split(): - normalized = _normalize_topic_token(token) - if normalized: - tokens.append(normalized) - return tokens - - -def _normalize_topic_token(token: str) -> Optional[str]: - """Normalize lightweight topic tokens for query/path overlap heuristics.""" - normalized = (token or "").casefold() - if len(normalized) < 2 or normalized.isdigit(): - return None - if len(normalized) > 4 and normalized.endswith("ies"): - normalized = f"{normalized[:-3]}y" - elif len(normalized) > 3 and normalized.endswith("s") and not normalized.endswith("ss"): - normalized = normalized[:-1] - return normalized or None - - -def _dedupe_preserve_order(tokens: List[str]) -> List[str]: - """Deduplicate tokens while preserving the first-seen order.""" - deduped: List[str] = [] - seen: set[str] = set() - for token in tokens: - if token in seen: - continue - seen.add(token) - deduped.append(token) 
- return deduped - - -def _semantic_query_topic_tokens(query: str) -> List[str]: - """Extract salient natural-language tokens for lightweight topic matching.""" - tokens = [ - token - for token in _split_identifier_like_tokens(query) - if token not in _SEMANTIC_QUERY_STOPWORDS - ] - return _dedupe_preserve_order(tokens) - - -def _path_topic_tokens(path: str) -> tuple[List[str], List[str]]: - """Extract normalized topic tokens from a path and its basename.""" - parts = _normalized_path_parts(path) - if not parts: - return [], [] - - path_tokens: List[str] = [] - basename_tokens: List[str] = [] - last_index = len(parts) - 1 - for index, part in enumerate(parts): - target = basename_tokens if index == last_index else path_tokens - for token in _split_identifier_like_tokens(part): - if token in _PATH_TOPIC_STOPWORDS: - continue - target.append(token) - return _dedupe_preserve_order(path_tokens), _dedupe_preserve_order(basename_tokens) - - -def _source_path_topic_boost( - query: str, - path: str, - query_intent: QueryIntent, -) -> tuple[float, List[str]]: - """Return a path/topic boost when a query strongly overlaps a source path.""" - query_tokens = _semantic_query_topic_tokens(query) - if len(query_tokens) < 2: - return 1.0, [] - - path_tokens, basename_tokens = _path_topic_tokens(path) - if not path_tokens and not basename_tokens: - return 1.0, [] - - path_token_set = set(path_tokens) | set(basename_tokens) - basename_overlap = [token for token in query_tokens if token in basename_tokens] - all_overlap = [token for token in query_tokens if token in path_token_set] - explicit_hint_tokens = extract_explicit_path_hints(query) - - for hint_tokens in explicit_hint_tokens: - if basename_tokens == hint_tokens: - if query_intent == QueryIntent.KEYWORD: - return 4.5, hint_tokens[:3] - return 2.4, hint_tokens[:3] - if all(token in basename_tokens for token in hint_tokens): - if query_intent == QueryIntent.KEYWORD: - return 4.5, hint_tokens[:3] - return 1.6, hint_tokens[:3] - - 
if query_prefers_lexical_search(query): - lexical_surface_overlap = [ - token for token in basename_tokens if token in query_tokens and token in _LEXICAL_PRIORITY_SURFACE_TOKENS - ] - if lexical_surface_overlap: - lexical_overlap = lexical_surface_overlap[:3] - if query_intent == QueryIntent.KEYWORD: - return 5.5, lexical_overlap - return 5.0, lexical_overlap - - if query_intent == QueryIntent.KEYWORD: - if len(basename_overlap) >= 2: - # Multi-token identifier-style queries often name the feature/file directly. - # Give basename matches a stronger lift so they can survive workspace fan-out. - multiplier = min(4.5, 2.0 + 1.25 * float(len(basename_overlap))) - return multiplier, basename_overlap[:3] - if len(all_overlap) >= 3: - multiplier = min(2.0, 1.1 + 0.2 * len(all_overlap)) - return multiplier, all_overlap[:3] - return 1.0, [] - - if len(basename_overlap) >= 2: - multiplier = min(1.45, 1.15 + 0.1 * len(basename_overlap)) - return multiplier, basename_overlap[:3] - if len(all_overlap) >= 3: - multiplier = min(1.3, 1.05 + 0.05 * len(all_overlap)) - return multiplier, all_overlap[:3] - return 1.0, [] - - -def apply_path_penalties( - results: List[SearchResult], - query: str, - *, - test_file_penalty: float = 0.15, - generated_file_penalty: float = 0.35, -) -> List[SearchResult]: - """Apply lightweight path-based penalties to reduce noisy rankings.""" - if not results or (test_file_penalty <= 0 and generated_file_penalty <= 0): - return results - - query_intent = detect_query_intent(query) - skip_test_penalty = query_targets_test_files(query) - skip_auxiliary_penalty = query_targets_auxiliary_files(query) - skip_generated_penalty = query_targets_generated_files(query) - query_topic_tokens = _semantic_query_topic_tokens(query) - keyword_path_query = query_intent == QueryIntent.KEYWORD and len(query_topic_tokens) >= 2 - explicit_feature_query = bool(extract_explicit_path_hints(query)) - source_oriented_query = ( - explicit_feature_query - or keyword_path_query - or 
( - query_intent in {QueryIntent.SEMANTIC, QueryIntent.MIXED} - and len(query_topic_tokens) >= 2 - ) - ) - identifier_query = None - if query_intent == QueryIntent.KEYWORD: - identifier_query = _extract_identifier_query(query) - effective_test_penalty = float(test_file_penalty) - if effective_test_penalty > 0 and not skip_test_penalty: - if query_intent == QueryIntent.KEYWORD: - # Identifier-style queries should prefer implementation files over test references. - effective_test_penalty = max(effective_test_penalty, 0.35) - elif query_intent in {QueryIntent.SEMANTIC, QueryIntent.MIXED}: - # Natural-language code queries should still prefer implementation files over references. - effective_test_penalty = max(effective_test_penalty, 0.25) - if explicit_feature_query: - # Explicit feature/file hints should be even more biased toward source implementations. - effective_test_penalty = max(effective_test_penalty, 0.45) - effective_auxiliary_penalty = effective_test_penalty - if effective_auxiliary_penalty > 0 and not skip_auxiliary_penalty and explicit_feature_query: - # Examples/benchmarks are usually descriptive noise for feature-targeted implementation queries. 
- effective_auxiliary_penalty = max(effective_auxiliary_penalty, 0.5) - effective_generated_penalty = float(generated_file_penalty) - if effective_generated_penalty > 0 and not skip_generated_penalty: - if source_oriented_query: - effective_generated_penalty = max(effective_generated_penalty, 0.45) - if explicit_feature_query: - effective_generated_penalty = max(effective_generated_penalty, 0.6) - - penalized: List[SearchResult] = [] - for result in results: - multiplier = 1.0 - penalty_multiplier = 1.0 - boost_multiplier = 1.0 - penalty_reasons: List[str] = [] - boost_reasons: List[str] = [] - - if effective_test_penalty > 0 and not skip_test_penalty and is_test_file(result.path): - penalty_multiplier *= max(0.0, 1.0 - effective_test_penalty) - penalty_reasons.append("test_file") - - if ( - effective_auxiliary_penalty > 0 - and not skip_auxiliary_penalty - and not is_test_file(result.path) - and is_auxiliary_reference_path(result.path) - ): - penalty_multiplier *= max(0.0, 1.0 - effective_auxiliary_penalty) - penalty_reasons.append("auxiliary_file") - - if ( - effective_generated_penalty > 0 - and not skip_generated_penalty - and is_generated_artifact_path(result.path) - ): - penalty_multiplier *= max(0.0, 1.0 - effective_generated_penalty) - penalty_reasons.append("generated_artifact") - - if ( - identifier_query - and not is_test_file(result.path) - and not is_generated_artifact_path(result.path) - and _result_defines_identifier(result, identifier_query) - ): - if _is_source_implementation_path(result.path): - boost_multiplier *= 2.0 - boost_reasons.append("source_definition") - else: - boost_multiplier *= 1.35 - boost_reasons.append("symbol_definition") - - if ( - (query_intent in {QueryIntent.SEMANTIC, QueryIntent.MIXED} or keyword_path_query) - and not skip_test_penalty - and not skip_auxiliary_penalty - and not skip_generated_penalty - and not is_test_file(result.path) - and not is_generated_artifact_path(result.path) - and not 
is_auxiliary_reference_path(result.path) - and _is_source_implementation_path(result.path) - ): - semantic_path_boost, overlap_tokens = _source_path_topic_boost( - query, - result.path, - query_intent, - ) - if semantic_path_boost > 1.0: - boost_multiplier *= semantic_path_boost - boost_reasons.append("source_path_topic_overlap") - - multiplier = penalty_multiplier * boost_multiplier - if penalty_reasons or boost_reasons: - metadata = { - **result.metadata, - "path_rank_multiplier": multiplier, - } - if penalty_reasons: - metadata["path_penalty_reasons"] = penalty_reasons - metadata["path_penalty_multiplier"] = penalty_multiplier - if boost_reasons: - metadata["path_boost_reasons"] = boost_reasons - metadata["path_boost_multiplier"] = boost_multiplier - if "source_path_topic_overlap" in boost_reasons and overlap_tokens: - metadata["path_boost_overlap_tokens"] = overlap_tokens - penalized.append( - result.model_copy( - deep=True, - update={ - "score": max(0.0, float(result.score) * multiplier), - "metadata": metadata, - }, - ) - ) - else: - penalized.append(result) - - penalized.sort(key=lambda r: r.score, reverse=True) - return penalized - - -def rebalance_noisy_results( - results: List[SearchResult], - query: str, -) -> List[SearchResult]: - """Move noisy test/generated/auxiliary results behind implementation hits when safe.""" - if not results: - return [] - - query_intent = detect_query_intent(query) - skip_test_penalty = query_targets_test_files(query) - skip_auxiliary_penalty = query_targets_auxiliary_files(query) - skip_generated_penalty = query_targets_generated_files(query) - query_topic_tokens = _semantic_query_topic_tokens(query) - keyword_path_query = query_intent == QueryIntent.KEYWORD and len(query_topic_tokens) >= 2 - explicit_feature_query = bool(extract_explicit_path_hints(query)) - source_oriented_query = ( - explicit_feature_query - or keyword_path_query - or ( - query_intent in {QueryIntent.SEMANTIC, QueryIntent.MIXED} - and 
len(query_topic_tokens) >= 2 - ) - ) - if not source_oriented_query: - return results - - max_generated_results = len(results) if skip_generated_penalty else 0 - max_test_results = len(results) if skip_test_penalty else (0 if explicit_feature_query else 1) - max_auxiliary_results = len(results) if skip_auxiliary_penalty else (0 if explicit_feature_query else 1) - - selected: List[SearchResult] = [] - deferred: List[SearchResult] = [] - generated_count = 0 - test_count = 0 - auxiliary_count = 0 - - for result in results: - if not skip_generated_penalty and is_generated_artifact_path(result.path): - if generated_count >= max_generated_results: - deferred.append(result) - continue - generated_count += 1 - selected.append(result) - continue - - if not skip_test_penalty and is_test_file(result.path): - if test_count >= max_test_results: - deferred.append(result) - continue - test_count += 1 - selected.append(result) - continue - - if not skip_auxiliary_penalty and is_auxiliary_reference_path(result.path): - if auxiliary_count >= max_auxiliary_results: - deferred.append(result) - continue - auxiliary_count += 1 - selected.append(result) - continue - - selected.append(result) - - return selected + deferred - - -def simple_weighted_fusion( - results_map: Dict[str, List[SearchResult]], - weights: Dict[str, float] = None, -) -> List[SearchResult]: - """Combine search results using simple weighted sum of normalized scores. - - This is an alternative to RRF that preserves score magnitude information. - Scores are min-max normalized per source before weighted combination. 
- - Formula: score(d) = Σ weight_source * normalized_score_source(d) - - Args: - results_map: Dictionary mapping source name to list of SearchResult objects - Sources: 'exact', 'fuzzy', 'vector' - weights: Dictionary mapping source name to weight (default: equal weights) - Example: {'exact': 0.3, 'fuzzy': 0.1, 'vector': 0.6} - - Returns: - List of SearchResult objects sorted by fused score (descending) - - Examples: - >>> fts_results = [SearchResult(path="a.py", score=10.0, excerpt="...")] - >>> vector_results = [SearchResult(path="b.py", score=0.85, excerpt="...")] - >>> results_map = {'exact': fts_results, 'vector': vector_results} - >>> fused = simple_weighted_fusion(results_map) - """ - if not results_map: - return [] - - # Default equal weights if not provided - if weights is None: - num_sources = len(results_map) - weights = {source: 1.0 / num_sources for source in results_map} - - # Normalize weights to sum to 1.0 - weight_sum = sum(weights.values()) - if not math.isclose(weight_sum, 1.0, abs_tol=0.01) and weight_sum > 0: - weights = {source: w / weight_sum for source, w in weights.items()} - - # Compute min-max normalization parameters per source - source_stats: Dict[str, tuple] = {} - for source_name, results in results_map.items(): - if not results: - continue - scores = [r.score for r in results] - min_s, max_s = min(scores), max(scores) - source_stats[source_name] = (min_s, max_s) - - def normalize_score(score: float, source: str) -> float: - """Normalize score to [0, 1] range using min-max scaling.""" - if source not in source_stats: - return 0.0 - min_s, max_s = source_stats[source] - if max_s == min_s: - return 1.0 if score >= min_s else 0.0 - return (score - min_s) / (max_s - min_s) - - # Build unified result set with weighted scores - path_to_result: Dict[str, SearchResult] = {} - path_to_fusion_score: Dict[str, float] = {} - path_to_source_scores: Dict[str, Dict[str, float]] = {} - - for source_name, results in results_map.items(): - weight = 
weights.get(source_name, 0.0) - if weight == 0: - continue - - for result in results: - path = result.path - normalized = normalize_score(result.score, source_name) - contribution = weight * normalized - - if path not in path_to_fusion_score: - path_to_fusion_score[path] = 0.0 - path_to_result[path] = result - path_to_source_scores[path] = {} - - path_to_fusion_score[path] += contribution - path_to_source_scores[path][source_name] = normalized - - # Create final results with fusion scores - fused_results = [] - for path, base_result in path_to_result.items(): - fusion_score = path_to_fusion_score[path] - - fused_result = SearchResult( - path=base_result.path, - score=fusion_score, - excerpt=base_result.excerpt, - content=base_result.content, - symbol=base_result.symbol, - chunk=base_result.chunk, - metadata={ - **base_result.metadata, - "fusion_method": "simple_weighted", - "fusion_score": fusion_score, - "original_score": base_result.score, - "source_scores": path_to_source_scores[path], - }, - start_line=base_result.start_line, - end_line=base_result.end_line, - symbol_name=base_result.symbol_name, - symbol_kind=base_result.symbol_kind, - ) - fused_results.append(fused_result) - - fused_results.sort(key=lambda r: r.score, reverse=True) - return fused_results - - -def reciprocal_rank_fusion( - results_map: Dict[str, List[SearchResult]], - weights: Dict[str, float] = None, - k: int = 60, -) -> List[SearchResult]: - """Combine search results from multiple sources using Reciprocal Rank Fusion. 
- - RRF formula: score(d) = Σ weight_source / (k + rank_source(d)) - - Args: - results_map: Dictionary mapping source name to list of SearchResult objects - Sources: 'exact', 'fuzzy', 'vector' - weights: Dictionary mapping source name to weight (default: equal weights) - Example: {'exact': 0.3, 'fuzzy': 0.1, 'vector': 0.6} - k: Constant to avoid division by zero and control rank influence (default 60) - - Returns: - List of SearchResult objects sorted by fused score (descending) - - Examples: - >>> exact_results = [SearchResult(path="a.py", score=10.0, excerpt="...")] - >>> fuzzy_results = [SearchResult(path="b.py", score=8.0, excerpt="...")] - >>> results_map = {'exact': exact_results, 'fuzzy': fuzzy_results} - >>> fused = reciprocal_rank_fusion(results_map) - """ - if not results_map: - return [] - - # Default equal weights if not provided - if weights is None: - num_sources = len(results_map) - weights = {source: 1.0 / num_sources for source in results_map} - - # Validate weights sum to 1.0 - weight_sum = sum(weights.values()) - if not math.isclose(weight_sum, 1.0, abs_tol=0.01): - # Normalize weights to sum to 1.0 - weights = {source: w / weight_sum for source, w in weights.items()} - - # Build unified result set with RRF scores - path_to_result: Dict[str, SearchResult] = {} - path_to_fusion_score: Dict[str, float] = {} - path_to_source_ranks: Dict[str, Dict[str, int]] = {} - - for source_name, results in results_map.items(): - weight = weights.get(source_name, 0.0) - if weight == 0: - continue - - for rank, result in enumerate(results, start=1): - path = result.path - rrf_contribution = weight / (k + rank) - - # Initialize or accumulate fusion score - if path not in path_to_fusion_score: - path_to_fusion_score[path] = 0.0 - path_to_result[path] = result - path_to_source_ranks[path] = {} - - path_to_fusion_score[path] += rrf_contribution - path_to_source_ranks[path][source_name] = rank - - # Create final results with fusion scores - fused_results = [] - for 
path, base_result in path_to_result.items(): - fusion_score = path_to_fusion_score[path] - - # Create new SearchResult with fusion_score in metadata - fused_result = SearchResult( - path=base_result.path, - score=fusion_score, - excerpt=base_result.excerpt, - content=base_result.content, - symbol=base_result.symbol, - chunk=base_result.chunk, - metadata={ - **base_result.metadata, - "fusion_method": "rrf", - "fusion_score": fusion_score, - "original_score": base_result.score, - "rrf_k": k, - "source_ranks": path_to_source_ranks[path], - }, - start_line=base_result.start_line, - end_line=base_result.end_line, - symbol_name=base_result.symbol_name, - symbol_kind=base_result.symbol_kind, - ) - fused_results.append(fused_result) - - # Sort by fusion score descending - fused_results.sort(key=lambda r: r.score, reverse=True) - - return fused_results - - -def apply_symbol_boost( - results: List[SearchResult], - boost_factor: float = 1.5, -) -> List[SearchResult]: - """Boost fused scores for results that include an explicit symbol match. - - The boost is multiplicative on the current result.score (typically the RRF fusion score). - When boosted, the original score is preserved in metadata["original_fusion_score"] and - metadata["boosted"] is set to True. - """ - if not results: - return [] - - if boost_factor <= 1.0: - # Still return new objects to follow immutable transformation pattern. 
- return [ - SearchResult( - path=r.path, - score=r.score, - excerpt=r.excerpt, - content=r.content, - symbol=r.symbol, - chunk=r.chunk, - metadata={**r.metadata}, - start_line=r.start_line, - end_line=r.end_line, - symbol_name=r.symbol_name, - symbol_kind=r.symbol_kind, - additional_locations=list(r.additional_locations), - ) - for r in results - ] - - boosted_results: List[SearchResult] = [] - for result in results: - has_symbol = bool(result.symbol_name) - original_score = float(result.score) - boosted_score = original_score * boost_factor if has_symbol else original_score - - metadata = {**result.metadata} - if has_symbol: - metadata.setdefault("original_fusion_score", metadata.get("fusion_score", original_score)) - metadata["boosted"] = True - metadata["symbol_boost_factor"] = boost_factor - - boosted_results.append( - SearchResult( - path=result.path, - score=boosted_score, - excerpt=result.excerpt, - content=result.content, - symbol=result.symbol, - chunk=result.chunk, - metadata=metadata, - start_line=result.start_line, - end_line=result.end_line, - symbol_name=result.symbol_name, - symbol_kind=result.symbol_kind, - additional_locations=list(result.additional_locations), - ) - ) - - boosted_results.sort(key=lambda r: r.score, reverse=True) - return boosted_results - - -def rerank_results( - query: str, - results: List[SearchResult], - embedder: Any, - top_k: int = 50, -) -> List[SearchResult]: - """Re-rank results with embedding cosine similarity, combined with current score. - - Combined score formula: - 0.5 * rrf_score + 0.5 * cosine_similarity - - If embedder is None or embedding fails, returns results as-is. - """ - if not results: - return [] - - if embedder is None or top_k <= 0: - return results - - rerank_count = min(int(top_k), len(results)) - - def cosine_similarity(vec_a: List[float], vec_b: List[float]) -> float: - # Defensive: handle mismatched lengths and zero vectors. 
- n = min(len(vec_a), len(vec_b)) - if n == 0: - return 0.0 - dot = 0.0 - norm_a = 0.0 - norm_b = 0.0 - for i in range(n): - a = float(vec_a[i]) - b = float(vec_b[i]) - dot += a * b - norm_a += a * a - norm_b += b * b - if norm_a <= 0.0 or norm_b <= 0.0: - return 0.0 - sim = dot / (math.sqrt(norm_a) * math.sqrt(norm_b)) - # SearchResult.score requires non-negative scores; clamp cosine similarity to [0, 1]. - return max(0.0, min(1.0, sim)) - - def text_for_embedding(r: SearchResult) -> str: - if r.excerpt and r.excerpt.strip(): - return r.excerpt - if r.content and r.content.strip(): - return r.content - if r.chunk and r.chunk.content and r.chunk.content.strip(): - return r.chunk.content - # Fallback: stable, non-empty text. - return r.symbol_name or r.path - - try: - if hasattr(embedder, "embed_single"): - query_vec = embedder.embed_single(query) - else: - query_vec = embedder.embed(query)[0] - - doc_texts = [text_for_embedding(r) for r in results[:rerank_count]] - doc_vecs = embedder.embed(doc_texts) - except Exception: - return results - - reranked_results: List[SearchResult] = [] - - for idx, result in enumerate(results): - if idx < rerank_count: - rrf_score = float(result.score) - sim = cosine_similarity(query_vec, doc_vecs[idx]) - combined_score = 0.5 * rrf_score + 0.5 * sim - - reranked_results.append( - SearchResult( - path=result.path, - score=combined_score, - excerpt=result.excerpt, - content=result.content, - symbol=result.symbol, - chunk=result.chunk, - metadata={ - **result.metadata, - "rrf_score": rrf_score, - "cosine_similarity": sim, - "reranked": True, - }, - start_line=result.start_line, - end_line=result.end_line, - symbol_name=result.symbol_name, - symbol_kind=result.symbol_kind, - additional_locations=list(result.additional_locations), - ) - ) - else: - # Preserve remaining results without re-ranking, but keep immutability. 
- reranked_results.append( - SearchResult( - path=result.path, - score=result.score, - excerpt=result.excerpt, - content=result.content, - symbol=result.symbol, - chunk=result.chunk, - metadata={**result.metadata}, - start_line=result.start_line, - end_line=result.end_line, - symbol_name=result.symbol_name, - symbol_kind=result.symbol_kind, - additional_locations=list(result.additional_locations), - ) - ) - - reranked_results.sort(key=lambda r: r.score, reverse=True) - return reranked_results - - -def cross_encoder_rerank( - query: str, - results: List[SearchResult], - reranker: Any, - top_k: int = 50, - batch_size: int = 32, - chunk_type_weights: Optional[Dict[str, float]] = None, - test_file_penalty: float = 0.0, -) -> List[SearchResult]: - """Second-stage reranking using a cross-encoder model. - - This function is dependency-agnostic: callers can pass any object that exposes - a compatible `score_pairs(pairs, batch_size=...)` method. - - Args: - query: Search query string - results: List of search results to rerank - reranker: Cross-encoder model with score_pairs or predict method - top_k: Number of top results to rerank - batch_size: Batch size for reranking - chunk_type_weights: Optional weights for different chunk types. - Example: {"code": 1.0, "docstring": 0.7} - reduce docstring influence - test_file_penalty: Penalty applied to test files (0.0-1.0). 
- Example: 0.2 means test files get 20% score reduction - """ - if not results: - return [] - - if reranker is None or top_k <= 0: - return results - - rerank_count = min(int(top_k), len(results)) - - def text_for_pair(r: SearchResult) -> str: - if r.excerpt and r.excerpt.strip(): - return r.excerpt - if r.content and r.content.strip(): - return r.content - if r.chunk and r.chunk.content and r.chunk.content.strip(): - return r.chunk.content - return r.symbol_name or r.path - - pairs = [(query, text_for_pair(r)) for r in results[:rerank_count]] - - try: - if hasattr(reranker, "score_pairs"): - raw_scores = reranker.score_pairs(pairs, batch_size=int(batch_size)) - elif hasattr(reranker, "predict"): - raw_scores = reranker.predict(pairs, batch_size=int(batch_size)) - else: - return results - except Exception as exc: - logger.debug("Cross-encoder rerank failed; returning original ranking: %s", exc) - return results - - if not raw_scores or len(raw_scores) != rerank_count: - logger.debug( - "Cross-encoder rerank returned %d scores for %d candidates; returning original ranking", - len(raw_scores) if raw_scores else 0, - rerank_count, - ) - return results - - scores = [float(s) for s in raw_scores] - min_s = min(scores) - max_s = max(scores) - - def sigmoid(x: float) -> float: - # Clamp to keep exp() stable. 
- x = max(-50.0, min(50.0, x)) - return 1.0 / (1.0 + math.exp(-x)) - - if 0.0 <= min_s and max_s <= 1.0: - probs = scores - else: - probs = [sigmoid(s) for s in scores] - - query_intent = detect_query_intent(query) - skip_test_penalty = query_targets_test_files(query) - skip_auxiliary_penalty = query_targets_auxiliary_files(query) - skip_generated_penalty = query_targets_generated_files(query) - keyword_path_query = query_intent == QueryIntent.KEYWORD and len(_semantic_query_topic_tokens(query)) >= 2 - reranked_results: List[SearchResult] = [] - - for idx, result in enumerate(results): - if idx < rerank_count: - prev_score = float(result.score) - ce_score = scores[idx] - ce_prob = probs[idx] - - # Base combined score - combined_score = 0.5 * prev_score + 0.5 * ce_prob - - # Apply chunk_type weight adjustment - if chunk_type_weights: - chunk_type = None - if result.chunk and hasattr(result.chunk, "metadata"): - chunk_type = result.chunk.metadata.get("chunk_type") - elif result.metadata: - chunk_type = result.metadata.get("chunk_type") - - if chunk_type and chunk_type in chunk_type_weights: - weight = chunk_type_weights[chunk_type] - # Apply weight to CE contribution only - combined_score = 0.5 * prev_score + 0.5 * ce_prob * weight - - # Apply test file penalty - if test_file_penalty > 0 and is_test_file(result.path): - combined_score = combined_score * (1.0 - test_file_penalty) - - cross_encoder_floor_reason = None - cross_encoder_floor_score = None - cross_encoder_floor_overlap_tokens: List[str] = [] - if ( - (query_intent in {QueryIntent.SEMANTIC, QueryIntent.MIXED} or keyword_path_query) - and not skip_test_penalty - and not skip_auxiliary_penalty - and not skip_generated_penalty - and not is_test_file(result.path) - and not is_generated_artifact_path(result.path) - and not is_auxiliary_reference_path(result.path) - and _is_source_implementation_path(result.path) - ): - semantic_path_boost, overlap_tokens = _source_path_topic_boost( - query, - result.path, - 
query_intent, - ) - if semantic_path_boost > 1.0: - floor_ratio = 0.8 if semantic_path_boost >= 1.35 else 0.75 - candidate_floor = prev_score * floor_ratio - if candidate_floor > combined_score: - combined_score = candidate_floor - cross_encoder_floor_reason = ( - "keyword_source_path_overlap" - if query_intent == QueryIntent.KEYWORD - else "semantic_source_path_overlap" - ) - cross_encoder_floor_score = candidate_floor - cross_encoder_floor_overlap_tokens = overlap_tokens - - metadata = { - **result.metadata, - "pre_cross_encoder_score": prev_score, - "cross_encoder_score": ce_score, - "cross_encoder_prob": ce_prob, - "cross_encoder_reranked": True, - } - if cross_encoder_floor_reason is not None: - metadata["cross_encoder_floor_reason"] = cross_encoder_floor_reason - metadata["cross_encoder_floor_score"] = cross_encoder_floor_score - if cross_encoder_floor_overlap_tokens: - metadata["cross_encoder_floor_overlap_tokens"] = ( - cross_encoder_floor_overlap_tokens - ) - - reranked_results.append( - SearchResult( - path=result.path, - score=combined_score, - excerpt=result.excerpt, - content=result.content, - symbol=result.symbol, - chunk=result.chunk, - metadata=metadata, - start_line=result.start_line, - end_line=result.end_line, - symbol_name=result.symbol_name, - symbol_kind=result.symbol_kind, - additional_locations=list(result.additional_locations), - ) - ) - else: - reranked_results.append( - SearchResult( - path=result.path, - score=result.score, - excerpt=result.excerpt, - content=result.content, - symbol=result.symbol, - chunk=result.chunk, - metadata={**result.metadata}, - start_line=result.start_line, - end_line=result.end_line, - symbol_name=result.symbol_name, - symbol_kind=result.symbol_kind, - additional_locations=list(result.additional_locations), - ) - ) - - reranked_results.sort(key=lambda r: r.score, reverse=True) - return reranked_results - - -def normalize_bm25_score(score: float) -> float: - """Normalize BM25 scores from SQLite FTS5 to 0-1 
range. - - SQLite FTS5 returns negative BM25 scores (more negative = better match). - Uses sigmoid transformation for normalization. - - Args: - score: Raw BM25 score from SQLite (typically negative) - - Returns: - Normalized score in range [0, 1] - - Examples: - >>> normalize_bm25_score(-10.5) # Good match - 0.85 - >>> normalize_bm25_score(-1.2) # Weak match - 0.62 - """ - # Take absolute value (BM25 is negative in SQLite) - abs_score = abs(score) - - # Sigmoid transformation: 1 / (1 + e^(-x)) - # Scale factor of 0.1 maps typical BM25 range (-20 to 0) to (0, 1) - normalized = 1.0 / (1.0 + math.exp(-abs_score * 0.1)) - - return normalized - - -def tag_search_source(results: List[SearchResult], source: str) -> List[SearchResult]: - """Tag search results with their source for RRF tracking. - - Args: - results: List of SearchResult objects - source: Source identifier ('exact', 'fuzzy', 'vector') - - Returns: - List of SearchResult objects with 'search_source' in metadata - """ - tagged_results = [] - for result in results: - tagged_result = SearchResult( - path=result.path, - score=result.score, - excerpt=result.excerpt, - content=result.content, - symbol=result.symbol, - chunk=result.chunk, - metadata={**result.metadata, "search_source": source}, - start_line=result.start_line, - end_line=result.end_line, - symbol_name=result.symbol_name, - symbol_kind=result.symbol_kind, - ) - tagged_results.append(tagged_result) - - return tagged_results - - -def group_similar_results( - results: List[SearchResult], - score_threshold_abs: float = 0.01, - content_field: str = "excerpt" -) -> List[SearchResult]: - """Group search results by content and score similarity. - - Groups results that have similar content and similar scores into a single - representative result, with other locations stored in additional_locations. - - Algorithm: - 1. Group results by content (using excerpt or content field) - 2. Within each content group, create subgroups based on score similarity - 3. 
Select highest-scoring result as representative for each subgroup - 4. Store other results in subgroup as additional_locations - - Args: - results: A list of SearchResult objects (typically sorted by score) - score_threshold_abs: Absolute score difference to consider results similar. - Results with |score_a - score_b| <= threshold are grouped. - Default 0.01 is suitable for RRF fusion scores. - content_field: The field to use for content grouping ('excerpt' or 'content') - - Returns: - A new list of SearchResult objects where similar items are grouped. - The list is sorted by score descending. - - Examples: - >>> results = [SearchResult(path="a.py", score=0.5, excerpt="def foo()"), - ... SearchResult(path="b.py", score=0.5, excerpt="def foo()")] - >>> grouped = group_similar_results(results) - >>> len(grouped) # Two results merged into one - 1 - >>> len(grouped[0].additional_locations) # One additional location - 1 - """ - if not results: - return [] - - # Group results by content - content_map: Dict[str, List[SearchResult]] = {} - unidentifiable_results: List[SearchResult] = [] - - for r in results: - key = getattr(r, content_field, None) - if key and key.strip(): - content_map.setdefault(key, []).append(r) - else: - # Results without content can't be grouped by content - unidentifiable_results.append(r) - - final_results: List[SearchResult] = [] - - # Process each content group - for content_group in content_map.values(): - # Sort by score descending within group - content_group.sort(key=lambda r: r.score, reverse=True) - - while content_group: - # Take highest scoring as representative - representative = content_group.pop(0) - others_in_group = [] - remaining_for_next_pass = [] - - # Find results with similar scores - for item in content_group: - if abs(representative.score - item.score) <= score_threshold_abs: - others_in_group.append(item) - else: - remaining_for_next_pass.append(item) - - # Create grouped result with additional locations - if others_in_group: 
- # Build new result with additional_locations populated - grouped_result = SearchResult( - path=representative.path, - score=representative.score, - excerpt=representative.excerpt, - content=representative.content, - symbol=representative.symbol, - chunk=representative.chunk, - metadata={ - **representative.metadata, - "grouped_count": len(others_in_group) + 1, - }, - start_line=representative.start_line, - end_line=representative.end_line, - symbol_name=representative.symbol_name, - symbol_kind=representative.symbol_kind, - additional_locations=[ - AdditionalLocation( - path=other.path, - score=other.score, - start_line=other.start_line, - end_line=other.end_line, - symbol_name=other.symbol_name, - ) for other in others_in_group - ], - ) - final_results.append(grouped_result) - else: - final_results.append(representative) - - content_group = remaining_for_next_pass - - # Add ungroupable results - final_results.extend(unidentifiable_results) - - # Sort final results by score descending - final_results.sort(key=lambda r: r.score, reverse=True) - - return final_results diff --git a/codex-lens/src/codexlens/semantic/__init__.py b/codex-lens/src/codexlens/semantic/__init__.py deleted file mode 100644 index b9bd040f..00000000 --- a/codex-lens/src/codexlens/semantic/__init__.py +++ /dev/null @@ -1,118 +0,0 @@ -"""Optional semantic search module for CodexLens. 
- -Install with: pip install codexlens[semantic] -Uses fastembed (ONNX-based, lightweight ~200MB) - -GPU Acceleration: -- Automatic GPU detection and usage when available -- Supports CUDA (NVIDIA), TensorRT, DirectML (Windows), ROCm (AMD), CoreML (Apple) -- Install GPU support: pip install onnxruntime-gpu (NVIDIA) or onnxruntime-directml (Windows) -""" - -from __future__ import annotations - -SEMANTIC_AVAILABLE = False -SEMANTIC_BACKEND: str | None = None -GPU_AVAILABLE = False -LITELLM_AVAILABLE = False -_import_error: str | None = None - - -def _detect_backend() -> tuple[bool, str | None, bool, str | None]: - """Detect if fastembed and GPU are available.""" - try: - import numpy as np - except ImportError as e: - return False, None, False, f"numpy not available: {e}" - - try: - from fastembed import TextEmbedding - except ImportError: - return False, None, False, "fastembed not available. Install with: pip install codexlens[semantic]" - - # Check GPU availability - gpu_available = False - try: - from .gpu_support import is_gpu_available - gpu_available = is_gpu_available() - except ImportError: - pass - - return True, "fastembed", gpu_available, None - - -# Initialize on module load -SEMANTIC_AVAILABLE, SEMANTIC_BACKEND, GPU_AVAILABLE, _import_error = _detect_backend() - - -def check_semantic_available() -> tuple[bool, str | None]: - """Check if semantic search dependencies are available.""" - return SEMANTIC_AVAILABLE, _import_error - - -def check_gpu_available() -> tuple[bool, str]: - """Check if GPU acceleration is available. 
- - Returns: - Tuple of (is_available, status_message) - """ - if not SEMANTIC_AVAILABLE: - return False, "Semantic search not available" - - try: - from .gpu_support import is_gpu_available, get_gpu_summary - if is_gpu_available(): - return True, get_gpu_summary() - return False, "No GPU detected (using CPU)" - except ImportError: - return False, "GPU support module not available" - - -# Export embedder components -# BaseEmbedder is always available (abstract base class) -from .base import BaseEmbedder - -# Factory function for creating embedders -from .factory import get_embedder as get_embedder_factory - -# Optional: LiteLLMEmbedderWrapper (only if ccw-litellm is installed) -try: - import ccw_litellm # noqa: F401 - from .litellm_embedder import LiteLLMEmbedderWrapper - LITELLM_AVAILABLE = True -except ImportError: - LiteLLMEmbedderWrapper = None - LITELLM_AVAILABLE = False - - -def is_embedding_backend_available(backend: str) -> tuple[bool, str | None]: - """Check whether a specific embedding backend can be used. - - Notes: - - "fastembed" requires the optional semantic deps (pip install codexlens[semantic]). - - "litellm" requires ccw-litellm to be installed in the same environment. - """ - backend = (backend or "").strip().lower() - if backend == "fastembed": - if SEMANTIC_AVAILABLE: - return True, None - return False, _import_error or "fastembed not available. Install with: pip install codexlens[semantic]" - if backend == "litellm": - if LITELLM_AVAILABLE: - return True, None - return False, "ccw-litellm not available. Install with: pip install ccw-litellm" - return False, f"Invalid embedding backend: {backend}. Must be 'fastembed' or 'litellm'." 
- - -__all__ = [ - "SEMANTIC_AVAILABLE", - "SEMANTIC_BACKEND", - "GPU_AVAILABLE", - "LITELLM_AVAILABLE", - "check_semantic_available", - "is_embedding_backend_available", - "check_gpu_available", - "BaseEmbedder", - "get_embedder_factory", - "LiteLLMEmbedderWrapper", -] diff --git a/codex-lens/src/codexlens/semantic/ann_index.py b/codex-lens/src/codexlens/semantic/ann_index.py deleted file mode 100644 index f5280c0e..00000000 --- a/codex-lens/src/codexlens/semantic/ann_index.py +++ /dev/null @@ -1,1097 +0,0 @@ -"""Approximate Nearest Neighbor (ANN) index using HNSW algorithm. - -Provides O(log N) similarity search using hnswlib's Hierarchical Navigable Small World graphs. -Falls back to brute-force search when hnswlib is not available. - -Key features: -- HNSW index for fast approximate nearest neighbor search -- Persistent index storage (saved alongside SQLite database) -- Incremental vector addition and deletion -- Thread-safe operations -- Cosine similarity metric -- Support for centralized storage mode (single index at project root) -""" - -from __future__ import annotations - -import logging -import threading -from pathlib import Path -from typing import List, Optional, Tuple - -from codexlens.errors import StorageError -from codexlens.config import VECTORS_HNSW_NAME - -from . import SEMANTIC_AVAILABLE - -if SEMANTIC_AVAILABLE: - import numpy as np - -logger = logging.getLogger(__name__) - -# Try to import hnswlib (optional dependency) -try: - import hnswlib - - HNSWLIB_AVAILABLE = True -except ImportError: - HNSWLIB_AVAILABLE = False - - -class ANNIndex: - """HNSW-based approximate nearest neighbor index for vector similarity search. 
- - Performance characteristics: - - Build time: O(N log N) where N is number of vectors - - Search time: O(log N) approximate - - Memory: ~(M * 2 * 4 * d) bytes per vector (M=16, d=dimension) - - Index parameters: - - space: cosine (cosine similarity metric) - - M: 16 (max connections per node - balance between speed and recall) - - ef_construction: 200 (search width during build - higher = better quality) - - ef: 50 (search width during query - higher = better recall) - """ - - def __init__( - self, - index_path: Path, - dim: int, - initial_capacity: int = 50000, - auto_save: bool = False, - expansion_threshold: float = 0.8, - ) -> None: - """Initialize ANN index. - - Args: - index_path: Path to SQLite database (index will be saved as _vectors.hnsw) - dim: Dimension of embedding vectors - initial_capacity: Initial maximum elements capacity (default: 50000) - auto_save: Whether to automatically save index after operations (default: False) - expansion_threshold: Capacity threshold to trigger auto-expansion (default: 0.8) - - Raises: - ImportError: If required dependencies are not available - ValueError: If dimension or capacity is invalid - """ - if not SEMANTIC_AVAILABLE: - raise ImportError( - "Semantic search dependencies not available. " - "Install with: pip install codexlens[semantic]" - ) - - if not HNSWLIB_AVAILABLE: - raise ImportError( - "hnswlib is required for ANN index. " - "Install with: pip install hnswlib" - ) - - if dim <= 0: - raise ValueError(f"Invalid dimension: {dim}") - - if initial_capacity <= 0: - raise ValueError(f"Invalid initial capacity: {initial_capacity}") - - if not 0.0 < expansion_threshold < 1.0: - raise ValueError( - f"Invalid expansion threshold: {expansion_threshold}. Must be between 0 and 1." 
- ) - - self.index_path = Path(index_path) - self.dim = dim - - # Derive HNSW index path from database path - # e.g., /path/to/_index.db -> /path/to/_index_vectors.hnsw - # This ensures unique HNSW files for each database - db_stem = self.index_path.stem # e.g., "_index" or "tmp123" - self.hnsw_path = self.index_path.parent / f"{db_stem}_vectors.hnsw" - - # HNSW parameters - self.space = "cosine" # Cosine similarity metric - self.M = 16 # Max connections per node (16 is good balance) - self.ef_construction = 200 # Build-time search width (higher = better quality) - self.ef = 50 # Query-time search width (higher = better recall) - - # Memory management parameters - self._auto_save = auto_save - self._expansion_threshold = expansion_threshold - - # Thread safety - self._lock = threading.RLock() - - # HNSW index instance - self._index: Optional[hnswlib.Index] = None - self._max_elements = initial_capacity # Initial capacity (reduced from 1M to 50K) - self._current_count = 0 # Track number of vectors - - logger.info( - f"Initialized ANNIndex with capacity={initial_capacity}, " - f"auto_save={auto_save}, expansion_threshold={expansion_threshold}" - ) - - @classmethod - def create_central( - cls, - index_root: Path, - dim: int, - initial_capacity: int = 50000, - auto_save: bool = False, - expansion_threshold: float = 0.8, - ) -> "ANNIndex": - """Create a centralized ANN index at the project index root. - - This method creates a single shared HNSW index file at the project root, - rather than per-directory indexes. Use this for projects that want all - dense vectors stored in one central location. 
- - Args: - index_root: Root directory for the index (e.g., .codexlens//) - dim: Dimension of embedding vectors - initial_capacity: Initial maximum elements capacity (default: 50000) - auto_save: Whether to automatically save index after operations (default: False) - expansion_threshold: Capacity threshold to trigger auto-expansion (default: 0.8) - - Returns: - ANNIndex instance configured for centralized storage - - Example: - >>> index = ANNIndex.create_central(Path(".codexlens/abc123"), dim=768) - >>> index.hnsw_path # Returns: .codexlens/abc123/_vectors.hnsw - """ - # Create a dummy index_path that will result in the central hnsw_path - # The index_path is used to derive hnsw_path, so we create a virtual path - # such that self.hnsw_path = index_root / VECTORS_HNSW_NAME - instance = cls.__new__(cls) - - if not SEMANTIC_AVAILABLE: - raise ImportError( - "Semantic search dependencies not available. " - "Install with: pip install codexlens[semantic]" - ) - - if not HNSWLIB_AVAILABLE: - raise ImportError( - "hnswlib is required for ANN index. " - "Install with: pip install hnswlib" - ) - - if dim <= 0: - raise ValueError(f"Invalid dimension: {dim}") - - if initial_capacity <= 0: - raise ValueError(f"Invalid initial capacity: {initial_capacity}") - - if not 0.0 < expansion_threshold < 1.0: - raise ValueError( - f"Invalid expansion threshold: {expansion_threshold}. Must be between 0 and 1." 
- ) - - instance.index_path = index_root - instance.dim = dim - - # Centralized mode: use VECTORS_HNSW_NAME directly at index_root - instance.hnsw_path = index_root / VECTORS_HNSW_NAME - - # HNSW parameters - instance.space = "cosine" - instance.M = 16 - instance.ef_construction = 200 - instance.ef = 50 - - # Memory management parameters - instance._auto_save = auto_save - instance._expansion_threshold = expansion_threshold - - # Thread safety - instance._lock = threading.RLock() - - # HNSW index instance - instance._index: Optional[hnswlib.Index] = None - instance._max_elements = initial_capacity - instance._current_count = 0 - - logger.info( - f"Initialized centralized ANNIndex at {instance.hnsw_path} with " - f"capacity={initial_capacity}, auto_save={auto_save}" - ) - - return instance - - def _ensure_index(self) -> None: - """Ensure HNSW index is initialized (lazy initialization).""" - if self._index is None: - self._index = hnswlib.Index(space=self.space, dim=self.dim) - self._index.init_index( - max_elements=self._max_elements, - ef_construction=self.ef_construction, - M=self.M, - ) - self._index.set_ef(self.ef) - self._current_count = 0 - logger.debug(f"Created new HNSW index with capacity {self._max_elements}") - - def _auto_expand_if_needed(self, additional_count: int) -> None: - """Auto-expand index capacity if threshold is reached. - - Args: - additional_count: Number of vectors to be added - - Note: - This is called internally by add_vectors and is thread-safe. 
- """ - usage_ratio = (self._current_count + additional_count) / self._max_elements - - if usage_ratio >= self._expansion_threshold: - # Calculate new capacity (2x current or enough to fit new vectors) - new_capacity = max( - self._max_elements * 2, - self._current_count + additional_count, - ) - - logger.info( - f"Expanding index capacity: {self._max_elements} -> {new_capacity} " - f"(usage: {usage_ratio:.1%}, threshold: {self._expansion_threshold:.1%})" - ) - - self._index.resize_index(new_capacity) - self._max_elements = new_capacity - - def add_vectors(self, ids: List[int], vectors: np.ndarray) -> None: - """Add vectors to the index. - - Args: - ids: List of vector IDs (must be unique) - vectors: Numpy array of shape (N, dim) where N = len(ids) - - Raises: - ValueError: If shapes don't match or vectors are invalid - StorageError: If index operation fails - """ - if len(ids) == 0: - return - - if vectors.shape[0] != len(ids): - raise ValueError( - f"Number of vectors ({vectors.shape[0]}) must match number of IDs ({len(ids)})" - ) - - if vectors.shape[1] != self.dim: - raise ValueError( - f"Vector dimension ({vectors.shape[1]}) must match index dimension ({self.dim})" - ) - - with self._lock: - try: - self._ensure_index() - - # Auto-expand if threshold reached - self._auto_expand_if_needed(len(ids)) - - # Ensure vectors are C-contiguous float32 (hnswlib requirement) - if not vectors.flags['C_CONTIGUOUS'] or vectors.dtype != np.float32: - vectors = np.ascontiguousarray(vectors, dtype=np.float32) - - # Add vectors to index - self._index.add_items(vectors, ids) - self._current_count += len(ids) - - logger.debug( - f"Added {len(ids)} vectors to index " - f"(total: {self._current_count}/{self._max_elements})" - ) - - # Auto-save if enabled - if self._auto_save: - self.save() - - except Exception as e: - raise StorageError(f"Failed to add vectors to ANN index: {e}") - - def remove_vectors(self, ids: List[int]) -> None: - """Remove vectors from the index by marking them 
as deleted. - - Note: hnswlib uses soft deletion (mark_deleted). Vectors are not - physically removed but will be excluded from search results. - - Args: - ids: List of vector IDs to remove - - Raises: - StorageError: If index operation fails - """ - if len(ids) == 0: - return - - with self._lock: - try: - if self._index is None or self._current_count == 0: - return # Nothing to remove - - # Mark vectors as deleted - deleted_count = 0 - for vec_id in ids: - try: - self._index.mark_deleted(vec_id) - deleted_count += 1 - except RuntimeError: - # ID not found - ignore (idempotent deletion) - pass - - logger.debug(f"Marked {deleted_count}/{len(ids)} vectors as deleted") - - # Auto-save if enabled - if self._auto_save and deleted_count > 0: - self.save() - - except Exception as e: - raise StorageError(f"Failed to remove vectors from ANN index: {e}") - - def search( - self, query: np.ndarray, top_k: int = 10 - ) -> Tuple[List[int], List[float]]: - """Search for nearest neighbors. - - Args: - query: Query vector of shape (dim,) or (1, dim) - top_k: Number of nearest neighbors to return - - Returns: - Tuple of (ids, distances) where: - - ids: List of vector IDs ordered by similarity - - distances: List of cosine distances (lower = more similar) - - Raises: - ValueError: If query shape is invalid - StorageError: If search operation fails - """ - # Validate query shape - if query.ndim == 1: - query = query.reshape(1, -1) - - if query.shape[0] != 1: - raise ValueError( - f"Query must be a single vector, got shape {query.shape}" - ) - - if query.shape[1] != self.dim: - raise ValueError( - f"Query dimension ({query.shape[1]}) must match index dimension ({self.dim})" - ) - - with self._lock: - try: - if self._index is None or self._current_count == 0: - return [], [] # Empty index - - effective_k = min(max(int(top_k), 0), self._current_count) - if effective_k == 0: - return [], [] - - try: - self._index.set_ef(max(self.ef, effective_k)) - except Exception: - pass - - while True: 
- try: - labels, distances = self._index.knn_query(query, k=effective_k) - break - except Exception as exc: - if "contiguous 2D array" in str(exc) and effective_k > 1: - next_k = max(1, effective_k // 2) - logger.debug( - "ANN search knn_query failed for k=%d; retrying with k=%d: %s", - effective_k, - next_k, - exc, - ) - if next_k == effective_k: - raise - effective_k = next_k - try: - self._index.set_ef(max(self.ef, effective_k)) - except Exception: - pass - continue - raise - - # Convert to lists and flatten (knn_query returns 2D arrays) - ids = labels[0].tolist() - dists = distances[0].tolist() - - return ids, dists - - except Exception as e: - raise StorageError(f"Failed to search ANN index: {e}") - - def save(self) -> None: - """Save index to disk. - - Index is saved to [db_path_directory]/_vectors.hnsw - - Raises: - StorageError: If save operation fails - """ - with self._lock: - try: - if self._index is None or self._current_count == 0: - logger.debug("Skipping save: index is empty") - return # Nothing to save - - # Ensure parent directory exists - self.hnsw_path.parent.mkdir(parents=True, exist_ok=True) - - # Save index - self._index.save_index(str(self.hnsw_path)) - - logger.debug( - f"Saved index to {self.hnsw_path} " - f"({self._current_count} vectors, capacity: {self._max_elements})" - ) - - except Exception as e: - raise StorageError(f"Failed to save ANN index: {e}") - - def load(self) -> bool: - """Load index from disk. 
- - Returns: - True if index was loaded successfully, False if index file doesn't exist - - Raises: - StorageError: If load operation fails - """ - with self._lock: - try: - if not self.hnsw_path.exists(): - logger.debug(f"Index file not found: {self.hnsw_path}") - return False # Index file doesn't exist (not an error) - - # Create fresh index object for loading (don't call init_index first) - self._index = hnswlib.Index(space=self.space, dim=self.dim) - - # Load index from disk - # Note: max_elements here is just for initial allocation, can expand later - self._index.load_index(str(self.hnsw_path), max_elements=self._max_elements) - - # Update count and capacity from loaded index - self._current_count = self._index.get_current_count() - self._max_elements = self._index.get_max_elements() - - # Set query-time ef parameter - self._index.set_ef(self.ef) - - logger.info( - f"Loaded index from {self.hnsw_path} " - f"({self._current_count} vectors, capacity: {self._max_elements})" - ) - - return True - - except Exception as e: - raise StorageError(f"Failed to load ANN index: {e}") - - def count(self) -> int: - """Get number of vectors in the index. - - Returns: - Number of vectors currently in the index - """ - with self._lock: - return self._current_count - - @property - def capacity(self) -> int: - """Get current maximum capacity of the index. - - Returns: - Maximum number of vectors the index can hold before expansion - """ - with self._lock: - return self._max_elements - - @property - def usage_ratio(self) -> float: - """Get current usage ratio (count / capacity). - - Returns: - Usage ratio between 0.0 and 1.0 - """ - with self._lock: - if self._max_elements == 0: - return 0.0 - return self._current_count / self._max_elements - - @property - def is_loaded(self) -> bool: - """Check if index is loaded and ready for use. 
- - Returns: - True if index is loaded, False otherwise - """ - with self._lock: - return self._index is not None and self._current_count > 0 - - - -class BinaryANNIndex: - """Binary vector ANN index using Hamming distance for fast coarse retrieval. - - .. deprecated:: - This class is deprecated. Use :class:`codexlens.search.binary_searcher.BinarySearcher` - instead, which provides faster memory-mapped search with centralized storage. - - Optimized for binary vectors (256-bit / 32 bytes per vector). - Uses packed binary representation for memory efficiency. - - Performance characteristics: - - Storage: 32 bytes per vector (vs ~8KB for dense vectors) - - Distance: Hamming distance via XOR + popcount (CPU-efficient) - - Search: O(N) brute-force with SIMD-accelerated distance computation - - Index parameters: - - dim: Binary vector dimension (default: 256) - - packed_dim: Packed bytes size (dim / 8 = 32 for 256-bit) - - Usage: - index = BinaryANNIndex(index_path, dim=256) - index.add_vectors([1, 2, 3], packed_vectors) # List of 32-byte packed vectors - ids, distances = index.search(query_packed, top_k=10) - """ - - DEFAULT_DIM = 256 # Default binary vector dimension - - def __init__( - self, - index_path: Path, - dim: int = 256, - initial_capacity: int = 100000, - auto_save: bool = False, - ) -> None: - """Initialize Binary ANN index. - - Args: - index_path: Path to database (index will be saved as _binary_vectors.bin) - dim: Dimension of binary vectors (default: 256) - initial_capacity: Initial capacity hint (default: 100000) - auto_save: Whether to automatically save index after operations - - Raises: - ImportError: If required dependencies are not available - ValueError: If dimension is invalid - """ - if not SEMANTIC_AVAILABLE: - raise ImportError( - "Semantic search dependencies not available. " - "Install with: pip install codexlens[semantic]" - ) - - import warnings - warnings.warn( - "BinaryANNIndex is deprecated. 
Use codexlens.search.binary_searcher.BinarySearcher " - "instead for faster memory-mapped search with centralized storage.", - DeprecationWarning, - stacklevel=2 - ) - - if dim <= 0 or dim % 8 != 0: - raise ValueError( - f"Invalid dimension: {dim}. Must be positive and divisible by 8." - ) - - self.index_path = Path(index_path) - self.dim = dim - self.packed_dim = dim // 8 # 32 bytes for 256-bit vectors - - # Derive binary index path from database path - db_stem = self.index_path.stem - self.binary_path = self.index_path.parent / f"{db_stem}_binary_vectors.bin" - - # Memory management - self._auto_save = auto_save - self._initial_capacity = initial_capacity - - # Thread safety - self._lock = threading.RLock() - - # In-memory storage: id -> packed binary vector - self._vectors: dict[int, bytes] = {} - self._id_list: list[int] = [] # Ordered list for efficient iteration - - # Cached numpy array for vectorized search (invalidated on add/remove) - self._vectors_matrix: Optional[np.ndarray] = None - self._ids_array: Optional[np.ndarray] = None - self._cache_valid: bool = False - - logger.info( - f"Initialized BinaryANNIndex with dim={dim}, packed_dim={self.packed_dim}" - ) - - def add_vectors(self, ids: List[int], vectors: List[bytes]) -> None: - """Add packed binary vectors to the index. 
- - Args: - ids: List of vector IDs (must be unique) - vectors: List of packed binary vectors (each of size packed_dim bytes) - - Raises: - ValueError: If shapes don't match or vectors are invalid - StorageError: If index operation fails - """ - if len(ids) == 0: - return - - if len(vectors) != len(ids): - raise ValueError( - f"Number of vectors ({len(vectors)}) must match number of IDs ({len(ids)})" - ) - - # Validate vector sizes - for i, vec in enumerate(vectors): - if len(vec) != self.packed_dim: - raise ValueError( - f"Vector {i} has size {len(vec)}, expected {self.packed_dim}" - ) - - with self._lock: - try: - for vec_id, vec in zip(ids, vectors): - if vec_id not in self._vectors: - self._id_list.append(vec_id) - self._vectors[vec_id] = vec - - # Invalidate cache on modification - self._cache_valid = False - - logger.debug( - f"Added {len(ids)} binary vectors to index (total: {len(self._vectors)})" - ) - - if self._auto_save: - self.save() - - except Exception as e: - raise StorageError(f"Failed to add vectors to Binary ANN index: {e}") - - def add_vectors_numpy(self, ids: List[int], vectors: np.ndarray) -> None: - """Add unpacked binary vectors (0/1 values) to the index. - - Convenience method that packs the vectors before adding. 
- - Args: - ids: List of vector IDs (must be unique) - vectors: Numpy array of shape (N, dim) with binary values (0 or 1) - - Raises: - ValueError: If shapes don't match - StorageError: If index operation fails - """ - if len(ids) == 0: - return - - if vectors.shape[0] != len(ids): - raise ValueError( - f"Number of vectors ({vectors.shape[0]}) must match number of IDs ({len(ids)})" - ) - - if vectors.shape[1] != self.dim: - raise ValueError( - f"Vector dimension ({vectors.shape[1]}) must match index dimension ({self.dim})" - ) - - # Pack vectors - packed_vectors = [] - for i in range(vectors.shape[0]): - packed = np.packbits(vectors[i].astype(np.uint8)).tobytes() - packed_vectors.append(packed) - - self.add_vectors(ids, packed_vectors) - - def remove_vectors(self, ids: List[int]) -> None: - """Remove vectors from the index. - - Args: - ids: List of vector IDs to remove - - Raises: - StorageError: If index operation fails - - Note: - Optimized for batch deletion using set operations instead of - O(N) list.remove() calls for each ID. - """ - if len(ids) == 0: - return - - with self._lock: - try: - # Use set for O(1) lookup during filtering - ids_to_remove = set(ids) - removed_count = 0 - - # Remove from dictionary - O(1) per deletion - for vec_id in ids_to_remove: - if vec_id in self._vectors: - del self._vectors[vec_id] - removed_count += 1 - - # Rebuild ID list efficiently - O(N) once instead of O(N) per removal - if removed_count > 0: - self._id_list = [id_ for id_ in self._id_list if id_ not in ids_to_remove] - # Invalidate cache on modification - self._cache_valid = False - - logger.debug(f"Removed {removed_count}/{len(ids)} vectors from index") - - if self._auto_save and removed_count > 0: - self.save() - - except Exception as e: - raise StorageError( - f"Failed to remove vectors from Binary ANN index: {e}" - ) - - def _build_cache(self) -> None: - """Build numpy array cache from vectors dict for vectorized search. 
- - Pre-computes a contiguous numpy array from all vectors for efficient - batch distance computation. Called lazily on first search after modification. - """ - if self._cache_valid: - return - - n_vectors = len(self._id_list) - if n_vectors == 0: - self._vectors_matrix = None - self._ids_array = None - self._cache_valid = True - return - - # Build contiguous numpy array of all packed vectors - # Shape: (n_vectors, packed_dim) with uint8 dtype - self._vectors_matrix = np.empty((n_vectors, self.packed_dim), dtype=np.uint8) - self._ids_array = np.array(self._id_list, dtype=np.int64) - - for i, vec_id in enumerate(self._id_list): - vec_bytes = self._vectors[vec_id] - self._vectors_matrix[i] = np.frombuffer(vec_bytes, dtype=np.uint8) - - self._cache_valid = True - logger.debug(f"Built vectorized cache for {n_vectors} binary vectors") - - def search( - self, query: bytes, top_k: int = 10 - ) -> Tuple[List[int], List[int]]: - """Search for nearest neighbors using Hamming distance. - - Uses vectorized batch computation for O(N) search with SIMD acceleration. - Pre-computes and caches numpy arrays for efficient repeated queries. - - Args: - query: Packed binary query vector (size: packed_dim bytes) - top_k: Number of nearest neighbors to return - - Returns: - Tuple of (ids, distances) where: - - ids: List of vector IDs ordered by Hamming distance (ascending) - - distances: List of Hamming distances (lower = more similar) - - Raises: - ValueError: If query size is invalid - StorageError: If search operation fails - """ - if len(query) != self.packed_dim: - raise ValueError( - f"Query size ({len(query)}) must match packed_dim ({self.packed_dim})" - ) - - with self._lock: - try: - if len(self._vectors) == 0: - return [], [] - - # Build cache if needed (lazy initialization) - self._build_cache() - - if self._vectors_matrix is None or self._ids_array is None: - return [], [] - - # Vectorized Hamming distance computation - # 1. 
Convert query to numpy array - query_arr = np.frombuffer(query, dtype=np.uint8) - - # 2. Broadcast XOR: (1, packed_dim) XOR (n_vectors, packed_dim) - # Result shape: (n_vectors, packed_dim) - xor_result = np.bitwise_xor(query_arr, self._vectors_matrix) - - # 3. Vectorized popcount using lookup table for efficiency - # np.unpackbits is slow for large arrays, use popcount LUT instead - popcount_lut = np.array([bin(i).count('1') for i in range(256)], dtype=np.uint8) - bit_counts = popcount_lut[xor_result] - - # 4. Sum across packed bytes to get Hamming distance per vector - distances = bit_counts.sum(axis=1) - - # 5. Get top-k using argpartition (O(N) instead of O(N log N) for full sort) - n_vectors = len(distances) - k = min(top_k, n_vectors) - - if k == n_vectors: - # No partitioning needed, just sort all - sorted_indices = np.argsort(distances) - else: - # Use argpartition for O(N) partial sort - partition_indices = np.argpartition(distances, k)[:k] - # Sort only the top-k - top_k_distances = distances[partition_indices] - sorted_order = np.argsort(top_k_distances) - sorted_indices = partition_indices[sorted_order] - - # 6. Return results - result_ids = self._ids_array[sorted_indices].tolist() - result_dists = distances[sorted_indices].tolist() - - return result_ids, result_dists - - except Exception as e: - raise StorageError(f"Failed to search Binary ANN index: {e}") - - def search_numpy( - self, query: np.ndarray, top_k: int = 10 - ) -> Tuple[List[int], List[int]]: - """Search with unpacked binary query vector. - - Convenience method that packs the query before searching. 
- - Args: - query: Binary query vector of shape (dim,) with values 0 or 1 - top_k: Number of nearest neighbors to return - - Returns: - Tuple of (ids, distances) - """ - if query.ndim == 2: - query = query.flatten() - - if len(query) != self.dim: - raise ValueError( - f"Query dimension ({len(query)}) must match index dimension ({self.dim})" - ) - - packed_query = np.packbits(query.astype(np.uint8)).tobytes() - return self.search(packed_query, top_k) - - def search_batch( - self, queries: List[bytes], top_k: int = 10 - ) -> List[Tuple[List[int], List[int]]]: - """Batch search for multiple queries. - - Args: - queries: List of packed binary query vectors - top_k: Number of nearest neighbors to return per query - - Returns: - List of (ids, distances) tuples, one per query - """ - results = [] - for query in queries: - ids, dists = self.search(query, top_k) - results.append((ids, dists)) - return results - - def save(self) -> None: - """Save index to disk. - - Binary format: - - 4 bytes: magic number (0x42494E56 = "BINV") - - 4 bytes: version (1) - - 4 bytes: dim - - 4 bytes: packed_dim - - 4 bytes: num_vectors - - For each vector: - - 4 bytes: id - - packed_dim bytes: vector data - - Raises: - StorageError: If save operation fails - """ - with self._lock: - try: - if len(self._vectors) == 0: - logger.debug("Skipping save: index is empty") - return - - # Ensure parent directory exists - self.binary_path.parent.mkdir(parents=True, exist_ok=True) - - with open(self.binary_path, "wb") as f: - # Header - f.write(b"BINV") # Magic number - f.write(np.array([1], dtype=np.uint32).tobytes()) # Version - f.write(np.array([self.dim], dtype=np.uint32).tobytes()) - f.write(np.array([self.packed_dim], dtype=np.uint32).tobytes()) - f.write( - np.array([len(self._vectors)], dtype=np.uint32).tobytes() - ) - - # Vectors - for vec_id in self._id_list: - f.write(np.array([vec_id], dtype=np.uint32).tobytes()) - f.write(self._vectors[vec_id]) - - logger.debug( - f"Saved binary index to 
{self.binary_path} " - f"({len(self._vectors)} vectors)" - ) - - except Exception as e: - raise StorageError(f"Failed to save Binary ANN index: {e}") - - def load(self) -> bool: - """Load index from disk. - - Returns: - True if index was loaded successfully, False if index file doesn't exist - - Raises: - StorageError: If load operation fails - """ - with self._lock: - try: - if not self.binary_path.exists(): - logger.debug(f"Binary index file not found: {self.binary_path}") - return False - - with open(self.binary_path, "rb") as f: - # Read header - magic = f.read(4) - if magic != b"BINV": - raise StorageError( - f"Invalid binary index file: bad magic number" - ) - - version = np.frombuffer(f.read(4), dtype=np.uint32)[0] - if version != 1: - raise StorageError( - f"Unsupported binary index version: {version}" - ) - - file_dim = np.frombuffer(f.read(4), dtype=np.uint32)[0] - file_packed_dim = np.frombuffer(f.read(4), dtype=np.uint32)[0] - num_vectors = np.frombuffer(f.read(4), dtype=np.uint32)[0] - - if file_dim != self.dim or file_packed_dim != self.packed_dim: - raise StorageError( - f"Dimension mismatch: file has dim={file_dim}, " - f"packed_dim={file_packed_dim}, " - f"expected dim={self.dim}, packed_dim={self.packed_dim}" - ) - - # Clear existing data - self._vectors.clear() - self._id_list.clear() - self._cache_valid = False - - # Read vectors - for _ in range(num_vectors): - vec_id = np.frombuffer(f.read(4), dtype=np.uint32)[0] - vec_data = f.read(self.packed_dim) - self._vectors[int(vec_id)] = vec_data - self._id_list.append(int(vec_id)) - - logger.info( - f"Loaded binary index from {self.binary_path} " - f"({len(self._vectors)} vectors)" - ) - - return True - - except StorageError: - raise - except Exception as e: - raise StorageError(f"Failed to load Binary ANN index: {e}") - - def count(self) -> int: - """Get number of vectors in the index. 
- - Returns: - Number of vectors currently in the index - """ - with self._lock: - return len(self._vectors) - - @property - def is_loaded(self) -> bool: - """Check if index has vectors. - - Returns: - True if index has vectors, False otherwise - """ - with self._lock: - return len(self._vectors) > 0 - - def get_vector(self, vec_id: int) -> Optional[bytes]: - """Get a specific vector by ID. - - Args: - vec_id: Vector ID to retrieve - - Returns: - Packed binary vector or None if not found - """ - with self._lock: - return self._vectors.get(vec_id) - - def clear(self) -> None: - """Clear all vectors from the index.""" - with self._lock: - self._vectors.clear() - self._id_list.clear() - self._vectors_matrix = None - self._ids_array = None - self._cache_valid = False - logger.debug("Cleared binary index") - - -def create_ann_index( - index_path: Path, - index_type: str = "hnsw", - dim: int = 2048, - **kwargs, -) -> ANNIndex | BinaryANNIndex: - """Factory function to create an ANN index. - - Args: - index_path: Path to database file - index_type: Type of index - "hnsw" for dense vectors, "binary" for binary vectors - dim: Vector dimension (default: 2048 for dense, 256 for binary) - **kwargs: Additional arguments passed to the index constructor - - Returns: - ANNIndex for dense vectors or BinaryANNIndex for binary vectors - - Raises: - ValueError: If index_type is invalid - - Example: - >>> # Dense vector index (HNSW) - >>> dense_index = create_ann_index(path, index_type="hnsw", dim=2048) - >>> dense_index.add_vectors(ids, dense_vectors) - >>> - >>> # Binary vector index (Hamming distance) - >>> binary_index = create_ann_index(path, index_type="binary", dim=256) - >>> binary_index.add_vectors(ids, packed_vectors) - """ - index_type = index_type.lower() - - if index_type == "hnsw": - return ANNIndex(index_path=index_path, dim=dim, **kwargs) - elif index_type == "binary": - # Default to 256 for binary if not specified - if dim == 2048: # Default dense dim was used - dim = 
256 - return BinaryANNIndex(index_path=index_path, dim=dim, **kwargs) - else: - raise ValueError( - f"Invalid index_type: {index_type}. Must be 'hnsw' or 'binary'." - ) diff --git a/codex-lens/src/codexlens/semantic/base.py b/codex-lens/src/codexlens/semantic/base.py deleted file mode 100644 index bf8109a0..00000000 --- a/codex-lens/src/codexlens/semantic/base.py +++ /dev/null @@ -1,61 +0,0 @@ -"""Base class for embedders. - -Defines the interface that all embedders must implement. -""" - -from __future__ import annotations - -from abc import ABC, abstractmethod -from typing import Iterable - -import numpy as np - - -class BaseEmbedder(ABC): - """Base class for all embedders. - - All embedder implementations must inherit from this class and implement - the abstract methods to ensure a consistent interface. - """ - - @property - @abstractmethod - def embedding_dim(self) -> int: - """Return embedding dimensions. - - Returns: - int: Dimension of the embedding vectors. - """ - ... - - @property - @abstractmethod - def model_name(self) -> str: - """Return model name. - - Returns: - str: Name or identifier of the underlying model. - """ - ... - - @property - def max_tokens(self) -> int: - """Return maximum token limit for embeddings. - - Returns: - int: Maximum number of tokens that can be embedded at once. - Default is 8192 if not overridden by implementation. - """ - return 8192 - - @abstractmethod - def embed_to_numpy(self, texts: str | Iterable[str]) -> np.ndarray: - """Embed texts to numpy array. - - Args: - texts: Single text or iterable of texts to embed. - - Returns: - numpy.ndarray: Array of shape (n_texts, embedding_dim) containing embeddings. - """ - ... diff --git a/codex-lens/src/codexlens/semantic/chunker.py b/codex-lens/src/codexlens/semantic/chunker.py deleted file mode 100644 index 05d3eb50..00000000 --- a/codex-lens/src/codexlens/semantic/chunker.py +++ /dev/null @@ -1,821 +0,0 @@ -"""Code chunking strategies for semantic search. 
- -This module provides various chunking strategies for breaking down source code -into semantic chunks suitable for embedding and search. - -Lightweight Mode: - The ChunkConfig supports a `skip_token_count` option for performance optimization. - When enabled, token counting uses a fast character-based estimation (char/4) - instead of expensive tiktoken encoding. - - Use cases for lightweight mode: - - Large-scale indexing where speed is critical - - Scenarios where approximate token counts are acceptable - - Memory-constrained environments - - Initial prototyping and development - - Example: - # Default mode (accurate tiktoken encoding) - config = ChunkConfig() - chunker = Chunker(config) - - # Lightweight mode (fast char/4 estimation) - config = ChunkConfig(skip_token_count=True) - chunker = Chunker(config) - chunks = chunker.chunk_file(content, symbols, path, language) -""" - -from __future__ import annotations - -from dataclasses import dataclass -from pathlib import Path -from typing import List, Optional, Tuple - -from codexlens.entities import SemanticChunk, Symbol -from codexlens.parsers.tokenizer import get_default_tokenizer - - -@dataclass -class ChunkConfig: - """Configuration for chunking strategies.""" - max_chunk_size: int = 1000 # Max characters per chunk - overlap: int = 200 # Overlap for sliding window (increased from 100 for better context) - strategy: str = "auto" # Chunking strategy: auto, symbol, sliding_window, hybrid - min_chunk_size: int = 50 # Minimum chunk size - skip_token_count: bool = False # Skip expensive token counting (use char/4 estimate) - strip_comments: bool = True # Remove comments from chunk content for embedding - strip_docstrings: bool = True # Remove docstrings from chunk content for embedding - preserve_original: bool = True # Store original content in metadata when stripping - - -class CommentStripper: - """Remove comments from source code while preserving structure.""" - - @staticmethod - def 
strip_python_comments(content: str) -> str: - """Strip Python comments (# style) but preserve docstrings. - - Args: - content: Python source code - - Returns: - Code with comments removed - """ - lines = content.splitlines(keepends=True) - result_lines: List[str] = [] - in_string = False - string_char = None - - for line in lines: - new_line = [] - i = 0 - while i < len(line): - char = line[i] - - # Handle string literals - if char in ('"', "'") and not in_string: - # Check for triple quotes - if line[i:i+3] in ('"""', "'''"): - in_string = True - string_char = line[i:i+3] - new_line.append(line[i:i+3]) - i += 3 - continue - else: - in_string = True - string_char = char - elif in_string: - if string_char and len(string_char) == 3: - if line[i:i+3] == string_char: - in_string = False - new_line.append(line[i:i+3]) - i += 3 - string_char = None - continue - elif char == string_char: - # Check for escape - if i > 0 and line[i-1] != '\\': - in_string = False - string_char = None - - # Handle comments (only outside strings) - if char == '#' and not in_string: - # Rest of line is comment, skip it - new_line.append('\n' if line.endswith('\n') else '') - break - - new_line.append(char) - i += 1 - - result_lines.append(''.join(new_line)) - - return ''.join(result_lines) - - @staticmethod - def strip_c_style_comments(content: str) -> str: - """Strip C-style comments (// and /* */) from code. 
- - Args: - content: Source code with C-style comments - - Returns: - Code with comments removed - """ - result = [] - i = 0 - in_string = False - string_char = None - in_multiline_comment = False - - while i < len(content): - # Handle multi-line comment end - if in_multiline_comment: - if content[i:i+2] == '*/': - in_multiline_comment = False - i += 2 - continue - i += 1 - continue - - char = content[i] - - # Handle string literals - if char in ('"', "'", '`') and not in_string: - in_string = True - string_char = char - result.append(char) - i += 1 - continue - elif in_string: - result.append(char) - if char == string_char and (i == 0 or content[i-1] != '\\'): - in_string = False - string_char = None - i += 1 - continue - - # Handle comments - if content[i:i+2] == '//': - # Single line comment - skip to end of line - while i < len(content) and content[i] != '\n': - i += 1 - if i < len(content): - result.append('\n') - i += 1 - continue - - if content[i:i+2] == '/*': - in_multiline_comment = True - i += 2 - continue - - result.append(char) - i += 1 - - return ''.join(result) - - @classmethod - def strip_comments(cls, content: str, language: str) -> str: - """Strip comments based on language. - - Args: - content: Source code content - language: Programming language - - Returns: - Code with comments removed - """ - if language == "python": - return cls.strip_python_comments(content) - elif language in {"javascript", "typescript", "java", "c", "cpp", "go", "rust"}: - return cls.strip_c_style_comments(content) - return content - - -class DocstringStripper: - """Remove docstrings from source code.""" - - @staticmethod - def strip_python_docstrings(content: str) -> str: - """Strip Python docstrings (triple-quoted strings at module/class/function level). 
- - Args: - content: Python source code - - Returns: - Code with docstrings removed - """ - lines = content.splitlines(keepends=True) - result_lines: List[str] = [] - i = 0 - - while i < len(lines): - line = lines[i] - stripped = line.strip() - - # Check for docstring start - if stripped.startswith('"""') or stripped.startswith("'''"): - quote_type = '"""' if stripped.startswith('"""') else "'''" - - # Single line docstring - if stripped.count(quote_type) >= 2: - # Skip this line (docstring) - i += 1 - continue - - # Multi-line docstring - skip until closing - i += 1 - while i < len(lines): - if quote_type in lines[i]: - i += 1 - break - i += 1 - continue - - result_lines.append(line) - i += 1 - - return ''.join(result_lines) - - @staticmethod - def strip_jsdoc_comments(content: str) -> str: - """Strip JSDoc comments (/** ... */) from code. - - Args: - content: JavaScript/TypeScript source code - - Returns: - Code with JSDoc comments removed - """ - result = [] - i = 0 - in_jsdoc = False - - while i < len(content): - if in_jsdoc: - if content[i:i+2] == '*/': - in_jsdoc = False - i += 2 - continue - i += 1 - continue - - # Check for JSDoc start (/** but not /*) - if content[i:i+3] == '/**': - in_jsdoc = True - i += 3 - continue - - result.append(content[i]) - i += 1 - - return ''.join(result) - - @classmethod - def strip_docstrings(cls, content: str, language: str) -> str: - """Strip docstrings based on language. 
- - Args: - content: Source code content - language: Programming language - - Returns: - Code with docstrings removed - """ - if language == "python": - return cls.strip_python_docstrings(content) - elif language in {"javascript", "typescript"}: - return cls.strip_jsdoc_comments(content) - return content - - -class Chunker: - """Chunk code files for semantic embedding.""" - - def __init__(self, config: ChunkConfig | None = None) -> None: - self.config = config or ChunkConfig() - self._tokenizer = get_default_tokenizer() - self._comment_stripper = CommentStripper() - self._docstring_stripper = DocstringStripper() - - def _process_content(self, content: str, language: str) -> Tuple[str, Optional[str]]: - """Process chunk content by stripping comments/docstrings if configured. - - Args: - content: Original chunk content - language: Programming language - - Returns: - Tuple of (processed_content, original_content_if_preserved) - """ - original = content if self.config.preserve_original else None - processed = content - - if self.config.strip_comments: - processed = self._comment_stripper.strip_comments(processed, language) - - if self.config.strip_docstrings: - processed = self._docstring_stripper.strip_docstrings(processed, language) - - # If nothing changed, don't store original - if processed == content: - original = None - - return processed, original - - def _estimate_token_count(self, text: str) -> int: - """Estimate token count based on config. - - If skip_token_count is True, uses character-based estimation (char/4). - Otherwise, uses accurate tiktoken encoding. 
- - Args: - text: Text to count tokens for - - Returns: - Estimated token count - """ - if self.config.skip_token_count: - # Fast character-based estimation: ~4 chars per token - return max(1, len(text) // 4) - return self._tokenizer.count_tokens(text) - - def chunk_by_symbol( - self, - content: str, - symbols: List[Symbol], - file_path: str | Path, - language: str, - symbol_token_counts: Optional[dict[str, int]] = None, - ) -> List[SemanticChunk]: - """Chunk code by extracted symbols (functions, classes). - - Each symbol becomes one chunk with its full content. - Large symbols exceeding max_chunk_size are recursively split using sliding window. - - Args: - content: Source code content - symbols: List of extracted symbols - file_path: Path to source file - language: Programming language - symbol_token_counts: Optional dict mapping symbol names to token counts - """ - chunks: List[SemanticChunk] = [] - lines = content.splitlines(keepends=True) - - for symbol in symbols: - start_line, end_line = symbol.range - # Convert to 0-indexed - start_idx = max(0, start_line - 1) - end_idx = min(len(lines), end_line) - - chunk_content = "".join(lines[start_idx:end_idx]) - if len(chunk_content.strip()) < self.config.min_chunk_size: - continue - - # Check if symbol content exceeds max_chunk_size - if len(chunk_content) > self.config.max_chunk_size: - # Create line mapping for correct line number tracking - line_mapping = list(range(start_line, end_line + 1)) - - # Use sliding window to split large symbol - sub_chunks = self.chunk_sliding_window( - chunk_content, - file_path=file_path, - language=language, - line_mapping=line_mapping - ) - - # Update sub_chunks with parent symbol metadata - for sub_chunk in sub_chunks: - sub_chunk.metadata["symbol_name"] = symbol.name - sub_chunk.metadata["symbol_kind"] = symbol.kind - sub_chunk.metadata["strategy"] = "symbol_split" - sub_chunk.metadata["chunk_type"] = "code" - sub_chunk.metadata["parent_symbol_range"] = (start_line, end_line) - - 
chunks.extend(sub_chunks) - else: - # Process content (strip comments/docstrings if configured) - processed_content, original_content = self._process_content(chunk_content, language) - - # Skip if processed content is too small - if len(processed_content.strip()) < self.config.min_chunk_size: - continue - - # Calculate token count if not provided - token_count = None - if symbol_token_counts and symbol.name in symbol_token_counts: - token_count = symbol_token_counts[symbol.name] - else: - token_count = self._estimate_token_count(processed_content) - - metadata = { - "file": str(file_path), - "language": language, - "symbol_name": symbol.name, - "symbol_kind": symbol.kind, - "start_line": start_line, - "end_line": end_line, - "strategy": "symbol", - "chunk_type": "code", - "token_count": token_count, - } - - # Store original content if it was modified - if original_content is not None: - metadata["original_content"] = original_content - - chunks.append(SemanticChunk( - content=processed_content, - embedding=None, - metadata=metadata - )) - - return chunks - - def chunk_sliding_window( - self, - content: str, - file_path: str | Path, - language: str, - line_mapping: Optional[List[int]] = None, - ) -> List[SemanticChunk]: - """Chunk code using sliding window approach. - - Used for files without clear symbol boundaries or very long functions. - - Args: - content: Source code content - file_path: Path to source file - language: Programming language - line_mapping: Optional list mapping content line indices to original line numbers - (1-indexed). If provided, line_mapping[i] is the original line number - for the i-th line in content. 
- """ - chunks: List[SemanticChunk] = [] - lines = content.splitlines(keepends=True) - - if not lines: - return chunks - - # Calculate lines per chunk based on average line length - avg_line_len = len(content) / max(len(lines), 1) - lines_per_chunk = max(10, int(self.config.max_chunk_size / max(avg_line_len, 1))) - overlap_lines = max(2, int(self.config.overlap / max(avg_line_len, 1))) - # Ensure overlap is less than chunk size to prevent infinite loop - overlap_lines = min(overlap_lines, lines_per_chunk - 1) - - start = 0 - chunk_idx = 0 - - while start < len(lines): - end = min(start + lines_per_chunk, len(lines)) - chunk_content = "".join(lines[start:end]) - - if len(chunk_content.strip()) >= self.config.min_chunk_size: - # Process content (strip comments/docstrings if configured) - processed_content, original_content = self._process_content(chunk_content, language) - - # Skip if processed content is too small - if len(processed_content.strip()) < self.config.min_chunk_size: - # Move window forward - step = lines_per_chunk - overlap_lines - if step <= 0: - step = 1 - start += step - continue - - token_count = self._estimate_token_count(processed_content) - - # Calculate correct line numbers - if line_mapping: - # Use line mapping to get original line numbers - start_line = line_mapping[start] - end_line = line_mapping[end - 1] - else: - # Default behavior: treat content as starting at line 1 - start_line = start + 1 - end_line = end - - metadata = { - "file": str(file_path), - "language": language, - "chunk_index": chunk_idx, - "start_line": start_line, - "end_line": end_line, - "strategy": "sliding_window", - "chunk_type": "code", - "token_count": token_count, - } - - # Store original content if it was modified - if original_content is not None: - metadata["original_content"] = original_content - - chunks.append(SemanticChunk( - content=processed_content, - embedding=None, - metadata=metadata - )) - chunk_idx += 1 - - # Move window, accounting for overlap - 
step = lines_per_chunk - overlap_lines - if step <= 0: - step = 1 # Failsafe to prevent infinite loop - start += step - - # Break if we've reached the end - if end >= len(lines): - break - - return chunks - - def chunk_file( - self, - content: str, - symbols: List[Symbol], - file_path: str | Path, - language: str, - symbol_token_counts: Optional[dict[str, int]] = None, - ) -> List[SemanticChunk]: - """Chunk a file using the best strategy. - - Uses symbol-based chunking if symbols available, - falls back to sliding window for files without symbols. - - Args: - content: Source code content - symbols: List of extracted symbols - file_path: Path to source file - language: Programming language - symbol_token_counts: Optional dict mapping symbol names to token counts - """ - if symbols: - return self.chunk_by_symbol(content, symbols, file_path, language, symbol_token_counts) - return self.chunk_sliding_window(content, file_path, language) - -class DocstringExtractor: - """Extract docstrings from source code.""" - - @staticmethod - def extract_python_docstrings(content: str) -> List[Tuple[str, int, int]]: - """Extract Python docstrings with their line ranges. 
- - Returns: List of (docstring_content, start_line, end_line) tuples - """ - docstrings: List[Tuple[str, int, int]] = [] - lines = content.splitlines(keepends=True) - - i = 0 - while i < len(lines): - line = lines[i] - stripped = line.strip() - if stripped.startswith('"""') or stripped.startswith("'''"): - quote_type = '"""' if stripped.startswith('"""') else "'''" - start_line = i + 1 - - if stripped.count(quote_type) >= 2: - docstring_content = line - end_line = i + 1 - docstrings.append((docstring_content, start_line, end_line)) - i += 1 - continue - - docstring_lines = [line] - i += 1 - while i < len(lines): - docstring_lines.append(lines[i]) - if quote_type in lines[i]: - break - i += 1 - - end_line = i + 1 - docstring_content = "".join(docstring_lines) - docstrings.append((docstring_content, start_line, end_line)) - - i += 1 - - return docstrings - - @staticmethod - def extract_jsdoc_comments(content: str) -> List[Tuple[str, int, int]]: - """Extract JSDoc comments with their line ranges. - - Returns: List of (comment_content, start_line, end_line) tuples - """ - comments: List[Tuple[str, int, int]] = [] - lines = content.splitlines(keepends=True) - - i = 0 - while i < len(lines): - line = lines[i] - stripped = line.strip() - - if stripped.startswith('/**'): - start_line = i + 1 - comment_lines = [line] - i += 1 - - while i < len(lines): - comment_lines.append(lines[i]) - if '*/' in lines[i]: - break - i += 1 - - end_line = i + 1 - comment_content = "".join(comment_lines) - comments.append((comment_content, start_line, end_line)) - - i += 1 - - return comments - - @classmethod - def extract_docstrings( - cls, - content: str, - language: str - ) -> List[Tuple[str, int, int]]: - """Extract docstrings based on language. 
- - Returns: List of (docstring_content, start_line, end_line) tuples - """ - if language == "python": - return cls.extract_python_docstrings(content) - elif language in {"javascript", "typescript"}: - return cls.extract_jsdoc_comments(content) - return [] - - -class HybridChunker: - """Hybrid chunker that prioritizes docstrings before symbol-based chunking. - - Composition-based strategy that: - 1. Extracts docstrings as dedicated chunks - 2. For remaining code, uses base chunker (symbol or sliding window) - """ - - def __init__( - self, - base_chunker: Chunker | None = None, - config: ChunkConfig | None = None - ) -> None: - """Initialize hybrid chunker. - - Args: - base_chunker: Chunker to use for non-docstring content - config: Configuration for chunking - """ - self.config = config or ChunkConfig() - self.base_chunker = base_chunker or Chunker(self.config) - self.docstring_extractor = DocstringExtractor() - - def _get_excluded_line_ranges( - self, - docstrings: List[Tuple[str, int, int]] - ) -> set[int]: - """Get set of line numbers that are part of docstrings.""" - excluded_lines: set[int] = set() - for _, start_line, end_line in docstrings: - for line_num in range(start_line, end_line + 1): - excluded_lines.add(line_num) - return excluded_lines - - def _filter_symbols_outside_docstrings( - self, - symbols: List[Symbol], - excluded_lines: set[int] - ) -> List[Symbol]: - """Filter symbols to exclude those completely within docstrings.""" - filtered: List[Symbol] = [] - for symbol in symbols: - start_line, end_line = symbol.range - symbol_lines = set(range(start_line, end_line + 1)) - if not symbol_lines.issubset(excluded_lines): - filtered.append(symbol) - return filtered - - def _find_parent_symbol( - self, - start_line: int, - end_line: int, - symbols: List[Symbol], - ) -> Optional[Symbol]: - """Find the smallest symbol range that fully contains a docstring span.""" - candidates: List[Symbol] = [] - for symbol in symbols: - sym_start, sym_end = symbol.range 
- if sym_start <= start_line and end_line <= sym_end: - candidates.append(symbol) - if not candidates: - return None - return min(candidates, key=lambda s: (s.range[1] - s.range[0], s.range[0])) - - def chunk_file( - self, - content: str, - symbols: List[Symbol], - file_path: str | Path, - language: str, - symbol_token_counts: Optional[dict[str, int]] = None, - ) -> List[SemanticChunk]: - """Chunk file using hybrid strategy. - - Extracts docstrings first, then chunks remaining code. - - Args: - content: Source code content - symbols: List of extracted symbols - file_path: Path to source file - language: Programming language - symbol_token_counts: Optional dict mapping symbol names to token counts - """ - chunks: List[SemanticChunk] = [] - - # Step 1: Extract docstrings as dedicated chunks - docstrings: List[Tuple[str, int, int]] = [] - if language == "python": - # Fast path: avoid expensive docstring extraction if delimiters are absent. - if '"""' in content or "'''" in content: - docstrings = self.docstring_extractor.extract_docstrings(content, language) - elif language in {"javascript", "typescript"}: - if "/**" in content: - docstrings = self.docstring_extractor.extract_docstrings(content, language) - else: - docstrings = self.docstring_extractor.extract_docstrings(content, language) - - # Fast path: no docstrings -> delegate to base chunker directly. 
- if not docstrings: - if symbols: - base_chunks = self.base_chunker.chunk_by_symbol( - content, symbols, file_path, language, symbol_token_counts - ) - else: - base_chunks = self.base_chunker.chunk_sliding_window(content, file_path, language) - - for chunk in base_chunks: - chunk.metadata["strategy"] = "hybrid" - chunk.metadata["chunk_type"] = "code" - return base_chunks - - for docstring_content, start_line, end_line in docstrings: - if len(docstring_content.strip()) >= self.config.min_chunk_size: - parent_symbol = self._find_parent_symbol(start_line, end_line, symbols) - # Use base chunker's token estimation method - token_count = self.base_chunker._estimate_token_count(docstring_content) - metadata = { - "file": str(file_path), - "language": language, - "chunk_type": "docstring", - "start_line": start_line, - "end_line": end_line, - "strategy": "hybrid", - "token_count": token_count, - } - if parent_symbol is not None: - metadata["parent_symbol"] = parent_symbol.name - metadata["parent_symbol_kind"] = parent_symbol.kind - metadata["parent_symbol_range"] = parent_symbol.range - chunks.append(SemanticChunk( - content=docstring_content, - embedding=None, - metadata=metadata - )) - - # Step 2: Get line ranges occupied by docstrings - excluded_lines = self._get_excluded_line_ranges(docstrings) - - # Step 3: Filter symbols to exclude docstring-only ranges - filtered_symbols = self._filter_symbols_outside_docstrings(symbols, excluded_lines) - - # Step 4: Chunk remaining content using base chunker - if filtered_symbols: - base_chunks = self.base_chunker.chunk_by_symbol( - content, filtered_symbols, file_path, language, symbol_token_counts - ) - for chunk in base_chunks: - chunk.metadata["strategy"] = "hybrid" - chunk.metadata["chunk_type"] = "code" - chunks.append(chunk) - else: - lines = content.splitlines(keepends=True) - remaining_lines: List[str] = [] - - for i, line in enumerate(lines, start=1): - if i not in excluded_lines: - remaining_lines.append(line) - - if 
remaining_lines: - remaining_content = "".join(remaining_lines) - if len(remaining_content.strip()) >= self.config.min_chunk_size: - base_chunks = self.base_chunker.chunk_sliding_window( - remaining_content, file_path, language - ) - for chunk in base_chunks: - chunk.metadata["strategy"] = "hybrid" - chunk.metadata["chunk_type"] = "code" - chunks.append(chunk) - - return chunks diff --git a/codex-lens/src/codexlens/semantic/code_extractor.py b/codex-lens/src/codexlens/semantic/code_extractor.py deleted file mode 100644 index ec5b7211..00000000 --- a/codex-lens/src/codexlens/semantic/code_extractor.py +++ /dev/null @@ -1,274 +0,0 @@ -"""Smart code extraction for complete code blocks.""" - -from __future__ import annotations - -from pathlib import Path -from typing import List, Optional, Tuple - -from codexlens.entities import SearchResult, Symbol - - -def extract_complete_code_block( - result: SearchResult, - source_file_path: Optional[str] = None, - context_lines: int = 0, -) -> str: - """Extract complete code block from a search result. - - Args: - result: SearchResult from semantic search. - source_file_path: Optional path to source file for re-reading. - context_lines: Additional lines of context to include above/below. - - Returns: - Complete code block as string. 
- """ - # If we have full content stored, use it - if result.content: - if context_lines == 0: - return result.content - # Need to add context, read from file - - # Try to read from source file - file_path = source_file_path or result.path - if not file_path or not Path(file_path).exists(): - # Fall back to excerpt - return result.excerpt or "" - - try: - content = Path(file_path).read_text(encoding="utf-8", errors="ignore") - lines = content.splitlines() - - # Get line range - start_line = result.start_line or 1 - end_line = result.end_line or len(lines) - - # Add context - start_idx = max(0, start_line - 1 - context_lines) - end_idx = min(len(lines), end_line + context_lines) - - return "\n".join(lines[start_idx:end_idx]) - except Exception: - return result.excerpt or result.content or "" - - -def extract_symbol_with_context( - file_path: str, - symbol: Symbol, - include_docstring: bool = True, - include_decorators: bool = True, -) -> str: - """Extract a symbol (function/class) with its docstring and decorators. - - Args: - file_path: Path to source file. - symbol: Symbol to extract. - include_docstring: Include docstring if present. - include_decorators: Include decorators/annotations above symbol. - - Returns: - Complete symbol code with context. 
- """ - try: - content = Path(file_path).read_text(encoding="utf-8", errors="ignore") - lines = content.splitlines() - - start_line, end_line = symbol.range - start_idx = start_line - 1 - end_idx = end_line - - # Look for decorators above the symbol - if include_decorators and start_idx > 0: - decorator_start = start_idx - # Search backwards for decorators - i = start_idx - 1 - while i >= 0 and i >= start_idx - 20: # Look up to 20 lines back - line = lines[i].strip() - if line.startswith("@"): - decorator_start = i - i -= 1 - elif line == "" or line.startswith("#"): - # Skip empty lines and comments, continue looking - i -= 1 - elif line.startswith("//") or line.startswith("/*") or line.startswith("*"): - # JavaScript/Java style comments - decorator_start = i - i -= 1 - else: - # Found non-decorator, non-comment line, stop - break - start_idx = decorator_start - - return "\n".join(lines[start_idx:end_idx]) - except Exception: - return "" - - -def format_search_result_code( - result: SearchResult, - max_lines: Optional[int] = None, - show_line_numbers: bool = True, - highlight_match: bool = False, -) -> str: - """Format search result code for display. - - Args: - result: SearchResult to format. - max_lines: Maximum lines to show (None for all). - show_line_numbers: Include line numbers in output. - highlight_match: Add markers for matched region. - - Returns: - Formatted code string. - """ - content = result.content or result.excerpt or "" - if not content: - return "" - - lines = content.splitlines() - - # Truncate if needed - truncated = False - if max_lines and len(lines) > max_lines: - lines = lines[:max_lines] - truncated = True - - # Format with line numbers - if show_line_numbers: - start = result.start_line or 1 - formatted_lines = [] - for i, line in enumerate(lines): - line_num = start + i - formatted_lines.append(f"{line_num:4d} | {line}") - output = "\n".join(formatted_lines) - else: - output = "\n".join(lines) - - if truncated: - output += "\n... 
(truncated)" - - return output - - -def get_code_block_summary(result: SearchResult) -> str: - """Get a concise summary of a code block. - - Args: - result: SearchResult to summarize. - - Returns: - Summary string like "function hello_world (lines 10-25)" - """ - parts = [] - - if result.symbol_kind: - parts.append(result.symbol_kind) - - if result.symbol_name: - parts.append(f"`{result.symbol_name}`") - elif result.excerpt: - # Extract first meaningful identifier - first_line = result.excerpt.split("\n")[0][:50] - parts.append(f'"{first_line}..."') - - if result.start_line and result.end_line: - if result.start_line == result.end_line: - parts.append(f"(line {result.start_line})") - else: - parts.append(f"(lines {result.start_line}-{result.end_line})") - - if result.path: - file_name = Path(result.path).name - parts.append(f"in {file_name}") - - return " ".join(parts) if parts else "unknown code block" - - -class CodeBlockResult: - """Enhanced search result with complete code block.""" - - def __init__(self, result: SearchResult, source_path: Optional[str] = None): - self.result = result - self.source_path = source_path or result.path - self._full_code: Optional[str] = None - - @property - def score(self) -> float: - return self.result.score - - @property - def path(self) -> str: - return self.result.path - - @property - def file_name(self) -> str: - return Path(self.result.path).name - - @property - def symbol_name(self) -> Optional[str]: - return self.result.symbol_name - - @property - def symbol_kind(self) -> Optional[str]: - return self.result.symbol_kind - - @property - def line_range(self) -> Tuple[int, int]: - return ( - self.result.start_line or 1, - self.result.end_line or 1 - ) - - @property - def full_code(self) -> str: - """Get full code block content.""" - if self._full_code is None: - self._full_code = extract_complete_code_block(self.result, self.source_path) - return self._full_code - - @property - def excerpt(self) -> str: - """Get short 
excerpt.""" - return self.result.excerpt or "" - - @property - def summary(self) -> str: - """Get code block summary.""" - return get_code_block_summary(self.result) - - def format( - self, - max_lines: Optional[int] = None, - show_line_numbers: bool = True, - ) -> str: - """Format code for display.""" - # Use full code if available - display_result = SearchResult( - path=self.result.path, - score=self.result.score, - content=self.full_code, - start_line=self.result.start_line, - end_line=self.result.end_line, - ) - return format_search_result_code( - display_result, - max_lines=max_lines, - show_line_numbers=show_line_numbers - ) - - def __repr__(self) -> str: - return f"" - - -def enhance_search_results( - results: List[SearchResult], -) -> List[CodeBlockResult]: - """Enhance search results with complete code block access. - - Args: - results: List of SearchResult from semantic search. - - Returns: - List of CodeBlockResult with full code access. - """ - return [CodeBlockResult(r) for r in results] diff --git a/codex-lens/src/codexlens/semantic/embedder.py b/codex-lens/src/codexlens/semantic/embedder.py deleted file mode 100644 index e2d21717..00000000 --- a/codex-lens/src/codexlens/semantic/embedder.py +++ /dev/null @@ -1,288 +0,0 @@ -"""Embedder for semantic code search using fastembed. - -Supports GPU acceleration via ONNX execution providers (CUDA, TensorRT, DirectML, ROCm, CoreML). -GPU acceleration is automatic when available, with transparent CPU fallback. -""" - -from __future__ import annotations - -import gc -import logging -import threading -from typing import Dict, Iterable, List, Optional - -import numpy as np - -from . 
import SEMANTIC_AVAILABLE -from .base import BaseEmbedder -from .gpu_support import get_optimal_providers, is_gpu_available, get_gpu_summary, get_selected_device_id - -logger = logging.getLogger(__name__) - -# Global embedder cache for singleton pattern -_embedder_cache: Dict[str, "Embedder"] = {} -_cache_lock = threading.RLock() - - -def get_embedder(profile: str = "code", use_gpu: bool = True) -> "Embedder": - """Get or create a cached Embedder instance (thread-safe singleton). - - This function provides significant performance improvement by reusing - Embedder instances across multiple searches, avoiding repeated model - loading overhead (~0.8s per load). - - Args: - profile: Model profile ("fast", "code", "multilingual", "balanced") - use_gpu: If True, use GPU acceleration when available (default: True) - - Returns: - Cached Embedder instance for the given profile - """ - global _embedder_cache - - # Cache key includes GPU preference to support mixed configurations - cache_key = f"{profile}:{'gpu' if use_gpu else 'cpu'}" - - # All cache access is protected by _cache_lock to avoid races with - # clear_embedder_cache() during concurrent access. - with _cache_lock: - embedder = _embedder_cache.get(cache_key) - if embedder is not None: - return embedder - - # Create new embedder and cache it - embedder = Embedder(profile=profile, use_gpu=use_gpu) - # Pre-load model to ensure it's ready - embedder._load_model() - _embedder_cache[cache_key] = embedder - - # Log GPU status on first embedder creation - if use_gpu and is_gpu_available(): - logger.info(f"Embedder initialized with GPU: {get_gpu_summary()}") - elif use_gpu: - logger.debug("GPU not available, using CPU for embeddings") - - return embedder - - -def clear_embedder_cache() -> None: - """Clear the embedder cache and release ONNX resources. - - This method ensures proper cleanup of ONNX model resources to prevent - memory leaks when embedders are no longer needed. 
- """ - global _embedder_cache - with _cache_lock: - # Release ONNX resources before clearing cache - for embedder in _embedder_cache.values(): - if embedder._model is not None: - del embedder._model - embedder._model = None - _embedder_cache.clear() - gc.collect() - - -class Embedder(BaseEmbedder): - """Generate embeddings for code chunks using fastembed (ONNX-based). - - Supported Model Profiles: - - fast: BAAI/bge-small-en-v1.5 (384 dim) - Fast, lightweight, English-optimized - - code: jinaai/jina-embeddings-v2-base-code (768 dim) - Code-optimized, best for programming languages - - multilingual: intfloat/multilingual-e5-large (1024 dim) - Multilingual + code support - - balanced: mixedbread-ai/mxbai-embed-large-v1 (1024 dim) - High accuracy, general purpose - """ - - # Model profiles for different use cases - MODELS = { - "fast": "BAAI/bge-small-en-v1.5", # 384 dim - Fast, lightweight - "code": "jinaai/jina-embeddings-v2-base-code", # 768 dim - Code-optimized - "multilingual": "intfloat/multilingual-e5-large", # 1024 dim - Multilingual - "balanced": "mixedbread-ai/mxbai-embed-large-v1", # 1024 dim - High accuracy - } - - # Dimension mapping for each model - MODEL_DIMS = { - "BAAI/bge-small-en-v1.5": 384, - "jinaai/jina-embeddings-v2-base-code": 768, - "intfloat/multilingual-e5-large": 1024, - "mixedbread-ai/mxbai-embed-large-v1": 1024, - } - - # Default model (fast profile) - DEFAULT_MODEL = "BAAI/bge-small-en-v1.5" - DEFAULT_PROFILE = "fast" - - def __init__( - self, - model_name: str | None = None, - profile: str | None = None, - use_gpu: bool = True, - providers: List[str] | None = None, - ) -> None: - """Initialize embedder with model or profile. - - Args: - model_name: Explicit model name (e.g., "jinaai/jina-embeddings-v2-base-code") - profile: Model profile shortcut ("fast", "code", "multilingual", "balanced") - If both provided, model_name takes precedence. 
- use_gpu: If True, use GPU acceleration when available (default: True) - providers: Explicit ONNX providers list (overrides use_gpu if provided) - """ - if not SEMANTIC_AVAILABLE: - raise ImportError( - "Semantic search dependencies not available. " - "Install with: pip install codexlens[semantic]" - ) - - # Resolve model name from profile or use explicit name - if model_name: - self._model_name = model_name - elif profile and profile in self.MODELS: - self._model_name = self.MODELS[profile] - else: - self._model_name = self.DEFAULT_MODEL - - # Configure ONNX execution providers with device_id options for GPU selection - # Using with_device_options=True ensures DirectML/CUDA device_id is passed correctly - if providers is not None: - self._providers = providers - else: - self._providers = get_optimal_providers(use_gpu=use_gpu, with_device_options=True) - - self._use_gpu = use_gpu - self._model = None - - @property - def model_name(self) -> str: - """Get model name.""" - return self._model_name - - @property - def embedding_dim(self) -> int: - """Get embedding dimension for current model.""" - return self.MODEL_DIMS.get(self._model_name, 768) # Default to 768 if unknown - - @property - def max_tokens(self) -> int: - """Get maximum token limit for current model. - - Returns: - int: Maximum number of tokens based on model profile. 
- - fast: 512 (lightweight, optimized for speed) - - code: 8192 (code-optimized, larger context) - - multilingual: 512 (standard multilingual model) - - balanced: 512 (general purpose) - """ - # Determine profile from model name - profile = None - for prof, model in self.MODELS.items(): - if model == self._model_name: - profile = prof - break - - # Return token limit based on profile - if profile == "code": - return 8192 - elif profile in ("fast", "multilingual", "balanced"): - return 512 - else: - # Default for unknown models - return 512 - - @property - def providers(self) -> List[str]: - """Get configured ONNX execution providers.""" - return self._providers - - @property - def is_gpu_enabled(self) -> bool: - """Check if GPU acceleration is enabled for this embedder.""" - gpu_providers = {"CUDAExecutionProvider", "TensorrtExecutionProvider", - "DmlExecutionProvider", "ROCMExecutionProvider", "CoreMLExecutionProvider"} - # Handle both string providers and tuple providers (name, options) - for p in self._providers: - provider_name = p[0] if isinstance(p, tuple) else p - if provider_name in gpu_providers: - return True - return False - - def _load_model(self) -> None: - """Lazy load the embedding model with configured providers.""" - if self._model is not None: - return - - from fastembed import TextEmbedding - - # providers already include device_id options via get_optimal_providers(with_device_options=True) - # DO NOT pass device_ids separately - fastembed ignores it when providers is specified - # See: fastembed/text/onnx_embedding.py - device_ids is only used with cuda=True - try: - self._model = TextEmbedding( - model_name=self.model_name, - providers=self._providers, - ) - logger.debug(f"Model loaded with providers: {self._providers}") - except TypeError: - # Fallback for older fastembed versions without providers parameter - logger.warning( - "fastembed version doesn't support 'providers' parameter. 
" - "Upgrade fastembed for GPU acceleration: pip install --upgrade fastembed" - ) - self._model = TextEmbedding(model_name=self.model_name) - - def embed(self, texts: str | Iterable[str]) -> List[List[float]]: - """Generate embeddings for one or more texts. - - Args: - texts: Single text or iterable of texts to embed. - - Returns: - List of embedding vectors (each is a list of floats). - - Note: - This method converts numpy arrays to Python lists for backward compatibility. - For memory-efficient processing, use embed_to_numpy() instead. - """ - self._load_model() - - if isinstance(texts, str): - texts = [texts] - else: - texts = list(texts) - - embeddings = list(self._model.embed(texts)) - return [emb.tolist() for emb in embeddings] - - def embed_to_numpy(self, texts: str | Iterable[str], batch_size: Optional[int] = None) -> np.ndarray: - """Generate embeddings for one or more texts (returns numpy arrays). - - This method is more memory-efficient than embed() as it avoids converting - numpy arrays to Python lists, which can significantly reduce memory usage - during batch processing. - - Args: - texts: Single text or iterable of texts to embed. - batch_size: Optional batch size for fastembed processing. - Larger values improve GPU utilization but use more memory. - - Returns: - numpy.ndarray of shape (n_texts, embedding_dim) containing embeddings. 
- """ - self._load_model() - - if isinstance(texts, str): - texts = [texts] - else: - texts = list(texts) - - # Pass batch_size to fastembed for optimal GPU utilization - # Default batch_size in fastembed is 256, but larger values can improve throughput - if batch_size is not None: - embeddings = list(self._model.embed(texts, batch_size=batch_size)) - else: - embeddings = list(self._model.embed(texts)) - return np.array(embeddings) - - def embed_single(self, text: str) -> List[float]: - """Generate embedding for a single text.""" - return self.embed(text)[0] diff --git a/codex-lens/src/codexlens/semantic/factory.py b/codex-lens/src/codexlens/semantic/factory.py deleted file mode 100644 index 3295eba8..00000000 --- a/codex-lens/src/codexlens/semantic/factory.py +++ /dev/null @@ -1,158 +0,0 @@ -"""Factory for creating embedders. - -Provides a unified interface for instantiating different embedder backends. -Includes caching to avoid repeated model loading overhead. -""" - -from __future__ import annotations - -import logging -import threading -from typing import Any, Dict, List, Optional - -from .base import BaseEmbedder - -# Module-level cache for embedder instances -# Key: (backend, profile, model, use_gpu) -> embedder instance -_embedder_cache: Dict[tuple, BaseEmbedder] = {} -_cache_lock = threading.Lock() -_logger = logging.getLogger(__name__) - - -def get_embedder( - backend: str = "fastembed", - profile: str = "code", - model: str = "default", - use_gpu: bool = True, - endpoints: Optional[List[Dict[str, Any]]] = None, - strategy: str = "latency_aware", - cooldown: float = 60.0, - **kwargs: Any, -) -> BaseEmbedder: - """Factory function to create embedder based on backend. - - Args: - backend: Embedder backend to use. Options: - - "fastembed": Use fastembed (ONNX-based) embedder (default) - - "litellm": Use ccw-litellm embedder - profile: Model profile for fastembed backend ("fast", "code", "multilingual", "balanced") - Used only when backend="fastembed". 
Default: "code" - model: Model identifier for litellm backend. - Used only when backend="litellm". Default: "default" - use_gpu: Whether to use GPU acceleration when available (default: True). - Used only when backend="fastembed". - endpoints: Optional list of endpoint configurations for multi-endpoint load balancing. - Each endpoint is a dict with keys: model, api_key, api_base, weight. - Used only when backend="litellm" and multiple endpoints provided. - strategy: Selection strategy for multi-endpoint mode: - "round_robin", "latency_aware", "weighted_random". - Default: "latency_aware" - cooldown: Default cooldown seconds for rate-limited endpoints (default: 60.0) - **kwargs: Additional backend-specific arguments - - Returns: - BaseEmbedder: Configured embedder instance - - Raises: - ValueError: If backend is not recognized - ImportError: If required backend dependencies are not installed - - Examples: - Create fastembed embedder with code profile: - >>> embedder = get_embedder(backend="fastembed", profile="code") - - Create fastembed embedder with fast profile and CPU only: - >>> embedder = get_embedder(backend="fastembed", profile="fast", use_gpu=False) - - Create litellm embedder: - >>> embedder = get_embedder(backend="litellm", model="text-embedding-3-small") - - Create rotational embedder with multiple endpoints: - >>> endpoints = [ - ... {"model": "openai/text-embedding-3-small", "api_key": "sk-..."}, - ... {"model": "azure/my-embedding", "api_base": "https://...", "api_key": "..."}, - ... 
] - >>> embedder = get_embedder(backend="litellm", endpoints=endpoints) - """ - # Build cache key from immutable configuration - if backend == "fastembed": - cache_key = ("fastembed", profile, None, use_gpu) - elif backend == "litellm": - # For litellm, use model as part of cache key - # Multi-endpoint mode is not cached as it's more complex - if endpoints and len(endpoints) > 1: - cache_key = None # Skip cache for multi-endpoint - else: - effective_model = endpoints[0]["model"] if endpoints else model - cache_key = ("litellm", None, effective_model, None) - else: - cache_key = None - - # Check cache first (thread-safe) - if cache_key is not None: - with _cache_lock: - if cache_key in _embedder_cache: - _logger.debug("Returning cached embedder for %s", cache_key) - return _embedder_cache[cache_key] - - # Create new embedder instance - embedder: Optional[BaseEmbedder] = None - - if backend == "fastembed": - from .embedder import Embedder - embedder = Embedder(profile=profile, use_gpu=use_gpu, **kwargs) - elif backend == "litellm": - # Check if multi-endpoint mode is requested - if endpoints and len(endpoints) > 1: - from .rotational_embedder import create_rotational_embedder - # Multi-endpoint is not cached - return create_rotational_embedder( - endpoints_config=endpoints, - strategy=strategy, - default_cooldown=cooldown, - ) - elif endpoints and len(endpoints) == 1: - # Single endpoint in list - use it directly - ep = endpoints[0] - ep_kwargs = {**kwargs} - if "api_key" in ep: - ep_kwargs["api_key"] = ep["api_key"] - if "api_base" in ep: - ep_kwargs["api_base"] = ep["api_base"] - from .litellm_embedder import LiteLLMEmbedderWrapper - embedder = LiteLLMEmbedderWrapper(model=ep["model"], **ep_kwargs) - else: - # No endpoints list - use model parameter - from .litellm_embedder import LiteLLMEmbedderWrapper - embedder = LiteLLMEmbedderWrapper(model=model, **kwargs) - else: - raise ValueError( - f"Unknown backend: {backend}. 
" - f"Supported backends: 'fastembed', 'litellm'" - ) - - # Cache the embedder for future use (thread-safe) - if cache_key is not None and embedder is not None: - with _cache_lock: - # Double-check to avoid race condition - if cache_key not in _embedder_cache: - _embedder_cache[cache_key] = embedder - _logger.debug("Cached new embedder for %s", cache_key) - else: - # Another thread created it already, use that one - embedder = _embedder_cache[cache_key] - - return embedder # type: ignore - - -def clear_embedder_cache() -> int: - """Clear the embedder cache. - - Returns: - Number of embedders cleared from cache - """ - with _cache_lock: - count = len(_embedder_cache) - _embedder_cache.clear() - _logger.debug("Cleared %d embedders from cache", count) - return count diff --git a/codex-lens/src/codexlens/semantic/gpu_support.py b/codex-lens/src/codexlens/semantic/gpu_support.py deleted file mode 100644 index 62a5186d..00000000 --- a/codex-lens/src/codexlens/semantic/gpu_support.py +++ /dev/null @@ -1,431 +0,0 @@ -"""GPU acceleration support for semantic embeddings. - -This module provides GPU detection, initialization, and fallback handling -for ONNX-based embedding generation. 
-""" - -from __future__ import annotations - -import logging -from dataclasses import dataclass -from typing import List, Optional - -logger = logging.getLogger(__name__) - - -@dataclass -class GPUDevice: - """Individual GPU device info.""" - device_id: int - name: str - is_discrete: bool # True for discrete GPU (NVIDIA, AMD), False for integrated (Intel UHD) - vendor: str # "nvidia", "amd", "intel", "unknown" - - -@dataclass -class GPUInfo: - """GPU availability and configuration info.""" - - gpu_available: bool = False - cuda_available: bool = False - gpu_count: int = 0 - gpu_name: Optional[str] = None - onnx_providers: List[str] = None - devices: List[GPUDevice] = None # List of detected GPU devices - preferred_device_id: Optional[int] = None # Preferred GPU for embedding - - def __post_init__(self): - if self.onnx_providers is None: - self.onnx_providers = ["CPUExecutionProvider"] - if self.devices is None: - self.devices = [] - - -_gpu_info_cache: Optional[GPUInfo] = None - - -def _enumerate_gpus() -> List[GPUDevice]: - """Enumerate available GPU devices using WMI on Windows. - - Returns: - List of GPUDevice with device info, ordered by device_id. 
- """ - devices = [] - - try: - import subprocess - import sys - - if sys.platform == "win32": - # Use PowerShell to query GPU information via WMI - cmd = [ - "powershell", "-NoProfile", "-Command", - "Get-WmiObject Win32_VideoController | Select-Object DeviceID, Name, AdapterCompatibility | ConvertTo-Json" - ] - result = subprocess.run(cmd, capture_output=True, text=True, timeout=10) - - if result.returncode == 0 and result.stdout.strip(): - import json - gpu_data = json.loads(result.stdout) - - # Handle single GPU case (returns dict instead of list) - if isinstance(gpu_data, dict): - gpu_data = [gpu_data] - - for idx, gpu in enumerate(gpu_data): - name = gpu.get("Name", "Unknown GPU") - compat = gpu.get("AdapterCompatibility", "").lower() - - # Determine vendor - name_lower = name.lower() - if "nvidia" in name_lower or "nvidia" in compat: - vendor = "nvidia" - is_discrete = True - elif "amd" in name_lower or "radeon" in name_lower or "amd" in compat: - vendor = "amd" - is_discrete = True - elif "intel" in name_lower or "intel" in compat: - vendor = "intel" - # Intel UHD/Iris are integrated, Intel Arc is discrete - is_discrete = "arc" in name_lower - else: - vendor = "unknown" - is_discrete = False - - devices.append(GPUDevice( - device_id=idx, - name=name, - is_discrete=is_discrete, - vendor=vendor - )) - logger.debug(f"Detected GPU {idx}: {name} (vendor={vendor}, discrete={is_discrete})") - - except Exception as e: - logger.debug(f"GPU enumeration failed: {e}") - - return devices - - -def _get_preferred_device_id(devices: List[GPUDevice]) -> Optional[int]: - """Determine the preferred GPU device_id for embedding. - - Preference order: - 1. NVIDIA discrete GPU (best DirectML/CUDA support) - 2. AMD discrete GPU - 3. Intel Arc (discrete) - 4. Intel integrated (fallback) - - Returns: - device_id of preferred GPU, or None to use default. 
- """ - if not devices: - return None - - # Priority: NVIDIA > AMD > Intel Arc > Intel integrated - priority_order = [ - ("nvidia", True), # NVIDIA discrete - ("amd", True), # AMD discrete - ("intel", True), # Intel Arc (discrete) - ("intel", False), # Intel integrated (fallback) - ] - - for target_vendor, target_discrete in priority_order: - for device in devices: - if device.vendor == target_vendor and device.is_discrete == target_discrete: - logger.info(f"Preferred GPU: {device.name} (device_id={device.device_id})") - return device.device_id - - # If no match, use first device - if devices: - return devices[0].device_id - - return None - - -def detect_gpu(force_refresh: bool = False) -> GPUInfo: - """Detect available GPU resources for embedding acceleration. - - Args: - force_refresh: If True, re-detect GPU even if cached. - - Returns: - GPUInfo with detection results. - """ - global _gpu_info_cache - - if _gpu_info_cache is not None and not force_refresh: - return _gpu_info_cache - - info = GPUInfo() - - # Enumerate GPU devices first - info.devices = _enumerate_gpus() - info.gpu_count = len(info.devices) - if info.devices: - # Set preferred device (discrete GPU preferred over integrated) - info.preferred_device_id = _get_preferred_device_id(info.devices) - # Set gpu_name to preferred device name - for dev in info.devices: - if dev.device_id == info.preferred_device_id: - info.gpu_name = dev.name - break - - # Check PyTorch CUDA availability (most reliable detection) - try: - import torch - if torch.cuda.is_available(): - info.cuda_available = True - info.gpu_available = True - info.gpu_count = torch.cuda.device_count() - if info.gpu_count > 0: - info.gpu_name = torch.cuda.get_device_name(0) - logger.debug(f"PyTorch CUDA detected: {info.gpu_count} GPU(s)") - except ImportError: - logger.debug("PyTorch not available for GPU detection") - - # Check ONNX Runtime providers with validation - try: - import onnxruntime as ort - available_providers = 
ort.get_available_providers() - - # Build provider list with priority order - providers = [] - - # Test each provider to ensure it actually works - def test_provider(provider_name: str) -> bool: - """Test if a provider actually works by creating a dummy session.""" - try: - # Create a minimal ONNX model to test provider - import numpy as np - # Simple test: just check if provider can be instantiated - sess_options = ort.SessionOptions() - sess_options.log_severity_level = 4 # Suppress warnings - return True - except Exception: - return False - - # CUDA provider (NVIDIA GPU) - check if CUDA runtime is available - if "CUDAExecutionProvider" in available_providers: - # Verify CUDA is actually usable by checking for cuBLAS - cuda_works = False - try: - import ctypes - # Try to load cuBLAS to verify CUDA installation - try: - ctypes.CDLL("cublas64_12.dll") - cuda_works = True - except OSError: - try: - ctypes.CDLL("cublas64_11.dll") - cuda_works = True - except OSError: - pass - except Exception: - pass - - if cuda_works: - providers.append("CUDAExecutionProvider") - info.gpu_available = True - logger.debug("ONNX CUDAExecutionProvider available and working") - else: - logger.debug("ONNX CUDAExecutionProvider listed but CUDA runtime not found") - - # TensorRT provider (optimized NVIDIA inference) - if "TensorrtExecutionProvider" in available_providers: - # TensorRT requires additional libraries, skip for now - logger.debug("ONNX TensorrtExecutionProvider available (requires TensorRT SDK)") - - # DirectML provider (Windows GPU - AMD/Intel/NVIDIA) - if "DmlExecutionProvider" in available_providers: - providers.append("DmlExecutionProvider") - info.gpu_available = True - logger.debug("ONNX DmlExecutionProvider available (DirectML)") - - # ROCm provider (AMD GPU on Linux) - if "ROCMExecutionProvider" in available_providers: - providers.append("ROCMExecutionProvider") - info.gpu_available = True - logger.debug("ONNX ROCMExecutionProvider available (AMD)") - - # CoreML 
provider (Apple Silicon) - if "CoreMLExecutionProvider" in available_providers: - providers.append("CoreMLExecutionProvider") - info.gpu_available = True - logger.debug("ONNX CoreMLExecutionProvider available (Apple)") - - # Always include CPU as fallback - providers.append("CPUExecutionProvider") - - info.onnx_providers = providers - - except ImportError: - logger.debug("ONNX Runtime not available") - info.onnx_providers = ["CPUExecutionProvider"] - - _gpu_info_cache = info - return info - - -def get_optimal_providers(use_gpu: bool = True, with_device_options: bool = False) -> list: - """Get optimal ONNX execution providers based on availability. - - Args: - use_gpu: If True, include GPU providers when available. - If False, force CPU-only execution. - with_device_options: If True, return providers as tuples with device_id options - for proper GPU device selection (required for DirectML). - - Returns: - List of provider names or tuples (provider_name, options_dict) in priority order. - """ - if not use_gpu: - return ["CPUExecutionProvider"] - - gpu_info = detect_gpu() - - # Check if GPU was requested but not available - log warning - if not gpu_info.gpu_available: - try: - import onnxruntime as ort - available_providers = ort.get_available_providers() - except ImportError: - available_providers = [] - logger.warning( - "GPU acceleration was requested, but no supported GPU provider (CUDA, DirectML) " - f"was found. Available providers: {available_providers}. Falling back to CPU." 
- ) - else: - # Log which GPU provider is being used - gpu_providers = [p for p in gpu_info.onnx_providers if p != "CPUExecutionProvider"] - if gpu_providers: - logger.info(f"Using {gpu_providers[0]} for ONNX GPU acceleration") - - if not with_device_options: - return gpu_info.onnx_providers - - # Build providers with device_id options for GPU providers - device_id = get_selected_device_id() - providers = [] - - for provider in gpu_info.onnx_providers: - if provider == "DmlExecutionProvider" and device_id is not None: - # DirectML requires device_id in provider_options tuple - providers.append(("DmlExecutionProvider", {"device_id": device_id})) - logger.debug(f"DmlExecutionProvider configured with device_id={device_id}") - elif provider == "CUDAExecutionProvider" and device_id is not None: - # CUDA also supports device_id in provider_options - providers.append(("CUDAExecutionProvider", {"device_id": device_id})) - logger.debug(f"CUDAExecutionProvider configured with device_id={device_id}") - elif provider == "ROCMExecutionProvider" and device_id is not None: - # ROCm supports device_id - providers.append(("ROCMExecutionProvider", {"device_id": device_id})) - logger.debug(f"ROCMExecutionProvider configured with device_id={device_id}") - else: - # CPU and other providers don't need device_id - providers.append(provider) - - return providers - - -def is_gpu_available() -> bool: - """Check if any GPU acceleration is available.""" - return detect_gpu().gpu_available - - -def get_gpu_summary() -> str: - """Get human-readable GPU status summary.""" - info = detect_gpu() - - if not info.gpu_available: - return "GPU: Not available (using CPU)" - - parts = [] - if info.gpu_name: - parts.append(f"GPU: {info.gpu_name}") - if info.gpu_count > 1: - parts.append(f"({info.gpu_count} devices)") - - # Show active providers (excluding CPU fallback) - gpu_providers = [p for p in info.onnx_providers if p != "CPUExecutionProvider"] - if gpu_providers: - parts.append(f"Providers: {', 
'.join(gpu_providers)}") - - return " | ".join(parts) if parts else "GPU: Available" - - -def clear_gpu_cache() -> None: - """Clear cached GPU detection info.""" - global _gpu_info_cache - _gpu_info_cache = None - - -# User-selected device ID (overrides auto-detection) -_selected_device_id: Optional[int] = None - - -def get_gpu_devices() -> List[dict]: - """Get list of available GPU devices for frontend selection. - - Returns: - List of dicts with device info for each GPU. - """ - info = detect_gpu() - devices = [] - - for dev in info.devices: - devices.append({ - "device_id": dev.device_id, - "name": dev.name, - "vendor": dev.vendor, - "is_discrete": dev.is_discrete, - "is_preferred": dev.device_id == info.preferred_device_id, - "is_selected": dev.device_id == get_selected_device_id(), - }) - - return devices - - -def get_selected_device_id() -> Optional[int]: - """Get the user-selected GPU device_id. - - Returns: - User-selected device_id, or auto-detected preferred device_id if not set. - """ - global _selected_device_id - - if _selected_device_id is not None: - return _selected_device_id - - # Fall back to auto-detected preferred device - info = detect_gpu() - return info.preferred_device_id - - -def set_selected_device_id(device_id: Optional[int]) -> bool: - """Set the GPU device_id to use for embeddings. - - Args: - device_id: GPU device_id to use, or None to use auto-detection. - - Returns: - True if device_id is valid, False otherwise. 
- """ - global _selected_device_id - - if device_id is None: - _selected_device_id = None - logger.info("GPU selection reset to auto-detection") - return True - - # Validate device_id exists - info = detect_gpu() - valid_ids = [dev.device_id for dev in info.devices] - - if device_id in valid_ids: - _selected_device_id = device_id - device_name = next((dev.name for dev in info.devices if dev.device_id == device_id), "Unknown") - logger.info(f"GPU selection set to device {device_id}: {device_name}") - return True - else: - logger.warning(f"Invalid device_id {device_id}. Valid IDs: {valid_ids}") - return False diff --git a/codex-lens/src/codexlens/semantic/litellm_embedder.py b/codex-lens/src/codexlens/semantic/litellm_embedder.py deleted file mode 100644 index ee4284dd..00000000 --- a/codex-lens/src/codexlens/semantic/litellm_embedder.py +++ /dev/null @@ -1,144 +0,0 @@ -"""LiteLLM embedder wrapper for CodexLens. - -Provides integration with ccw-litellm's LiteLLMEmbedder for embedding generation. -""" - -from __future__ import annotations - -from typing import Iterable - -import numpy as np - -from .base import BaseEmbedder - - -class LiteLLMEmbedderWrapper(BaseEmbedder): - """Wrapper for ccw-litellm LiteLLMEmbedder. - - This wrapper adapts the ccw-litellm LiteLLMEmbedder to the CodexLens - BaseEmbedder interface, enabling seamless integration with CodexLens - semantic search functionality. - - Args: - model: Model identifier for LiteLLM (default: "default") - **kwargs: Additional arguments passed to LiteLLMEmbedder - - Raises: - ImportError: If ccw-litellm package is not installed - """ - - def __init__(self, model: str = "default", **kwargs) -> None: - """Initialize LiteLLM embedder wrapper. 
- - Args: - model: Model identifier for LiteLLM (default: "default") - **kwargs: Additional arguments passed to LiteLLMEmbedder - - Raises: - ImportError: If ccw-litellm package is not installed - """ - try: - from ccw_litellm import LiteLLMEmbedder - self._embedder = LiteLLMEmbedder(model=model, **kwargs) - except ImportError as e: - raise ImportError( - "ccw-litellm not installed. Install with: pip install ccw-litellm" - ) from e - - @property - def embedding_dim(self) -> int: - """Return embedding dimensions from LiteLLMEmbedder. - - Returns: - int: Dimension of the embedding vectors. - """ - return self._embedder.dimensions - - @property - def model_name(self) -> str: - """Return model name from LiteLLMEmbedder. - - Returns: - str: Name or identifier of the underlying model. - """ - return self._embedder.model_name - - @property - def max_tokens(self) -> int: - """Return maximum token limit for the embedding model. - - Returns: - int: Maximum number of tokens that can be embedded at once. - Reads from LiteLLM config's max_input_tokens property. - """ - # Get from LiteLLM embedder's max_input_tokens property (now exposed) - if hasattr(self._embedder, 'max_input_tokens'): - return self._embedder.max_input_tokens - - # Fallback: infer from model name - model_name_lower = self.model_name.lower() - - # Large models (8B or "large" in name) - if '8b' in model_name_lower or 'large' in model_name_lower: - return 32768 - - # OpenAI text-embedding-3-* models - if 'text-embedding-3' in model_name_lower: - return 8191 - - # Default fallback - return 8192 - - def _sanitize_text(self, text: str) -> str: - """Sanitize text to work around ModelScope API routing bug. - - ModelScope incorrectly routes text starting with lowercase 'import' - to an Ollama endpoint, causing failures. This adds a leading space - to work around the issue without affecting embedding quality. - - Args: - text: Text to sanitize. - - Returns: - Sanitized text safe for embedding API. 
- """ - if text.startswith('import'): - return ' ' + text - return text - - def embed_to_numpy(self, texts: str | Iterable[str], **kwargs) -> np.ndarray: - """Embed texts to numpy array using LiteLLMEmbedder. - - Args: - texts: Single text or iterable of texts to embed. - **kwargs: Additional arguments (ignored for LiteLLM backend). - Accepts batch_size for API compatibility with fastembed. - - Returns: - numpy.ndarray: Array of shape (n_texts, embedding_dim) containing embeddings. - """ - if isinstance(texts, str): - texts = [texts] - else: - texts = list(texts) - - # Sanitize texts to avoid ModelScope routing bug - texts = [self._sanitize_text(t) for t in texts] - - # LiteLLM handles batching internally, ignore batch_size parameter - return self._embedder.embed(texts) - - def embed_single(self, text: str) -> list[float]: - """Generate embedding for a single text. - - Args: - text: Text to embed. - - Returns: - list[float]: Embedding vector as a list of floats. - """ - # Sanitize text before embedding - sanitized = self._sanitize_text(text) - embedding = self._embedder.embed([sanitized]) - return embedding[0].tolist() - diff --git a/codex-lens/src/codexlens/semantic/reranker/__init__.py b/codex-lens/src/codexlens/semantic/reranker/__init__.py deleted file mode 100644 index e52b0223..00000000 --- a/codex-lens/src/codexlens/semantic/reranker/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -"""Reranker backends for second-stage search ranking. - -This subpackage provides a unified interface and factory for different reranking -implementations (e.g., ONNX, API-based, LiteLLM, and legacy sentence-transformers). 
-""" - -from __future__ import annotations - -from .base import BaseReranker -from .factory import check_reranker_available, get_reranker -from .fastembed_reranker import FastEmbedReranker, check_fastembed_reranker_available -from .legacy import CrossEncoderReranker, check_cross_encoder_available -from .onnx_reranker import ONNXReranker, check_onnx_reranker_available - -__all__ = [ - "BaseReranker", - "check_reranker_available", - "get_reranker", - "CrossEncoderReranker", - "check_cross_encoder_available", - "FastEmbedReranker", - "check_fastembed_reranker_available", - "ONNXReranker", - "check_onnx_reranker_available", -] diff --git a/codex-lens/src/codexlens/semantic/reranker/api_reranker.py b/codex-lens/src/codexlens/semantic/reranker/api_reranker.py deleted file mode 100644 index d4dcc968..00000000 --- a/codex-lens/src/codexlens/semantic/reranker/api_reranker.py +++ /dev/null @@ -1,442 +0,0 @@ -"""API-based reranker using a remote HTTP provider. - -Supported providers: -- SiliconFlow: https://api.siliconflow.cn/v1/rerank -- Cohere: https://api.cohere.ai/v1/rerank -- Jina: https://api.jina.ai/v1/rerank -""" - -from __future__ import annotations - -import logging -import os -import random -import time -from pathlib import Path -from typing import Any, Mapping, Sequence - -from .base import BaseReranker - -logger = logging.getLogger(__name__) - -_DEFAULT_ENV_API_KEY = "RERANKER_API_KEY" - - -def _normalize_api_base_for_endpoint(*, api_base: str, endpoint: str) -> str: - """Normalize api_base to avoid duplicated version paths (e.g. /v1/v1/...). - - httpx joins base_url paths with request paths even when the request path - starts with a leading slash. This means: - - base_url="https://host/v1" + endpoint="/v1/rerank" - -> "https://host/v1/v1/rerank" - - Many users configure OpenAI-style bases with a trailing "/v1", so we - defensively strip that suffix when the endpoint already includes "/v1/". 
- """ - cleaned = (api_base or "").strip().rstrip("/") - if not cleaned: - return cleaned - - endpoint_clean = endpoint or "" - - # If api_base already includes the endpoint suffix (e.g. api_base ends with "/v1/rerank"), - # strip it so we don't end up with ".../v1/rerank/v1/rerank". - if endpoint_clean.startswith("/") and cleaned.lower().endswith(endpoint_clean.lower()): - return cleaned[: -len(endpoint_clean)] - - # Strip a trailing "/v1" if endpoint already includes "/v1/...". - if endpoint_clean.startswith("/v1/") and cleaned.lower().endswith("/v1"): - return cleaned[:-3] - - return cleaned - - -def _get_env_with_fallback(key: str, workspace_root: Path | None = None) -> str | None: - """Get environment variable with .env file fallback.""" - # Check os.environ first - if key in os.environ: - return os.environ[key] - prefixed_key = f"CODEXLENS_{key}" - if prefixed_key in os.environ: - return os.environ[prefixed_key] - - # Try loading from .env files - try: - from codexlens.env_config import get_env - value = get_env(key, workspace_root=workspace_root) - if value is not None: - return value - return get_env(prefixed_key, workspace_root=workspace_root) - except ImportError: - return None - - -def check_httpx_available() -> tuple[bool, str | None]: - try: - import httpx # noqa: F401 - except ImportError as exc: # pragma: no cover - optional dependency - return False, f"httpx not available: {exc}. 
Install with: pip install httpx" - return True, None - - -class APIReranker(BaseReranker): - """Reranker backed by a remote reranking HTTP API.""" - - _PROVIDER_DEFAULTS: Mapping[str, Mapping[str, str]] = { - "siliconflow": { - "api_base": "https://api.siliconflow.cn", - "endpoint": "/v1/rerank", - "default_model": "BAAI/bge-reranker-v2-m3", - }, - "cohere": { - "api_base": "https://api.cohere.ai", - "endpoint": "/v1/rerank", - "default_model": "rerank-english-v3.0", - }, - "jina": { - "api_base": "https://api.jina.ai", - "endpoint": "/v1/rerank", - "default_model": "jina-reranker-v2-base-multilingual", - }, - } - - def __init__( - self, - *, - provider: str = "siliconflow", - model_name: str | None = None, - api_key: str | None = None, - api_base: str | None = None, - timeout: float = 30.0, - max_retries: int = 3, - backoff_base_s: float = 0.5, - backoff_max_s: float = 8.0, - env_api_key: str = _DEFAULT_ENV_API_KEY, - workspace_root: Path | str | None = None, - max_input_tokens: int | None = None, - ) -> None: - ok, err = check_httpx_available() - if not ok: # pragma: no cover - exercised via factory availability tests - raise ImportError(err) - - import httpx - - self._workspace_root = Path(workspace_root) if workspace_root else None - - self.provider = (provider or "").strip().lower() - if self.provider not in self._PROVIDER_DEFAULTS: - raise ValueError( - f"Unknown reranker provider: {provider}. 
" - f"Supported providers: {', '.join(sorted(self._PROVIDER_DEFAULTS))}" - ) - - defaults = self._PROVIDER_DEFAULTS[self.provider] - - # Load api_base from env with .env fallback - env_api_base = _get_env_with_fallback("RERANKER_API_BASE", self._workspace_root) - self.endpoint = defaults["endpoint"] - self.api_base = _normalize_api_base_for_endpoint( - api_base=(api_base or env_api_base or defaults["api_base"]), - endpoint=self.endpoint, - ) - - # Load model from env with .env fallback - env_model = _get_env_with_fallback("RERANKER_MODEL", self._workspace_root) - self.model_name = (model_name or env_model or defaults["default_model"]).strip() - if not self.model_name: - raise ValueError("model_name cannot be blank") - - # Load API key from env with .env fallback - resolved_key = api_key or _get_env_with_fallback(env_api_key, self._workspace_root) or "" - resolved_key = resolved_key.strip() - if not resolved_key: - raise ValueError( - f"Missing API key for reranker provider '{self.provider}'. " - f"Pass api_key=... or set ${env_api_key}." 
- ) - self._api_key = resolved_key - - self.timeout_s = float(timeout) if timeout and float(timeout) > 0 else 30.0 - self.max_retries = int(max_retries) if max_retries and int(max_retries) >= 0 else 3 - self.backoff_base_s = float(backoff_base_s) if backoff_base_s and float(backoff_base_s) > 0 else 0.5 - self.backoff_max_s = float(backoff_max_s) if backoff_max_s and float(backoff_max_s) > 0 else 8.0 - - headers = { - "Authorization": f"Bearer {self._api_key}", - "Content-Type": "application/json", - } - if self.provider == "cohere": - headers.setdefault("Cohere-Version", "2022-12-06") - - self._client = httpx.Client( - base_url=self.api_base, - headers=headers, - timeout=self.timeout_s, - ) - - # Store max_input_tokens with model-aware defaults - if max_input_tokens is not None: - self._max_input_tokens = max_input_tokens - else: - # Infer from model name - model_lower = self.model_name.lower() - if '8b' in model_lower or 'large' in model_lower: - self._max_input_tokens = 32768 - else: - self._max_input_tokens = 8192 - - @property - def max_input_tokens(self) -> int: - """Return maximum token limit for reranking.""" - return self._max_input_tokens - - def close(self) -> None: - try: - self._client.close() - except Exception: # pragma: no cover - defensive - return - - def _sleep_backoff(self, attempt: int, *, retry_after_s: float | None = None) -> None: - if retry_after_s is not None and retry_after_s > 0: - time.sleep(min(float(retry_after_s), self.backoff_max_s)) - return - - exp = self.backoff_base_s * (2**attempt) - jitter = random.uniform(0, min(0.5, self.backoff_base_s)) - time.sleep(min(self.backoff_max_s, exp + jitter)) - - @staticmethod - def _parse_retry_after_seconds(headers: Mapping[str, str]) -> float | None: - value = (headers.get("Retry-After") or "").strip() - if not value: - return None - try: - return float(value) - except ValueError: - return None - - @staticmethod - def _should_retry_status(status_code: int) -> bool: - return status_code == 429 
or 500 <= status_code <= 599 - - def _request_json(self, payload: Mapping[str, Any]) -> Mapping[str, Any]: - last_exc: Exception | None = None - - for attempt in range(self.max_retries + 1): - try: - response = self._client.post(self.endpoint, json=dict(payload)) - except Exception as exc: # httpx is optional at import-time - last_exc = exc - if attempt < self.max_retries: - self._sleep_backoff(attempt) - continue - raise RuntimeError( - f"Rerank request failed for provider '{self.provider}' after " - f"{self.max_retries + 1} attempts: {type(exc).__name__}: {exc}" - ) from exc - - status = int(getattr(response, "status_code", 0) or 0) - if status >= 400: - body_preview = "" - try: - body_preview = (response.text or "").strip() - except Exception: - body_preview = "" - if len(body_preview) > 300: - body_preview = body_preview[:300] + "…" - - if self._should_retry_status(status) and attempt < self.max_retries: - retry_after = self._parse_retry_after_seconds(response.headers) - logger.warning( - "Rerank request to %s%s failed with HTTP %s (attempt %s/%s). Retrying…", - self.api_base, - self.endpoint, - status, - attempt + 1, - self.max_retries + 1, - ) - self._sleep_backoff(attempt, retry_after_s=retry_after) - continue - - if status in {401, 403}: - raise RuntimeError( - f"Rerank request unauthorized for provider '{self.provider}' (HTTP {status}). " - "Check your API key." - ) - - raise RuntimeError( - f"Rerank request failed for provider '{self.provider}' (HTTP {status}). 
" - f"Response: {body_preview or ''}" - ) - - try: - data = response.json() - except Exception as exc: - raise RuntimeError( - f"Rerank response from provider '{self.provider}' is not valid JSON: " - f"{type(exc).__name__}: {exc}" - ) from exc - - if not isinstance(data, dict): - raise RuntimeError( - f"Rerank response from provider '{self.provider}' must be a JSON object; " - f"got {type(data).__name__}" - ) - - return data - - raise RuntimeError( - f"Rerank request failed for provider '{self.provider}'. Last error: {last_exc}" - ) - - @staticmethod - def _extract_scores_from_results(results: Any, expected: int) -> list[float]: - if not isinstance(results, list): - raise RuntimeError(f"Invalid rerank response: 'results' must be a list, got {type(results).__name__}") - - scores: list[float] = [0.0 for _ in range(expected)] - filled = 0 - - for item in results: - if not isinstance(item, dict): - continue - idx = item.get("index") - score = item.get("relevance_score", item.get("score")) - if idx is None or score is None: - continue - try: - idx_int = int(idx) - score_f = float(score) - except (TypeError, ValueError): - continue - if 0 <= idx_int < expected: - scores[idx_int] = score_f - filled += 1 - - if filled != expected: - raise RuntimeError( - f"Rerank response contained {filled}/{expected} scored documents; " - "ensure top_n matches the number of documents." - ) - - return scores - - def _build_payload(self, *, query: str, documents: Sequence[str]) -> Mapping[str, Any]: - payload: dict[str, Any] = { - "model": self.model_name, - "query": query, - "documents": list(documents), - "top_n": len(documents), - "return_documents": False, - } - return payload - - def _estimate_tokens(self, text: str) -> int: - """Estimate token count using fast heuristic. - - Uses len(text) // 4 as approximation (~4 chars per token for English). - Not perfectly accurate for all models/languages but sufficient for - batch sizing decisions where exact counts aren't critical. 
- """ - return len(text) // 4 - - def _create_token_aware_batches( - self, - query: str, - documents: Sequence[str], - ) -> list[list[tuple[int, str]]]: - """Split documents into batches that fit within token limits. - - Uses 90% of max_input_tokens as safety margin. - Each batch includes the query tokens overhead. - """ - max_tokens = int(self._max_input_tokens * 0.9) - query_tokens = self._estimate_tokens(query) - - batches: list[list[tuple[int, str]]] = [] - current_batch: list[tuple[int, str]] = [] - current_tokens = query_tokens # Start with query overhead - - for idx, doc in enumerate(documents): - doc_tokens = self._estimate_tokens(doc) - - # Warn if single document exceeds token limit (will be truncated by API) - if doc_tokens > max_tokens - query_tokens: - logger.warning( - f"Document {idx} exceeds token limit: ~{doc_tokens} tokens " - f"(limit: {max_tokens - query_tokens} after query overhead). " - "Document will likely be truncated by the API." - ) - - # If batch would exceed limit, start new batch - if current_tokens + doc_tokens > max_tokens and current_batch: - batches.append(current_batch) - current_batch = [] - current_tokens = query_tokens - - current_batch.append((idx, doc)) - current_tokens += doc_tokens - - if current_batch: - batches.append(current_batch) - - return batches - - def _rerank_one_query(self, *, query: str, documents: Sequence[str]) -> list[float]: - if not documents: - return [] - - # Create token-aware batches - batches = self._create_token_aware_batches(query, documents) - - if len(batches) == 1: - # Single batch - original behavior - payload = self._build_payload(query=query, documents=documents) - data = self._request_json(payload) - results = data.get("results") - return self._extract_scores_from_results(results, expected=len(documents)) - - # Multiple batches - process each and merge results - logger.info( - f"Splitting {len(documents)} documents into {len(batches)} batches " - f"(max_input_tokens: {self._max_input_tokens})" 
- ) - - all_scores: list[float] = [0.0] * len(documents) - - for batch in batches: - batch_docs = [doc for _, doc in batch] - payload = self._build_payload(query=query, documents=batch_docs) - data = self._request_json(payload) - results = data.get("results") - batch_scores = self._extract_scores_from_results(results, expected=len(batch_docs)) - - # Map scores back to original indices - for (orig_idx, _), score in zip(batch, batch_scores): - all_scores[orig_idx] = score - - return all_scores - - def score_pairs( - self, - pairs: Sequence[tuple[str, str]], - *, - batch_size: int = 32, # noqa: ARG002 - kept for BaseReranker compatibility - ) -> list[float]: - if not pairs: - return [] - - grouped: dict[str, list[tuple[int, str]]] = {} - for idx, (query, doc) in enumerate(pairs): - grouped.setdefault(str(query), []).append((idx, str(doc))) - - scores: list[float] = [0.0 for _ in range(len(pairs))] - - for query, items in grouped.items(): - documents = [doc for _, doc in items] - query_scores = self._rerank_one_query(query=query, documents=documents) - for (orig_idx, _), score in zip(items, query_scores): - scores[orig_idx] = float(score) - - return scores diff --git a/codex-lens/src/codexlens/semantic/reranker/base.py b/codex-lens/src/codexlens/semantic/reranker/base.py deleted file mode 100644 index 65c2d837..00000000 --- a/codex-lens/src/codexlens/semantic/reranker/base.py +++ /dev/null @@ -1,46 +0,0 @@ -"""Base class for rerankers. - -Defines the interface that all rerankers must implement. -""" - -from __future__ import annotations - -from abc import ABC, abstractmethod -from typing import Sequence - - -class BaseReranker(ABC): - """Base class for all rerankers. - - All reranker implementations must inherit from this class and implement - the abstract methods to ensure a consistent interface. - """ - - @property - def max_input_tokens(self) -> int: - """Return maximum token limit for reranking. 
- - Returns: - int: Maximum number of tokens that can be processed at once. - Default is 8192 if not overridden by implementation. - """ - return 8192 - - @abstractmethod - def score_pairs( - self, - pairs: Sequence[tuple[str, str]], - *, - batch_size: int = 32, - ) -> list[float]: - """Score (query, doc) pairs. - - Args: - pairs: Sequence of (query, doc) string pairs to score. - batch_size: Batch size for scoring. - - Returns: - List of scores (one per pair). - """ - ... - diff --git a/codex-lens/src/codexlens/semantic/reranker/factory.py b/codex-lens/src/codexlens/semantic/reranker/factory.py deleted file mode 100644 index 459034b5..00000000 --- a/codex-lens/src/codexlens/semantic/reranker/factory.py +++ /dev/null @@ -1,159 +0,0 @@ -"""Factory for creating rerankers. - -Provides a unified interface for instantiating different reranker backends. -""" - -from __future__ import annotations - -from typing import Any - -from .base import BaseReranker - - -def check_reranker_available(backend: str) -> tuple[bool, str | None]: - """Check whether a specific reranker backend can be used. - - Notes: - - "fastembed" uses fastembed TextCrossEncoder (pip install fastembed>=0.4.0). [Recommended] - - "onnx" uses Optimum + ONNX Runtime (pip install onnxruntime optimum[onnxruntime] transformers). - - "legacy" uses sentence-transformers CrossEncoder (pip install codexlens[reranker-legacy]). - - "api" uses a remote reranking HTTP API (requires httpx). - - "litellm" uses `ccw-litellm` for unified access to LLM providers. 
- """ - backend = (backend or "").strip().lower() - - if backend == "legacy": - from .legacy import check_cross_encoder_available - - return check_cross_encoder_available() - - if backend == "fastembed": - from .fastembed_reranker import check_fastembed_reranker_available - - return check_fastembed_reranker_available() - - if backend == "onnx": - from .onnx_reranker import check_onnx_reranker_available - - return check_onnx_reranker_available() - - if backend == "litellm": - try: - import ccw_litellm # noqa: F401 - except ImportError as exc: # pragma: no cover - optional dependency - return ( - False, - f"ccw-litellm not available: {exc}. Install with: pip install ccw-litellm", - ) - - try: - from .litellm_reranker import LiteLLMReranker # noqa: F401 - except Exception as exc: # pragma: no cover - defensive - return False, f"LiteLLM reranker backend not available: {exc}" - - return True, None - - if backend == "api": - from .api_reranker import check_httpx_available - - return check_httpx_available() - - return False, ( - f"Invalid reranker backend: {backend}. " - "Must be 'fastembed', 'onnx', 'api', 'litellm', or 'legacy'." - ) - - -def get_reranker( - backend: str = "onnx", - model_name: str | None = None, - *, - device: str | None = None, - **kwargs: Any, -) -> BaseReranker: - """Factory function to create reranker based on backend. - - Args: - backend: Reranker backend to use. Options: - - "onnx": Optimum + ONNX Runtime backend (default) - - "fastembed": FastEmbed TextCrossEncoder backend - - "api": HTTP API backend (remote providers) - - "litellm": LiteLLM backend (LLM-based, for API mode) - - "legacy": sentence-transformers CrossEncoder backend (optional) - model_name: Model identifier for model-based backends. 
Defaults depend on backend: - - onnx: Xenova/ms-marco-MiniLM-L-6-v2 - - fastembed: Xenova/ms-marco-MiniLM-L-6-v2 - - api: BAAI/bge-reranker-v2-m3 (SiliconFlow) - - legacy: cross-encoder/ms-marco-MiniLM-L-6-v2 - - litellm: default - device: Optional device string for backends that support it (legacy and onnx). - **kwargs: Additional backend-specific arguments. - - Returns: - BaseReranker: Configured reranker instance. - - Raises: - ValueError: If backend is not recognized. - ImportError: If required backend dependencies are not installed or backend is unavailable. - """ - backend = (backend or "").strip().lower() - - if backend == "fastembed": - ok, err = check_reranker_available("fastembed") - if not ok: - raise ImportError(err) - - from .fastembed_reranker import FastEmbedReranker - - resolved_model_name = (model_name or "").strip() or FastEmbedReranker.DEFAULT_MODEL - _ = device # Device selection is managed via fastembed providers. - return FastEmbedReranker(model_name=resolved_model_name, **kwargs) - - if backend == "onnx": - ok, err = check_reranker_available("onnx") - if not ok: - raise ImportError(err) - - from .onnx_reranker import ONNXReranker - - resolved_model_name = (model_name or "").strip() or ONNXReranker.DEFAULT_MODEL - effective_kwargs = dict(kwargs) - if "use_gpu" not in effective_kwargs and device is not None: - effective_kwargs["use_gpu"] = str(device).strip().lower() not in {"cpu", "none"} - return ONNXReranker(model_name=resolved_model_name, **effective_kwargs) - - if backend == "legacy": - ok, err = check_reranker_available("legacy") - if not ok: - raise ImportError(err) - - from .legacy import CrossEncoderReranker - - resolved_model_name = (model_name or "").strip() or "cross-encoder/ms-marco-MiniLM-L-6-v2" - return CrossEncoderReranker(model_name=resolved_model_name, device=device) - - if backend == "litellm": - ok, err = check_reranker_available("litellm") - if not ok: - raise ImportError(err) - - from .litellm_reranker import 
LiteLLMReranker - - _ = device # Device selection is not applicable to remote LLM backends. - resolved_model_name = (model_name or "").strip() or "default" - return LiteLLMReranker(model=resolved_model_name, **kwargs) - - if backend == "api": - ok, err = check_reranker_available("api") - if not ok: - raise ImportError(err) - - from .api_reranker import APIReranker - - _ = device # Device selection is not applicable to remote HTTP backends. - resolved_model_name = (model_name or "").strip() or None - return APIReranker(model_name=resolved_model_name, **kwargs) - - raise ValueError( - f"Unknown backend: {backend}. Supported backends: 'fastembed', 'onnx', 'api', 'litellm', 'legacy'" - ) diff --git a/codex-lens/src/codexlens/semantic/reranker/fastembed_reranker.py b/codex-lens/src/codexlens/semantic/reranker/fastembed_reranker.py deleted file mode 100644 index c38d4aa0..00000000 --- a/codex-lens/src/codexlens/semantic/reranker/fastembed_reranker.py +++ /dev/null @@ -1,257 +0,0 @@ -"""FastEmbed-based reranker backend. - -This reranker uses fastembed's TextCrossEncoder for cross-encoder reranking. -FastEmbed is ONNX-based internally but provides a cleaner, unified API. - -Install: - pip install fastembed>=0.4.0 -""" - -from __future__ import annotations - -import logging -import threading -from typing import Any, Sequence - -from .base import BaseReranker - -logger = logging.getLogger(__name__) - - -def check_fastembed_reranker_available() -> tuple[bool, str | None]: - """Check whether fastembed reranker dependencies are available.""" - try: - import fastembed # noqa: F401 - except ImportError as exc: # pragma: no cover - optional dependency - return ( - False, - f"fastembed not available: {exc}. Install with: pip install fastembed>=0.4.0", - ) - - try: - from fastembed.rerank.cross_encoder import TextCrossEncoder # noqa: F401 - except ImportError as exc: # pragma: no cover - optional dependency - return ( - False, - f"fastembed TextCrossEncoder not available: {exc}. 
" - "Upgrade with: pip install fastembed>=0.4.0", - ) - - return True, None - - -class FastEmbedReranker(BaseReranker): - """Cross-encoder reranker using fastembed's TextCrossEncoder with lazy loading.""" - - DEFAULT_MODEL = "Xenova/ms-marco-MiniLM-L-6-v2" - - # Alternative models supported by fastembed: - # - "BAAI/bge-reranker-base" - # - "BAAI/bge-reranker-large" - # - "cross-encoder/ms-marco-MiniLM-L-6-v2" - - def __init__( - self, - model_name: str | None = None, - *, - use_gpu: bool = True, - cache_dir: str | None = None, - threads: int | None = None, - ) -> None: - """Initialize FastEmbed reranker. - - Args: - model_name: Model identifier. Defaults to Xenova/ms-marco-MiniLM-L-6-v2. - use_gpu: Whether to use GPU acceleration when available. - cache_dir: Optional directory for caching downloaded models. - threads: Optional number of threads for ONNX Runtime. - """ - self.model_name = (model_name or self.DEFAULT_MODEL).strip() - if not self.model_name: - raise ValueError("model_name cannot be blank") - - self.use_gpu = bool(use_gpu) - self.cache_dir = cache_dir - self.threads = threads - - self._encoder: Any | None = None - self._lock = threading.RLock() - - def _load_model(self) -> None: - """Lazy-load the TextCrossEncoder model.""" - if self._encoder is not None: - return - - ok, err = check_fastembed_reranker_available() - if not ok: - raise ImportError(err) - - with self._lock: - if self._encoder is not None: - return - - from fastembed.rerank.cross_encoder import TextCrossEncoder - - # Determine providers based on GPU preference - providers: list[str] | None = None - if self.use_gpu: - try: - from ..gpu_support import get_optimal_providers - - providers = get_optimal_providers(use_gpu=True, with_device_options=False) - except Exception: - # Fallback: let fastembed decide - providers = None - - # Build initialization kwargs - init_kwargs: dict[str, Any] = {} - if self.cache_dir: - init_kwargs["cache_dir"] = self.cache_dir - if self.threads is not None: - 
init_kwargs["threads"] = self.threads - if providers: - init_kwargs["providers"] = providers - - logger.debug( - "Loading FastEmbed reranker model: %s (use_gpu=%s)", - self.model_name, - self.use_gpu, - ) - - self._encoder = TextCrossEncoder( - model_name=self.model_name, - **init_kwargs, - ) - - logger.debug("FastEmbed reranker model loaded successfully") - - @staticmethod - def _sigmoid(x: float) -> float: - """Numerically stable sigmoid function.""" - if x < -709: - return 0.0 - if x > 709: - return 1.0 - import math - return 1.0 / (1.0 + math.exp(-x)) - - def score_pairs( - self, - pairs: Sequence[tuple[str, str]], - *, - batch_size: int = 32, - ) -> list[float]: - """Score (query, doc) pairs. - - Args: - pairs: Sequence of (query, doc) string pairs to score. - batch_size: Batch size for scoring. - - Returns: - List of scores (one per pair), normalized to [0, 1] range. - """ - if not pairs: - return [] - - self._load_model() - - if self._encoder is None: # pragma: no cover - defensive - return [] - - # FastEmbed's TextCrossEncoder.rerank() expects a query and list of documents. - # For batch scoring of multiple query-doc pairs, we need to process them. - # Group by query for efficiency when same query appears multiple times. 
- query_to_docs: dict[str, list[tuple[int, str]]] = {} - for idx, (query, doc) in enumerate(pairs): - if query not in query_to_docs: - query_to_docs[query] = [] - query_to_docs[query].append((idx, doc)) - - # Score each query group - scores: list[float] = [0.0] * len(pairs) - - for query, indexed_docs in query_to_docs.items(): - docs = [doc for _, doc in indexed_docs] - indices = [idx for idx, _ in indexed_docs] - - try: - # TextCrossEncoder.rerank returns raw float scores in same order as input - raw_scores = list( - self._encoder.rerank( - query=query, - documents=docs, - batch_size=batch_size, - ) - ) - - # Map scores back to original positions and normalize with sigmoid - for i, raw_score in enumerate(raw_scores): - if i < len(indices): - original_idx = indices[i] - # Normalize score to [0, 1] using stable sigmoid - scores[original_idx] = self._sigmoid(float(raw_score)) - - except Exception as e: - logger.warning("FastEmbed rerank failed for query: %s", str(e)[:100]) - # Leave scores as 0.0 for failed queries - - return scores - - def rerank( - self, - query: str, - documents: Sequence[str], - *, - top_k: int | None = None, - batch_size: int = 32, - ) -> list[tuple[float, str, int]]: - """Rerank documents for a single query. - - This is a convenience method that provides results in ranked order. - - Args: - query: The query string. - documents: List of documents to rerank. - top_k: Return only top K results. None returns all. - batch_size: Batch size for scoring. - - Returns: - List of (score, document, original_index) tuples, sorted by score descending. 
- """ - if not documents: - return [] - - self._load_model() - - if self._encoder is None: # pragma: no cover - defensive - return [] - - try: - # TextCrossEncoder.rerank returns raw float scores in same order as input - raw_scores = list( - self._encoder.rerank( - query=query, - documents=list(documents), - batch_size=batch_size, - ) - ) - - # Convert to our format: (normalized_score, document, original_index) - ranked = [] - for idx, raw_score in enumerate(raw_scores): - if idx < len(documents): - # Normalize score to [0, 1] using stable sigmoid - normalized = self._sigmoid(float(raw_score)) - ranked.append((normalized, documents[idx], idx)) - - # Sort by score descending - ranked.sort(key=lambda x: x[0], reverse=True) - - if top_k is not None and top_k > 0: - ranked = ranked[:top_k] - - return ranked - - except Exception as e: - logger.warning("FastEmbed rerank failed: %s", str(e)[:100]) - return [] diff --git a/codex-lens/src/codexlens/semantic/reranker/legacy.py b/codex-lens/src/codexlens/semantic/reranker/legacy.py deleted file mode 100644 index a5ee05de..00000000 --- a/codex-lens/src/codexlens/semantic/reranker/legacy.py +++ /dev/null @@ -1,91 +0,0 @@ -"""Legacy sentence-transformers cross-encoder reranker. 
- -Install with: pip install codexlens[reranker-legacy] -""" - -from __future__ import annotations - -import logging -import threading -from typing import List, Sequence, Tuple - -from .base import BaseReranker - -logger = logging.getLogger(__name__) - -try: - from sentence_transformers import CrossEncoder as _CrossEncoder - - CROSS_ENCODER_AVAILABLE = True - _import_error: str | None = None -except ImportError as exc: # pragma: no cover - optional dependency - _CrossEncoder = None # type: ignore[assignment] - CROSS_ENCODER_AVAILABLE = False - _import_error = str(exc) - - -def check_cross_encoder_available() -> tuple[bool, str | None]: - if CROSS_ENCODER_AVAILABLE: - return True, None - return ( - False, - _import_error - or "sentence-transformers not available. Install with: pip install codexlens[reranker-legacy]", - ) - - -class CrossEncoderReranker(BaseReranker): - """Cross-encoder reranker with lazy model loading.""" - - def __init__(self, model_name: str, *, device: str | None = None) -> None: - self.model_name = (model_name or "").strip() - if not self.model_name: - raise ValueError("model_name cannot be blank") - - self.device = (device or "").strip() or None - self._model = None - self._lock = threading.RLock() - - def _load_model(self) -> None: - if self._model is not None: - return - - ok, err = check_cross_encoder_available() - if not ok: - raise ImportError(err) - - with self._lock: - if self._model is not None: - return - - try: - if self.device: - self._model = _CrossEncoder(self.model_name, device=self.device) # type: ignore[misc] - else: - self._model = _CrossEncoder(self.model_name) # type: ignore[misc] - except Exception as exc: - logger.debug("Failed to load cross-encoder model %s: %s", self.model_name, exc) - raise - - def score_pairs( - self, - pairs: Sequence[Tuple[str, str]], - *, - batch_size: int = 32, - ) -> List[float]: - """Score (query, doc) pairs using the cross-encoder. 
- - Returns: - List of scores (one per pair) in the model's native scale (usually logits). - """ - if not pairs: - return [] - - self._load_model() - - if self._model is None: # pragma: no cover - defensive - return [] - - bs = int(batch_size) if batch_size and int(batch_size) > 0 else 32 - scores = self._model.predict(list(pairs), batch_size=bs) # type: ignore[union-attr] - return [float(s) for s in scores] diff --git a/codex-lens/src/codexlens/semantic/reranker/litellm_reranker.py b/codex-lens/src/codexlens/semantic/reranker/litellm_reranker.py deleted file mode 100644 index ec735994..00000000 --- a/codex-lens/src/codexlens/semantic/reranker/litellm_reranker.py +++ /dev/null @@ -1,214 +0,0 @@ -"""Experimental LiteLLM reranker backend. - -This module provides :class:`LiteLLMReranker`, which uses an LLM to score the -relevance of a single (query, document) pair per request. - -Notes: - - This backend is experimental and may be slow/expensive compared to local - rerankers. - - It relies on `ccw-litellm` for a unified LLM API across providers. -""" - -from __future__ import annotations - -import json -import logging -import re -import threading -import time -from typing import Any, Sequence - -from .base import BaseReranker - -logger = logging.getLogger(__name__) - -_NUMBER_RE = re.compile(r"[-+]?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?") - - -def _coerce_score_to_unit_interval(score: float) -> float: - """Coerce a numeric score into [0, 1]. - - The prompt asks for a float in [0, 1], but some models may respond with 0-10 - or 0-100 scales. This function attempts a conservative normalization. - """ - if 0.0 <= score <= 1.0: - return score - if 0.0 <= score <= 10.0: - return score / 10.0 - if 0.0 <= score <= 100.0: - return score / 100.0 - return max(0.0, min(1.0, score)) - - -def _extract_score(text: str) -> float | None: - """Extract a numeric relevance score from an LLM response.""" - content = (text or "").strip() - if not content: - return None - - # Prefer JSON if present. 
- if "{" in content and "}" in content: - try: - start = content.index("{") - end = content.rindex("}") + 1 - payload = json.loads(content[start:end]) - if isinstance(payload, dict) and "score" in payload: - return float(payload["score"]) - except Exception: - pass - - match = _NUMBER_RE.search(content) - if not match: - return None - try: - return float(match.group(0)) - except ValueError: - return None - - -class LiteLLMReranker(BaseReranker): - """Experimental reranker that uses a LiteLLM-compatible model. - - This reranker scores each (query, doc) pair in isolation (single-pair mode) - to improve prompt reliability across providers. - """ - - _SYSTEM_PROMPT = ( - "You are a relevance scoring assistant.\n" - "Given a search query and a document snippet, output a single numeric " - "relevance score between 0 and 1.\n\n" - "Scoring guidance:\n" - "- 1.0: The document directly answers the query.\n" - "- 0.5: The document is partially relevant.\n" - "- 0.0: The document is unrelated.\n\n" - "Output requirements:\n" - "- Output ONLY the number (e.g., 0.73).\n" - "- Do not include any other text." - ) - - def __init__( - self, - model: str = "default", - *, - requests_per_minute: float | None = None, - min_interval_seconds: float | None = None, - default_score: float = 0.0, - max_doc_chars: int = 8000, - **litellm_kwargs: Any, - ) -> None: - """Initialize the reranker. - - Args: - model: Model name from ccw-litellm configuration (default: "default"). - requests_per_minute: Optional rate limit in requests per minute. - min_interval_seconds: Optional minimum interval between requests. If set, - it takes precedence over requests_per_minute. - default_score: Score to use when an API call fails or parsing fails. - max_doc_chars: Maximum number of document characters to include in the prompt. - **litellm_kwargs: Passed through to `ccw_litellm.LiteLLMClient`. - - Raises: - ImportError: If ccw-litellm is not installed. - ValueError: If model is blank. 
- """ - self.model_name = (model or "").strip() - if not self.model_name: - raise ValueError("model cannot be blank") - - self.default_score = float(default_score) - - self.max_doc_chars = int(max_doc_chars) if int(max_doc_chars) > 0 else 0 - - if min_interval_seconds is not None: - self._min_interval_seconds = max(0.0, float(min_interval_seconds)) - elif requests_per_minute is not None and float(requests_per_minute) > 0: - self._min_interval_seconds = 60.0 / float(requests_per_minute) - else: - self._min_interval_seconds = 0.0 - - # Prefer deterministic output by default; allow overrides via kwargs. - litellm_kwargs = dict(litellm_kwargs) - litellm_kwargs.setdefault("temperature", 0.0) - litellm_kwargs.setdefault("max_tokens", 16) - - try: - from ccw_litellm import ChatMessage, LiteLLMClient - except ImportError as exc: # pragma: no cover - optional dependency - raise ImportError( - "ccw-litellm not installed. Install with: pip install ccw-litellm" - ) from exc - - self._ChatMessage = ChatMessage - self._client = LiteLLMClient(model=self.model_name, **litellm_kwargs) - - self._lock = threading.RLock() - self._last_request_at = 0.0 - - def _sanitize_text(self, text: str) -> str: - # Keep consistent with LiteLLMEmbedderWrapper workaround. 
- if text.startswith("import"): - return " " + text - return text - - def _rate_limit(self) -> None: - if self._min_interval_seconds <= 0: - return - with self._lock: - now = time.monotonic() - elapsed = now - self._last_request_at - if elapsed < self._min_interval_seconds: - time.sleep(self._min_interval_seconds - elapsed) - self._last_request_at = time.monotonic() - - def _build_user_prompt(self, query: str, doc: str) -> str: - sanitized_query = self._sanitize_text(query or "") - sanitized_doc = self._sanitize_text(doc or "") - if self.max_doc_chars and len(sanitized_doc) > self.max_doc_chars: - sanitized_doc = sanitized_doc[: self.max_doc_chars] - - return ( - "Query:\n" - f"{sanitized_query}\n\n" - "Document:\n" - f"{sanitized_doc}\n\n" - "Return the relevance score (0 to 1) as a single number:" - ) - - def _score_single_pair(self, query: str, doc: str) -> float: - messages = [ - self._ChatMessage(role="system", content=self._SYSTEM_PROMPT), - self._ChatMessage(role="user", content=self._build_user_prompt(query, doc)), - ] - - try: - self._rate_limit() - response = self._client.chat(messages) - except Exception as exc: - logger.debug("LiteLLM reranker request failed: %s", exc) - return self.default_score - - raw = getattr(response, "content", "") or "" - score = _extract_score(raw) - if score is None: - logger.debug("Failed to parse LiteLLM reranker score from response: %r", raw) - return self.default_score - return _coerce_score_to_unit_interval(float(score)) - - def score_pairs( - self, - pairs: Sequence[tuple[str, str]], - *, - batch_size: int = 32, - ) -> list[float]: - """Score (query, doc) pairs with per-pair LLM calls.""" - if not pairs: - return [] - - bs = int(batch_size) if batch_size and int(batch_size) > 0 else 32 - - scores: list[float] = [] - for i in range(0, len(pairs), bs): - batch = pairs[i : i + bs] - for query, doc in batch: - scores.append(self._score_single_pair(query, doc)) - return scores diff --git 
a/codex-lens/src/codexlens/semantic/reranker/onnx_reranker.py b/codex-lens/src/codexlens/semantic/reranker/onnx_reranker.py deleted file mode 100644 index a56fb953..00000000 --- a/codex-lens/src/codexlens/semantic/reranker/onnx_reranker.py +++ /dev/null @@ -1,302 +0,0 @@ -"""Optimum + ONNX Runtime reranker backend. - -This reranker uses Hugging Face Optimum's ONNXRuntime backend for sequence -classification models. It is designed to run without requiring PyTorch at -runtime by using numpy tensors and ONNX Runtime execution providers. - -Install (CPU): - pip install onnxruntime optimum[onnxruntime] transformers -""" - -from __future__ import annotations - -import logging -import threading -from typing import Any, Iterable, Sequence - -from .base import BaseReranker - -logger = logging.getLogger(__name__) - - -def check_onnx_reranker_available() -> tuple[bool, str | None]: - """Check whether Optimum + ONNXRuntime reranker dependencies are available.""" - try: - import numpy # noqa: F401 - except ImportError as exc: # pragma: no cover - optional dependency - return False, f"numpy not available: {exc}. Install with: pip install numpy" - - try: - import onnxruntime # noqa: F401 - except ImportError as exc: # pragma: no cover - optional dependency - return ( - False, - f"onnxruntime not available: {exc}. Install with: pip install onnxruntime", - ) - - try: - from optimum.onnxruntime import ORTModelForSequenceClassification # noqa: F401 - except ImportError as exc: # pragma: no cover - optional dependency - return ( - False, - f"optimum[onnxruntime] not available: {exc}. Install with: pip install optimum[onnxruntime]", - ) - - try: - from transformers import AutoTokenizer # noqa: F401 - except ImportError as exc: # pragma: no cover - optional dependency - return ( - False, - f"transformers not available: {exc}. 
Install with: pip install transformers", - ) - - return True, None - - -def _iter_batches(items: Sequence[Any], batch_size: int) -> Iterable[Sequence[Any]]: - for i in range(0, len(items), batch_size): - yield items[i : i + batch_size] - - -def _normalize_provider_specs( - providers: Sequence[Any] | None, -) -> tuple[list[str], list[dict[str, Any]]]: - """Split execution-provider specs into Optimum-compatible names and options.""" - normalized_providers: list[str] = [] - normalized_options: list[dict[str, Any]] = [] - - for provider in providers or (): - provider_name: str | None = None - provider_options: dict[str, Any] = {} - - if isinstance(provider, tuple): - if provider: - provider_name = str(provider[0]).strip() - if len(provider) > 1 and isinstance(provider[1], dict): - provider_options = dict(provider[1]) - elif provider is not None: - provider_name = str(provider).strip() - - if not provider_name: - continue - - normalized_providers.append(provider_name) - normalized_options.append(provider_options) - - if not normalized_providers: - normalized_providers.append("CPUExecutionProvider") - normalized_options.append({}) - - return normalized_providers, normalized_options - - -class ONNXReranker(BaseReranker): - """Cross-encoder reranker using Optimum + ONNX Runtime with lazy loading.""" - - DEFAULT_MODEL = "Xenova/ms-marco-MiniLM-L-6-v2" - - def __init__( - self, - model_name: str | None = None, - *, - use_gpu: bool = True, - providers: list[Any] | None = None, - max_length: int | None = None, - ) -> None: - self.model_name = (model_name or self.DEFAULT_MODEL).strip() - if not self.model_name: - raise ValueError("model_name cannot be blank") - - self.use_gpu = bool(use_gpu) - self.providers = providers - - self.max_length = int(max_length) if max_length is not None else None - - self._tokenizer: Any | None = None - self._model: Any | None = None - self._model_input_names: set[str] | None = None - self._lock = threading.RLock() - - def _load_model(self) -> 
None: - if self._model is not None and self._tokenizer is not None: - return - - ok, err = check_onnx_reranker_available() - if not ok: - raise ImportError(err) - - with self._lock: - if self._model is not None and self._tokenizer is not None: - return - - from inspect import signature - - from optimum.onnxruntime import ORTModelForSequenceClassification - from transformers import AutoTokenizer - - if self.providers is None: - from ..gpu_support import get_optimal_providers - - # Include device_id options for DirectML/CUDA selection when available. - self.providers = get_optimal_providers( - use_gpu=self.use_gpu, with_device_options=True - ) - - provider_names, provider_options = _normalize_provider_specs(self.providers) - - # Some Optimum versions accept `providers`, others accept a single `provider`. - # Prefer passing the full providers list, with a conservative fallback. - model_kwargs: dict[str, Any] = {} - try: - params = signature(ORTModelForSequenceClassification.from_pretrained).parameters - if "providers" in params: - model_kwargs["providers"] = provider_names - if "provider_options" in params: - model_kwargs["provider_options"] = provider_options - elif "provider" in params: - model_kwargs["provider"] = provider_names[0] - if "provider_options" in params and provider_options[0]: - model_kwargs["provider_options"] = provider_options[0] - except Exception: - model_kwargs = {} - - try: - self._model = ORTModelForSequenceClassification.from_pretrained( - self.model_name, - **model_kwargs, - ) - except TypeError: - # Fallback for older Optimum versions: retry without provider arguments. - self._model = ORTModelForSequenceClassification.from_pretrained(self.model_name) - - self._tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_fast=True) - - # Cache model input names to filter tokenizer outputs defensively. 
- input_names: set[str] | None = None - for attr in ("input_names", "model_input_names"): - names = getattr(self._model, attr, None) - if isinstance(names, (list, tuple)) and names: - input_names = {str(n) for n in names} - break - if input_names is None: - try: - session = getattr(self._model, "model", None) - if session is not None and hasattr(session, "get_inputs"): - input_names = {i.name for i in session.get_inputs()} - except Exception: - input_names = None - self._model_input_names = input_names - - @staticmethod - def _sigmoid(x: "Any") -> "Any": - import numpy as np - - x = np.clip(x, -50.0, 50.0) - return 1.0 / (1.0 + np.exp(-x)) - - @staticmethod - def _select_relevance_logit(logits: "Any") -> "Any": - import numpy as np - - arr = np.asarray(logits) - if arr.ndim == 0: - return arr.reshape(1) - if arr.ndim == 1: - return arr - if arr.ndim >= 2: - # Common cases: - # - Regression: (batch, 1) - # - Binary classification: (batch, 2) - if arr.shape[-1] == 1: - return arr[..., 0] - if arr.shape[-1] == 2: - # Convert 2-logit softmax into a single logit via difference. 
- return arr[..., 1] - arr[..., 0] - return arr.max(axis=-1) - return arr.reshape(-1) - - def _tokenize_batch(self, batch: Sequence[tuple[str, str]]) -> dict[str, Any]: - if self._tokenizer is None: - raise RuntimeError("Tokenizer not loaded") # pragma: no cover - defensive - - queries = [q for q, _ in batch] - docs = [d for _, d in batch] - - tokenizer_kwargs: dict[str, Any] = { - "text": queries, - "text_pair": docs, - "padding": True, - "truncation": True, - "return_tensors": "np", - } - - max_len = self.max_length - if max_len is None: - try: - model_max = int(getattr(self._tokenizer, "model_max_length", 0) or 0) - if 0 < model_max < 10_000: - max_len = model_max - else: - max_len = 512 - except Exception: - max_len = 512 - if max_len is not None and max_len > 0: - tokenizer_kwargs["max_length"] = int(max_len) - - encoded = self._tokenizer(**tokenizer_kwargs) - inputs = dict(encoded) - - # Some models do not accept token_type_ids; filter to known input names if available. - if self._model_input_names: - inputs = {k: v for k, v in inputs.items() if k in self._model_input_names} - - return inputs - - def _forward_logits(self, inputs: dict[str, Any]) -> Any: - if self._model is None: - raise RuntimeError("Model not loaded") # pragma: no cover - defensive - - outputs = self._model(**inputs) - if hasattr(outputs, "logits"): - return outputs.logits - if isinstance(outputs, dict) and "logits" in outputs: - return outputs["logits"] - if isinstance(outputs, (list, tuple)) and outputs: - return outputs[0] - raise RuntimeError("Unexpected model output format") # pragma: no cover - defensive - - def score_pairs( - self, - pairs: Sequence[tuple[str, str]], - *, - batch_size: int = 32, - ) -> list[float]: - """Score (query, doc) pairs with sigmoid-normalized outputs in [0, 1].""" - if not pairs: - return [] - - self._load_model() - - if self._model is None or self._tokenizer is None: # pragma: no cover - defensive - return [] - - import numpy as np - - bs = int(batch_size) 
if batch_size and int(batch_size) > 0 else 32 - scores: list[float] = [] - - for batch in _iter_batches(list(pairs), bs): - inputs = self._tokenize_batch(batch) - logits = self._forward_logits(inputs) - rel_logits = self._select_relevance_logit(logits) - probs = self._sigmoid(rel_logits) - probs = np.clip(probs, 0.0, 1.0) - scores.extend([float(p) for p in probs.reshape(-1).tolist()]) - - if len(scores) != len(pairs): - logger.debug( - "ONNX reranker produced %d scores for %d pairs", len(scores), len(pairs) - ) - return scores[: len(pairs)] - - return scores diff --git a/codex-lens/src/codexlens/semantic/rotational_embedder.py b/codex-lens/src/codexlens/semantic/rotational_embedder.py deleted file mode 100644 index ff0f41ac..00000000 --- a/codex-lens/src/codexlens/semantic/rotational_embedder.py +++ /dev/null @@ -1,434 +0,0 @@ -"""Rotational embedder for multi-endpoint API load balancing. - -Provides intelligent load balancing across multiple LiteLLM embedding endpoints -to maximize throughput while respecting rate limits. -""" - -from __future__ import annotations - -import logging -import random -import threading -import time -from dataclasses import dataclass, field -from enum import Enum -from typing import Any, Dict, Iterable, List, Optional - -import numpy as np - -from .base import BaseEmbedder - -logger = logging.getLogger(__name__) - - -class EndpointStatus(Enum): - """Status of an API endpoint.""" - AVAILABLE = "available" - COOLING = "cooling" # Rate limited, temporarily unavailable - FAILED = "failed" # Permanent failure (auth error, etc.) 
- - -class SelectionStrategy(Enum): - """Strategy for selecting endpoints.""" - ROUND_ROBIN = "round_robin" - LATENCY_AWARE = "latency_aware" - WEIGHTED_RANDOM = "weighted_random" - - -@dataclass -class EndpointConfig: - """Configuration for a single API endpoint.""" - model: str - api_key: Optional[str] = None - api_base: Optional[str] = None - weight: float = 1.0 # Higher weight = more requests - max_concurrent: int = 4 # Max concurrent requests to this endpoint - - -@dataclass -class EndpointState: - """Runtime state for an endpoint.""" - config: EndpointConfig - embedder: Any = None # LiteLLMEmbedderWrapper instance - - # Health metrics - status: EndpointStatus = EndpointStatus.AVAILABLE - cooldown_until: float = 0.0 # Unix timestamp when cooldown ends - - # Performance metrics - total_requests: int = 0 - total_failures: int = 0 - avg_latency_ms: float = 0.0 - last_latency_ms: float = 0.0 - - # Concurrency tracking - active_requests: int = 0 - lock: threading.Lock = field(default_factory=threading.Lock) - - def is_available(self) -> bool: - """Check if endpoint is available for requests.""" - if self.status == EndpointStatus.FAILED: - return False - if self.status == EndpointStatus.COOLING: - if time.time() >= self.cooldown_until: - self.status = EndpointStatus.AVAILABLE - return True - return False - return True - - def set_cooldown(self, seconds: float) -> None: - """Put endpoint in cooldown state.""" - self.status = EndpointStatus.COOLING - self.cooldown_until = time.time() + seconds - logger.warning(f"Endpoint {self.config.model} cooling down for {seconds:.1f}s") - - def mark_failed(self) -> None: - """Mark endpoint as permanently failed.""" - self.status = EndpointStatus.FAILED - logger.error(f"Endpoint {self.config.model} marked as failed") - - def record_success(self, latency_ms: float) -> None: - """Record successful request.""" - self.total_requests += 1 - self.last_latency_ms = latency_ms - # Exponential moving average for latency - alpha = 0.3 - if 
self.avg_latency_ms == 0: - self.avg_latency_ms = latency_ms - else: - self.avg_latency_ms = alpha * latency_ms + (1 - alpha) * self.avg_latency_ms - - def record_failure(self) -> None: - """Record failed request.""" - self.total_requests += 1 - self.total_failures += 1 - - @property - def health_score(self) -> float: - """Calculate health score (0-1) based on metrics.""" - if not self.is_available(): - return 0.0 - - # Base score from success rate - if self.total_requests > 0: - success_rate = 1 - (self.total_failures / self.total_requests) - else: - success_rate = 1.0 - - # Latency factor (faster = higher score) - # Normalize: 100ms = 1.0, 1000ms = 0.1 - if self.avg_latency_ms > 0: - latency_factor = min(1.0, 100 / self.avg_latency_ms) - else: - latency_factor = 1.0 - - # Availability factor (less concurrent = more available) - if self.config.max_concurrent > 0: - availability = 1 - (self.active_requests / self.config.max_concurrent) - else: - availability = 1.0 - - # Combined score with weights - return (success_rate * 0.4 + latency_factor * 0.3 + availability * 0.3) * self.config.weight - - -class RotationalEmbedder(BaseEmbedder): - """Embedder that load balances across multiple API endpoints. 
- - Features: - - Intelligent endpoint selection based on latency and health - - Automatic failover on rate limits (429) and server errors - - Cooldown management to respect rate limits - - Thread-safe concurrent request handling - - Args: - endpoints: List of endpoint configurations - strategy: Selection strategy (default: latency_aware) - default_cooldown: Default cooldown seconds for rate limits (default: 60) - max_retries: Maximum retry attempts across all endpoints (default: 3) - """ - - def __init__( - self, - endpoints: List[EndpointConfig], - strategy: SelectionStrategy = SelectionStrategy.LATENCY_AWARE, - default_cooldown: float = 60.0, - max_retries: int = 3, - ) -> None: - if not endpoints: - raise ValueError("At least one endpoint must be provided") - - self.strategy = strategy - self.default_cooldown = default_cooldown - self.max_retries = max_retries - - # Initialize endpoint states - self._endpoints: List[EndpointState] = [] - self._lock = threading.Lock() - self._round_robin_index = 0 - - # Create embedder instances for each endpoint - from .litellm_embedder import LiteLLMEmbedderWrapper - - for config in endpoints: - # Build kwargs for LiteLLMEmbedderWrapper - kwargs: Dict[str, Any] = {} - if config.api_key: - kwargs["api_key"] = config.api_key - if config.api_base: - kwargs["api_base"] = config.api_base - - try: - embedder = LiteLLMEmbedderWrapper(model=config.model, **kwargs) - state = EndpointState(config=config, embedder=embedder) - self._endpoints.append(state) - logger.info(f"Initialized endpoint: {config.model}") - except Exception as e: - logger.error(f"Failed to initialize endpoint {config.model}: {e}") - - if not self._endpoints: - raise ValueError("Failed to initialize any endpoints") - - # Cache embedding properties from first endpoint - self._embedding_dim = self._endpoints[0].embedder.embedding_dim - self._model_name = f"rotational({len(self._endpoints)} endpoints)" - self._max_tokens = self._endpoints[0].embedder.max_tokens - - 
@property - def embedding_dim(self) -> int: - """Return embedding dimensions.""" - return self._embedding_dim - - @property - def model_name(self) -> str: - """Return model name.""" - return self._model_name - - @property - def max_tokens(self) -> int: - """Return maximum token limit.""" - return self._max_tokens - - @property - def endpoint_count(self) -> int: - """Return number of configured endpoints.""" - return len(self._endpoints) - - @property - def available_endpoint_count(self) -> int: - """Return number of available endpoints.""" - return sum(1 for ep in self._endpoints if ep.is_available()) - - def get_endpoint_stats(self) -> List[Dict[str, Any]]: - """Get statistics for all endpoints.""" - stats = [] - for ep in self._endpoints: - stats.append({ - "model": ep.config.model, - "status": ep.status.value, - "total_requests": ep.total_requests, - "total_failures": ep.total_failures, - "avg_latency_ms": round(ep.avg_latency_ms, 2), - "health_score": round(ep.health_score, 3), - "active_requests": ep.active_requests, - }) - return stats - - def _select_endpoint(self) -> Optional[EndpointState]: - """Select best available endpoint based on strategy.""" - available = [ep for ep in self._endpoints if ep.is_available()] - - if not available: - return None - - if self.strategy == SelectionStrategy.ROUND_ROBIN: - with self._lock: - self._round_robin_index = (self._round_robin_index + 1) % len(available) - return available[self._round_robin_index] - - elif self.strategy == SelectionStrategy.LATENCY_AWARE: - # Sort by health score (descending) and pick top candidate - # Add small random factor to prevent thundering herd - scored = [(ep, ep.health_score + random.uniform(0, 0.1)) for ep in available] - scored.sort(key=lambda x: x[1], reverse=True) - return scored[0][0] - - elif self.strategy == SelectionStrategy.WEIGHTED_RANDOM: - # Weighted random selection based on health scores - scores = [ep.health_score for ep in available] - total = sum(scores) - if total == 0: - 
return random.choice(available) - - weights = [s / total for s in scores] - return random.choices(available, weights=weights, k=1)[0] - - return available[0] - - def _parse_retry_after(self, error: Exception) -> Optional[float]: - """Extract Retry-After value from error if available.""" - error_str = str(error) - - # Try to find Retry-After in error message - import re - match = re.search(r'[Rr]etry[- ][Aa]fter[:\s]+(\d+)', error_str) - if match: - return float(match.group(1)) - - return None - - def _is_rate_limit_error(self, error: Exception) -> bool: - """Check if error is a rate limit error.""" - error_str = str(error).lower() - return any(x in error_str for x in ["429", "rate limit", "too many requests"]) - - def _is_retryable_error(self, error: Exception) -> bool: - """Check if error is retryable (not auth/config error).""" - error_str = str(error).lower() - # Retryable errors - if any(x in error_str for x in ["429", "rate limit", "502", "503", "504", - "timeout", "connection", "service unavailable"]): - return True - # Non-retryable errors (auth, config) - if any(x in error_str for x in ["401", "403", "invalid", "authentication", - "unauthorized", "api key"]): - return False - # Default to retryable for unknown errors - return True - - def embed_to_numpy(self, texts: str | Iterable[str], **kwargs) -> np.ndarray: - """Embed texts using load-balanced endpoint selection. - - Args: - texts: Single text or iterable of texts to embed. - **kwargs: Additional arguments passed to underlying embedder. - - Returns: - numpy.ndarray: Array of shape (n_texts, embedding_dim) containing embeddings. - - Raises: - RuntimeError: If all endpoints fail after retries. 
- """ - if isinstance(texts, str): - texts = [texts] - else: - texts = list(texts) - - last_error: Optional[Exception] = None - tried_endpoints: set = set() - - for attempt in range(self.max_retries + 1): - endpoint = self._select_endpoint() - - if endpoint is None: - # All endpoints unavailable, wait for shortest cooldown - min_cooldown = min( - (ep.cooldown_until - time.time() for ep in self._endpoints - if ep.status == EndpointStatus.COOLING), - default=self.default_cooldown - ) - if min_cooldown > 0 and attempt < self.max_retries: - wait_time = min(min_cooldown, 30) # Cap wait at 30s - logger.warning(f"All endpoints busy, waiting {wait_time:.1f}s...") - time.sleep(wait_time) - continue - break - - # Track tried endpoints to avoid infinite loops - endpoint_id = id(endpoint) - if endpoint_id in tried_endpoints and len(tried_endpoints) >= len(self._endpoints): - # Already tried all endpoints - break - tried_endpoints.add(endpoint_id) - - # Acquire slot - with endpoint.lock: - endpoint.active_requests += 1 - - try: - start_time = time.time() - result = endpoint.embedder.embed_to_numpy(texts, **kwargs) - latency_ms = (time.time() - start_time) * 1000 - - # Record success - endpoint.record_success(latency_ms) - - return result - - except Exception as e: - last_error = e - endpoint.record_failure() - - if self._is_rate_limit_error(e): - # Rate limited - set cooldown - retry_after = self._parse_retry_after(e) or self.default_cooldown - endpoint.set_cooldown(retry_after) - logger.warning(f"Endpoint {endpoint.config.model} rate limited, " - f"cooling for {retry_after}s") - - elif not self._is_retryable_error(e): - # Permanent failure (auth error, etc.) 
- endpoint.mark_failed() - logger.error(f"Endpoint {endpoint.config.model} failed permanently: {e}") - - else: - # Temporary error - short cooldown - endpoint.set_cooldown(5.0) - logger.warning(f"Endpoint {endpoint.config.model} error: {e}") - - finally: - with endpoint.lock: - endpoint.active_requests -= 1 - - # All retries exhausted - available = self.available_endpoint_count - raise RuntimeError( - f"All embedding attempts failed after {self.max_retries + 1} tries. " - f"Available endpoints: {available}/{len(self._endpoints)}. " - f"Last error: {last_error}" - ) - - -def create_rotational_embedder( - endpoints_config: List[Dict[str, Any]], - strategy: str = "latency_aware", - default_cooldown: float = 60.0, -) -> RotationalEmbedder: - """Factory function to create RotationalEmbedder from config dicts. - - Args: - endpoints_config: List of endpoint configuration dicts with keys: - - model: Model identifier (required) - - api_key: API key (optional) - - api_base: API base URL (optional) - - weight: Request weight (optional, default 1.0) - - max_concurrent: Max concurrent requests (optional, default 4) - strategy: Selection strategy name (round_robin, latency_aware, weighted_random) - default_cooldown: Default cooldown seconds for rate limits - - Returns: - Configured RotationalEmbedder instance - - Example config: - endpoints_config = [ - {"model": "openai/text-embedding-3-small", "api_key": "sk-..."}, - {"model": "azure/my-embedding", "api_base": "https://...", "api_key": "..."}, - ] - """ - endpoints = [] - for cfg in endpoints_config: - endpoints.append(EndpointConfig( - model=cfg["model"], - api_key=cfg.get("api_key"), - api_base=cfg.get("api_base"), - weight=cfg.get("weight", 1.0), - max_concurrent=cfg.get("max_concurrent", 4), - )) - - strategy_enum = SelectionStrategy[strategy.upper()] - - return RotationalEmbedder( - endpoints=endpoints, - strategy=strategy_enum, - default_cooldown=default_cooldown, - ) diff --git 
a/codex-lens/src/codexlens/semantic/vector_store.py b/codex-lens/src/codexlens/semantic/vector_store.py deleted file mode 100644 index 1dad8fbe..00000000 --- a/codex-lens/src/codexlens/semantic/vector_store.py +++ /dev/null @@ -1,1278 +0,0 @@ -"""Vector storage and similarity search for semantic chunks. - -Optimized for high-performance similarity search using: -- HNSW index for O(log N) approximate nearest neighbor search (primary) -- Cached embedding matrix for batch operations (fallback) -- NumPy vectorized cosine similarity (fallback, 100x+ faster than loops) -- Lazy content loading (only fetch for top-k results) -""" - -from __future__ import annotations - -import json -import logging -import sys -import sqlite3 -import threading -from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple - -from codexlens.entities import SearchResult, SemanticChunk -from codexlens.errors import StorageError - -try: - import numpy as np - NUMPY_AVAILABLE = True -except ImportError: - np = None # type: ignore[assignment] - NUMPY_AVAILABLE = False - -# Try to import ANN index (optional hnswlib dependency) -try: - from codexlens.semantic.ann_index import ( - ANNIndex, - BinaryANNIndex, - create_ann_index, - HNSWLIB_AVAILABLE, - ) -except ImportError: - HNSWLIB_AVAILABLE = False - ANNIndex = None - BinaryANNIndex = None - create_ann_index = None - - -logger = logging.getLogger(__name__) - -# Epsilon used to guard against floating point precision edge cases (e.g., near-zero norms). -EPSILON = 1e-10 - -# SQLite INTEGER PRIMARY KEY uses signed 64-bit rowids. 
-SQLITE_INTEGER_MAX = (1 << 63) - 1 - - -def _validate_chunk_id_range(start_id: int, count: int) -> None: - """Validate that a batch insert can safely generate sequential chunk IDs.""" - if count <= 0: - return - - last_id = start_id + count - 1 - if last_id > sys.maxsize or last_id > SQLITE_INTEGER_MAX: - raise ValueError( - "Chunk ID range overflow: " - f"start_id={start_id}, count={count} would allocate up to {last_id}, " - f"exceeding limits (sys.maxsize={sys.maxsize}, sqlite_max={SQLITE_INTEGER_MAX}). " - "Consider cleaning up the index database or creating a new index database." - ) - - -def _validate_sql_placeholders(placeholders: str, expected_count: int) -> None: - """Validate the placeholder string used for a parameterized SQL IN clause.""" - expected = ",".join("?" * expected_count) - if placeholders != expected: - raise ValueError( - "Invalid SQL placeholders for IN clause. " - f"Expected {expected_count} '?' placeholders." - ) - - -def _cosine_similarity(a: List[float], b: List[float]) -> float: - """Compute cosine similarity between two vectors.""" - if not NUMPY_AVAILABLE: - raise ImportError("numpy required for vector operations") - - a_arr = np.array(a) - b_arr = np.array(b) - - norm_a = np.linalg.norm(a_arr) - norm_b = np.linalg.norm(b_arr) - - # Use epsilon tolerance to avoid division by (near-)zero due to floating point precision. - if norm_a < EPSILON or norm_b < EPSILON: - return 0.0 - - denom = norm_a * norm_b - if denom < EPSILON: - return 0.0 - - return float(np.dot(a_arr, b_arr) / denom) - - -class VectorStore: - """SQLite-based vector storage with HNSW-accelerated similarity search. 
- - Performance optimizations: - - HNSW index for O(log N) approximate nearest neighbor search - - Embedding matrix cached in memory for batch similarity computation (fallback) - - NumPy vectorized operations instead of Python loops (fallback) - - Lazy content loading - only fetch full content for top-k results - - Thread-safe cache invalidation - - Bulk insert mode for efficient batch operations - """ - - # Default embedding dimension (used when creating new index) - DEFAULT_DIM = 768 - - def __init__(self, db_path: str | Path) -> None: - if not NUMPY_AVAILABLE: - raise ImportError( - "Semantic search dependencies not available. " - "Install with: pip install codexlens[semantic]" - ) - - self.db_path = Path(db_path) - self.db_path.parent.mkdir(parents=True, exist_ok=True) - - # Embedding cache for fast similarity search (fallback) - self._cache_lock = threading.RLock() - self._embedding_matrix: Optional[np.ndarray] = None - self._embedding_norms: Optional[np.ndarray] = None - self._chunk_ids: Optional[List[int]] = None - self._cache_version: int = 0 - - # ANN index for O(log N) search - self._ann_index: Optional[ANNIndex] = None - self._ann_dim: Optional[int] = None - self._ann_write_lock = threading.Lock() # Protects ANN index modifications - - # Bulk insert mode tracking - self._bulk_insert_mode: bool = False - self._bulk_insert_ids: List[int] = [] - self._bulk_insert_embeddings: List[np.ndarray] = [] - - self._init_schema() - self._init_ann_index() - - def _init_schema(self) -> None: - """Initialize vector storage schema.""" - with sqlite3.connect(self.db_path) as conn: - # Enable memory mapping for faster reads - conn.execute("PRAGMA mmap_size = 30000000000") # 30GB limit - conn.execute(""" - CREATE TABLE IF NOT EXISTS semantic_chunks ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - file_path TEXT NOT NULL, - content TEXT NOT NULL, - embedding BLOB NOT NULL, - metadata TEXT, - category TEXT DEFAULT 'code', - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP - ) - """) 
- conn.execute(""" - CREATE INDEX IF NOT EXISTS idx_chunks_file - ON semantic_chunks(file_path) - """) - conn.execute(""" - CREATE INDEX IF NOT EXISTS idx_chunks_category - ON semantic_chunks(category) - """) - # Model configuration table - tracks which model generated the embeddings - conn.execute(""" - CREATE TABLE IF NOT EXISTS embeddings_config ( - id INTEGER PRIMARY KEY CHECK (id = 1), - model_profile TEXT NOT NULL, - model_name TEXT NOT NULL, - embedding_dim INTEGER NOT NULL, - backend TEXT NOT NULL DEFAULT 'fastembed', - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP - ) - """) - - # Migration: Add backend column to existing tables - self._migrate_backend_column(conn) - # Migration: Add category column - self._migrate_category_column(conn) - - conn.commit() - - def _migrate_backend_column(self, conn: sqlite3.Connection) -> None: - """Add backend column to existing embeddings_config table if not present. - - Args: - conn: Active SQLite connection - """ - # Check if backend column exists - cursor = conn.execute("PRAGMA table_info(embeddings_config)") - columns = [row[1] for row in cursor.fetchall()] - - if 'backend' not in columns: - logger.info("Migrating embeddings_config table: adding backend column") - conn.execute(""" - ALTER TABLE embeddings_config - ADD COLUMN backend TEXT NOT NULL DEFAULT 'fastembed' - """) - - def _migrate_category_column(self, conn: sqlite3.Connection) -> None: - """Add category column to existing semantic_chunks table if not present. 
- - Args: - conn: Active SQLite connection - """ - # Check if category column exists - cursor = conn.execute("PRAGMA table_info(semantic_chunks)") - columns = [row[1] for row in cursor.fetchall()] - - if 'category' not in columns: - logger.info("Migrating semantic_chunks table: adding category column") - conn.execute(""" - ALTER TABLE semantic_chunks - ADD COLUMN category TEXT DEFAULT 'code' - """) - # Create index for fast category filtering - conn.execute(""" - CREATE INDEX IF NOT EXISTS idx_chunks_category - ON semantic_chunks(category) - """) - - def _init_ann_index(self) -> None: - """Initialize ANN index (lazy loading from existing data).""" - if not HNSWLIB_AVAILABLE: - logger.debug("hnswlib not available, using brute-force search") - return - - # Try to detect embedding dimension from existing data - dim = self._detect_embedding_dim() - if dim is None: - # No data yet, will initialize on first add - logger.debug("No embeddings found, ANN index will be created on first add") - return - - self._ann_dim = dim - - try: - self._ann_index = ANNIndex(self.db_path, dim) - if self._ann_index.load(): - logger.debug( - "Loaded ANN index with %d vectors", self._ann_index.count() - ) - else: - # Index file doesn't exist, try to build from SQLite data - logger.debug("ANN index file not found, rebuilding from SQLite") - self._rebuild_ann_index_internal() - except Exception as e: - logger.warning("Failed to initialize ANN index: %s", e) - self._ann_index = None - - def _detect_embedding_dim(self) -> Optional[int]: - """Detect embedding dimension from existing data.""" - with sqlite3.connect(self.db_path) as conn: - row = conn.execute( - "SELECT embedding FROM semantic_chunks LIMIT 1" - ).fetchone() - if row and row[0]: - # Embedding is stored as float32 blob - blob = row[0] - return len(blob) // np.dtype(np.float32).itemsize - return None - - @property - def dimension(self) -> Optional[int]: - """Return the dimension of embeddings in the store. 
- - Returns: - Embedding dimension if available, None if store is empty. - """ - if self._ann_dim is not None: - return self._ann_dim - self._ann_dim = self._detect_embedding_dim() - return self._ann_dim - - def _rebuild_ann_index_internal(self) -> int: - """Internal method to rebuild ANN index from SQLite data.""" - if self._ann_index is None: - return 0 - - with sqlite3.connect(self.db_path) as conn: - conn.execute("PRAGMA mmap_size = 30000000000") - rows = conn.execute( - "SELECT id, embedding FROM semantic_chunks" - ).fetchall() - - if not rows: - return 0 - - # Extract IDs and embeddings - ids = [r[0] for r in rows] - embeddings = np.vstack([ - np.frombuffer(r[1], dtype=np.float32) for r in rows - ]) - - # Add to ANN index - self._ann_index.add_vectors(ids, embeddings) - self._ann_index.save() - - logger.info("Rebuilt ANN index with %d vectors", len(ids)) - return len(ids) - - def rebuild_ann_index(self) -> int: - """Rebuild HNSW index from all chunks in SQLite. - - Use this method to: - - Migrate existing data to use ANN search - - Repair corrupted index - - Reclaim space after many deletions - - Returns: - Number of vectors indexed. 
- """ - if not HNSWLIB_AVAILABLE: - logger.warning("hnswlib not available, cannot rebuild ANN index") - return 0 - - # Detect dimension - dim = self._detect_embedding_dim() - if dim is None: - logger.warning("No embeddings found, cannot rebuild ANN index") - return 0 - - self._ann_dim = dim - - # Create new index - try: - self._ann_index = ANNIndex(self.db_path, dim) - return self._rebuild_ann_index_internal() - except Exception as e: - logger.error("Failed to rebuild ANN index: %s", e) - self._ann_index = None - return 0 - - def _invalidate_cache(self) -> None: - """Invalidate the embedding cache (thread-safe).""" - with self._cache_lock: - self._embedding_matrix = None - self._embedding_norms = None - self._chunk_ids = None - self._cache_version += 1 - - def _refresh_cache(self) -> bool: - """Load embeddings into numpy matrix for fast similarity search. - - Returns: - True if cache was refreshed successfully, False if no data. - """ - with self._cache_lock: - with sqlite3.connect(self.db_path) as conn: - conn.execute("PRAGMA mmap_size = 30000000000") - rows = conn.execute( - "SELECT id, embedding FROM semantic_chunks" - ).fetchall() - - if not rows: - self._embedding_matrix = None - self._embedding_norms = None - self._chunk_ids = None - return False - - # Extract IDs and embeddings - self._chunk_ids = [r[0] for r in rows] - - # Bulk convert binary blobs to numpy matrix - embeddings = [ - np.frombuffer(r[1], dtype=np.float32) for r in rows - ] - self._embedding_matrix = np.vstack(embeddings) - - # Pre-compute norms for faster similarity calculation - self._embedding_norms = np.linalg.norm( - self._embedding_matrix, axis=1, keepdims=True - ) - # Avoid division by zero - self._embedding_norms = np.where( - self._embedding_norms == 0, EPSILON, self._embedding_norms - ) - - return True - - def _ensure_ann_index(self, dim: int) -> bool: - """Ensure ANN index is initialized with correct dimension. - - This method is thread-safe and uses double-checked locking. 
- - Args: - dim: Embedding dimension - - Returns: - True if ANN index is ready, False otherwise - """ - if not HNSWLIB_AVAILABLE: - return False - - # Fast path: index already initialized (no lock needed) - if self._ann_index is not None: - return True - - # Slow path: acquire lock for initialization - with self._ann_write_lock: - # Double-check after acquiring lock - if self._ann_index is not None: - return True - - try: - self._ann_dim = dim - self._ann_index = ANNIndex(self.db_path, dim) - self._ann_index.load() # Try to load existing - return True - except Exception as e: - logger.warning("Failed to initialize ANN index: %s", e) - self._ann_index = None - return False - - def add_chunk( - self, chunk: SemanticChunk, file_path: str, category: str = "code" - ) -> int: - """Add a single chunk with its embedding. - - Args: - chunk: SemanticChunk with embedding - file_path: Path to the source file - category: File category ('code' or 'doc'), default 'code' - - Returns: - The inserted chunk ID. - """ - if chunk.embedding is None: - raise ValueError("Chunk must have embedding before adding to store") - - embedding_arr = np.array(chunk.embedding, dtype=np.float32) - embedding_blob = embedding_arr.tobytes() - metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None - - with sqlite3.connect(self.db_path) as conn: - cursor = conn.execute( - """ - INSERT INTO semantic_chunks (file_path, content, embedding, metadata, category) - VALUES (?, ?, ?, ?, ?) 
- """, - (file_path, chunk.content, embedding_blob, metadata_json, category) - ) - conn.commit() - chunk_id = cursor.lastrowid or 0 - - # Add to ANN index - if self._ensure_ann_index(len(chunk.embedding)): - with self._ann_write_lock: - try: - self._ann_index.add_vectors([chunk_id], embedding_arr.reshape(1, -1)) - self._ann_index.save() - except Exception as e: - logger.warning("Failed to add to ANN index: %s", e) - - # Invalidate cache after modification - self._invalidate_cache() - return chunk_id - - def add_chunks( - self, chunks: List[SemanticChunk], file_path: str, category: str = "code" - ) -> List[int]: - """Add multiple chunks with embeddings (batch insert). - - Args: - chunks: List of SemanticChunk objects with embeddings - file_path: Path to the source file - category: File category ('code' or 'doc'), default 'code' - - Returns: - List of inserted chunk IDs. - """ - if not chunks: - return [] - - # Prepare batch data - batch_data = [] - embeddings_list = [] - for chunk in chunks: - if chunk.embedding is None: - raise ValueError("All chunks must have embeddings") - embedding_arr = np.array(chunk.embedding, dtype=np.float32) - embedding_blob = embedding_arr.tobytes() - metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None - batch_data.append((file_path, chunk.content, embedding_blob, metadata_json, category)) - embeddings_list.append(embedding_arr) - - # Batch insert to SQLite - with sqlite3.connect(self.db_path) as conn: - # Get starting ID before insert - row = conn.execute("SELECT MAX(id) FROM semantic_chunks").fetchone() - start_id = (row[0] or 0) + 1 - - conn.executemany( - """ - INSERT INTO semantic_chunks (file_path, content, embedding, metadata, category) - VALUES (?, ?, ?, ?, ?) 
- """, - batch_data - ) - conn.commit() - # Calculate inserted IDs based on starting ID - ids = list(range(start_id, start_id + len(chunks))) - - # Add to ANN index - if embeddings_list and self._ensure_ann_index(len(embeddings_list[0])): - with self._ann_write_lock: - try: - embeddings_matrix = np.vstack(embeddings_list) - self._ann_index.add_vectors(ids, embeddings_matrix) - self._ann_index.save() - except Exception as e: - logger.warning("Failed to add batch to ANN index: %s", e) - - # Invalidate cache after modification - self._invalidate_cache() - return ids - - def add_chunks_batch( - self, - chunks_with_paths: List[Tuple[SemanticChunk, str]], - update_ann: bool = True, - auto_save_ann: bool = True, - categories: Optional[List[str]] = None, - ) -> List[int]: - """Batch insert chunks from multiple files in a single transaction. - - This method is optimized for bulk operations during index generation. - - Args: - chunks_with_paths: List of (chunk, file_path) tuples - update_ann: If True, update ANN index with new vectors (default: True) - auto_save_ann: If True, save ANN index after update (default: True). - Set to False for bulk inserts to reduce I/O overhead. - categories: Optional list of categories per chunk. If None, defaults to 'code'. - If provided, must match length of chunks_with_paths. 
- - Returns: - List of inserted chunk IDs - """ - if not chunks_with_paths: - return [] - - batch_size = len(chunks_with_paths) - - # Validate categories if provided - if categories is not None and len(categories) != batch_size: - raise ValueError( - f"categories length ({len(categories)}) must match " - f"chunks_with_paths length ({batch_size})" - ) - - # Prepare batch data - batch_data = [] - embeddings_list = [] - for i, (chunk, file_path) in enumerate(chunks_with_paths): - if chunk.embedding is None: - raise ValueError("All chunks must have embeddings") - # Optimize: avoid repeated np.array() if already numpy - if isinstance(chunk.embedding, np.ndarray): - embedding_arr = chunk.embedding.astype(np.float32) - else: - embedding_arr = np.array(chunk.embedding, dtype=np.float32) - embedding_blob = embedding_arr.tobytes() - metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None - category = categories[i] if categories else "code" - batch_data.append((file_path, chunk.content, embedding_blob, metadata_json, category)) - embeddings_list.append(embedding_arr) - - # Batch insert to SQLite in single transaction - with sqlite3.connect(self.db_path) as conn: - # Get starting ID before insert - row = conn.execute("SELECT MAX(id) FROM semantic_chunks").fetchone() - start_id = (row[0] or 0) + 1 - - _validate_chunk_id_range(start_id, batch_size) - - conn.executemany( - """ - INSERT INTO semantic_chunks (file_path, content, embedding, metadata, category) - VALUES (?, ?, ?, ?, ?) 
- """, - batch_data - ) - conn.commit() - # Calculate inserted IDs based on starting ID - ids = list(range(start_id, start_id + batch_size)) - - # Handle ANN index updates - if embeddings_list and update_ann and self._ensure_ann_index(len(embeddings_list[0])): - with self._ann_write_lock: - # In bulk insert mode, accumulate for later batch update - if self._bulk_insert_mode: - self._bulk_insert_ids.extend(ids) - self._bulk_insert_embeddings.extend(embeddings_list) - else: - # Normal mode: update immediately - try: - embeddings_matrix = np.vstack(embeddings_list) - self._ann_index.add_vectors(ids, embeddings_matrix) - if auto_save_ann: - self._ann_index.save() - except Exception as e: - logger.warning("Failed to add batch to ANN index: %s", e) - - # Invalidate cache after modification - self._invalidate_cache() - return ids - - def add_chunks_batch_numpy( - self, - chunks_with_paths: List[Tuple[SemanticChunk, str]], - embeddings_matrix: np.ndarray, - update_ann: bool = True, - auto_save_ann: bool = True, - categories: Optional[List[str]] = None, - ) -> List[int]: - """Batch insert chunks with pre-computed numpy embeddings matrix. - - This method accepts embeddings as a numpy matrix to avoid list->array conversions. - Useful when embeddings are already in numpy format from batch encoding. - - Args: - chunks_with_paths: List of (chunk, file_path) tuples (embeddings can be None) - embeddings_matrix: Pre-computed embeddings as (N, D) numpy array - update_ann: If True, update ANN index with new vectors (default: True) - auto_save_ann: If True, save ANN index after update (default: True) - categories: Optional list of categories per chunk. If None, defaults to 'code'. 
- - Returns: - List of inserted chunk IDs - """ - if not chunks_with_paths: - return [] - - batch_size = len(chunks_with_paths) - - if len(chunks_with_paths) != embeddings_matrix.shape[0]: - raise ValueError( - f"Mismatch: {len(chunks_with_paths)} chunks but " - f"{embeddings_matrix.shape[0]} embeddings" - ) - - # Validate categories if provided - if categories is not None and len(categories) != batch_size: - raise ValueError( - f"categories length ({len(categories)}) must match " - f"chunks_with_paths length ({batch_size})" - ) - - # Ensure float32 format - embeddings_matrix = embeddings_matrix.astype(np.float32) - - # Prepare batch data - batch_data = [] - for i, (chunk, file_path) in enumerate(chunks_with_paths): - embedding_arr = embeddings_matrix[i] - embedding_blob = embedding_arr.tobytes() - metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None - category = categories[i] if categories else "code" - batch_data.append((file_path, chunk.content, embedding_blob, metadata_json, category)) - - # Batch insert to SQLite in single transaction - with sqlite3.connect(self.db_path) as conn: - # Get starting ID before insert - row = conn.execute("SELECT MAX(id) FROM semantic_chunks").fetchone() - start_id = (row[0] or 0) + 1 - - _validate_chunk_id_range(start_id, batch_size) - - conn.executemany( - """ - INSERT INTO semantic_chunks (file_path, content, embedding, metadata, category) - VALUES (?, ?, ?, ?, ?) 
- """, - batch_data - ) - conn.commit() - # Calculate inserted IDs based on starting ID - ids = list(range(start_id, start_id + batch_size)) - - # Handle ANN index updates - if update_ann and self._ensure_ann_index(embeddings_matrix.shape[1]): - with self._ann_write_lock: - # In bulk insert mode, accumulate for later batch update - if self._bulk_insert_mode: - self._bulk_insert_ids.extend(ids) - # Split matrix into individual arrays for accumulation - self._bulk_insert_embeddings.extend([embeddings_matrix[i] for i in range(len(ids))]) - else: - # Normal mode: update immediately - try: - self._ann_index.add_vectors(ids, embeddings_matrix) - if auto_save_ann: - self._ann_index.save() - except Exception as e: - logger.warning("Failed to add batch to ANN index: %s", e) - - # Invalidate cache after modification - self._invalidate_cache() - return ids - - def begin_bulk_insert(self) -> None: - """Begin bulk insert mode - disable ANN auto-update for better performance. - - Usage: - store.begin_bulk_insert() - try: - for batch in batches: - store.add_chunks_batch(batch, auto_save_ann=False) - finally: - store.end_bulk_insert() - - Or use context manager: - with store.bulk_insert(): - for batch in batches: - store.add_chunks_batch(batch) - """ - with self._ann_write_lock: - self._bulk_insert_mode = True - self._bulk_insert_ids.clear() - self._bulk_insert_embeddings.clear() - logger.debug("Entered bulk insert mode") - - def end_bulk_insert(self) -> None: - """End bulk insert mode and rebuild ANN index from accumulated data. - - This method should be called after all bulk inserts are complete to - update the ANN index in a single batch operation. 
- """ - with self._ann_write_lock: - if not self._bulk_insert_mode: - logger.warning("end_bulk_insert called but not in bulk insert mode") - return - - self._bulk_insert_mode = False - bulk_ids = list(self._bulk_insert_ids) - bulk_embeddings = list(self._bulk_insert_embeddings) - self._bulk_insert_ids.clear() - self._bulk_insert_embeddings.clear() - - # Update ANN index with accumulated data. - if bulk_ids and bulk_embeddings: - if self._ensure_ann_index(len(bulk_embeddings[0])): - with self._ann_write_lock: - try: - embeddings_matrix = np.vstack(bulk_embeddings) - self._ann_index.add_vectors(bulk_ids, embeddings_matrix) - self._ann_index.save() - logger.info( - "Bulk insert complete: added %d vectors to ANN index", - len(bulk_ids), - ) - except Exception as e: - logger.error("Failed to update ANN index after bulk insert: %s", e) - - logger.debug("Exited bulk insert mode") - - class BulkInsertContext: - """Context manager for bulk insert operations.""" - - def __init__(self, store: "VectorStore") -> None: - self.store = store - - def __enter__(self) -> "VectorStore": - self.store.begin_bulk_insert() - return self.store - - def __exit__(self, exc_type, exc_val, exc_tb) -> None: - self.store.end_bulk_insert() - - def bulk_insert(self) -> "VectorStore.BulkInsertContext": - """Return a context manager for bulk insert operations. - - Usage: - with store.bulk_insert(): - for batch in batches: - store.add_chunks_batch(batch) - """ - return self.BulkInsertContext(self) - - def delete_file_chunks(self, file_path: str) -> int: - """Delete all chunks for a file. - - Returns: - Number of deleted chunks. 
- """ - # Get chunk IDs before deletion (for ANN index) - chunk_ids_to_delete = [] - if self._ann_index is not None: - with sqlite3.connect(self.db_path) as conn: - rows = conn.execute( - "SELECT id FROM semantic_chunks WHERE file_path = ?", - (file_path,) - ).fetchall() - chunk_ids_to_delete = [r[0] for r in rows] - - # Delete from SQLite - with sqlite3.connect(self.db_path) as conn: - cursor = conn.execute( - "DELETE FROM semantic_chunks WHERE file_path = ?", - (file_path,) - ) - conn.commit() - deleted = cursor.rowcount - - # Remove from ANN index - if deleted > 0 and self._ann_index is not None and chunk_ids_to_delete: - with self._ann_write_lock: - try: - self._ann_index.remove_vectors(chunk_ids_to_delete) - self._ann_index.save() - except Exception as e: - logger.warning("Failed to remove from ANN index: %s", e) - - if deleted > 0: - self._invalidate_cache() - return deleted - - def search_similar( - self, - query_embedding: List[float], - top_k: int = 10, - min_score: float = 0.0, - return_full_content: bool = True, - category: Optional[str] = None, - ) -> List[SearchResult]: - """Find chunks most similar to query embedding. - - Uses HNSW index for O(log N) search when available, falls back to - brute-force NumPy search otherwise. - - Args: - query_embedding: Query vector. - top_k: Maximum results to return. - min_score: Minimum cosine similarity score in [0.0, 1.0]. - return_full_content: If True, return full code block content. - category: Optional category filter ('code' or 'doc'). If None, returns all. - - Returns: - List of SearchResult ordered by similarity (highest first). - """ - query_vec = np.array(query_embedding, dtype=np.float32) - - if not 0.0 <= min_score <= 1.0: - raise ValueError( - f"Invalid min_score: {min_score}. Must be within [0.0, 1.0] for cosine similarity." 
- ) - - # Try HNSW search first (O(log N)) - if ( - HNSWLIB_AVAILABLE - and self._ann_index is not None - and self._ann_index.is_loaded - and self._ann_index.count() > 0 - ): - try: - return self._search_with_ann( - query_vec, top_k, min_score, return_full_content, category - ) - except Exception as e: - logger.warning("ANN search failed, falling back to brute-force: %s", e) - - # Fallback to brute-force search (O(N)) - return self._search_brute_force( - query_vec, top_k, min_score, return_full_content, category - ) - - def _search_with_ann( - self, - query_vec: np.ndarray, - top_k: int, - min_score: float, - return_full_content: bool, - category: Optional[str] = None, - ) -> List[SearchResult]: - """Search using HNSW index (O(log N)). - - Args: - query_vec: Query vector as numpy array - top_k: Maximum results to return - min_score: Minimum cosine similarity score in [0.0, 1.0] - return_full_content: If True, return full code block content - category: Optional category filter ('code' or 'doc') - - Returns: - List of SearchResult ordered by similarity (highest first) - """ - # Limit top_k to available vectors to prevent hnswlib error - ann_count = self._ann_index.count() - # When category filtering, fetch more candidates to compensate for filtering - fetch_k = top_k * 3 if category else top_k - effective_top_k = min(fetch_k, ann_count) if ann_count > 0 else 0 - - if effective_top_k == 0: - return [] - - # HNSW search returns (ids, distances) - # For cosine space: distance = 1 - similarity - ids, distances = self._ann_index.search(query_vec, effective_top_k) - - if ids is None or distances is None: - logger.debug( - "ANN search returned null results (ids=%s, distances=%s)", - ids, - distances, - ) - return [] - - if len(ids) == 0 or len(distances) == 0: - logger.debug( - "ANN search returned empty results (ids=%s, distances=%s)", - ids, - distances, - ) - return [] - - if len(ids) != len(distances): - logger.warning( - "ANN search returned mismatched result lengths 
(%d ids, %d distances)", - len(ids), - len(distances), - ) - return [] - - # Convert distances to similarity scores - scores = [1.0 - d for d in distances] - - # Filter by min_score - filtered = [ - (chunk_id, score) - for chunk_id, score in zip(ids, scores) - if score >= min_score - ] - - if not filtered: - return [] - - top_ids = [f[0] for f in filtered] - top_scores = [f[1] for f in filtered] - - # Fetch content from SQLite with category filtering - results = self._fetch_results_by_ids( - top_ids, top_scores, return_full_content, category - ) - # Apply final limit after category filtering - return results[:top_k] - - def _search_brute_force( - self, - query_vec: np.ndarray, - top_k: int, - min_score: float, - return_full_content: bool, - category: Optional[str] = None, - ) -> List[SearchResult]: - """Brute-force search using NumPy (O(N) fallback). - - Args: - query_vec: Query vector as numpy array - top_k: Maximum results to return - min_score: Minimum cosine similarity score in [0.0, 1.0] - return_full_content: If True, return full code block content - category: Optional category filter ('code' or 'doc') - - Returns: - List of SearchResult ordered by similarity (highest first) - """ - logger.warning( - "Using brute-force vector search (hnswlib not available). " - "This may cause high memory usage for large indexes. 
" - "Install hnswlib for better performance: pip install hnswlib" - ) - - with self._cache_lock: - # Refresh cache if needed - if self._embedding_matrix is None: - if not self._refresh_cache(): - return [] # No data - - # Vectorized cosine similarity - query_vec = query_vec.reshape(1, -1) - query_norm = np.linalg.norm(query_vec) - if query_norm == 0: - return [] - - # Compute all similarities at once: (N,) scores - # similarity = (A @ B.T) / (||A|| * ||B||) - dot_products = np.dot(self._embedding_matrix, query_vec.T).flatten() - scores = dot_products / (self._embedding_norms.flatten() * query_norm) - - # Filter by min_score and get top-k indices - valid_mask = scores >= min_score - valid_indices = np.where(valid_mask)[0] - - if len(valid_indices) == 0: - return [] - - # When category filtering, fetch more candidates to compensate for filtering - fetch_k = top_k * 3 if category else top_k - - # Sort by score descending and take top candidates - valid_scores = scores[valid_indices] - sorted_order = np.argsort(valid_scores)[::-1][:fetch_k] - top_indices = valid_indices[sorted_order] - top_scores = valid_scores[sorted_order] - - # Get chunk IDs for top results - top_ids = [self._chunk_ids[i] for i in top_indices] - - # Fetch content only for top-k results (lazy loading) with category filtering - results = self._fetch_results_by_ids( - top_ids, top_scores.tolist(), return_full_content, category - ) - # Apply final limit after category filtering - return results[:top_k] - - def _fetch_results_by_ids( - self, - chunk_ids: List[int], - scores: List[float], - return_full_content: bool, - category: Optional[str] = None, - ) -> List[SearchResult]: - """Fetch full result data for specific chunk IDs. - - Args: - chunk_ids: List of chunk IDs to fetch. - scores: Corresponding similarity scores. - return_full_content: Whether to include full content. - category: Optional category filter ('code' or 'doc'). - - Returns: - List of SearchResult objects. 
- """ - if not chunk_ids: - return [] - - # Build parameterized query for IN clause - placeholders = ",".join("?" * len(chunk_ids)) - _validate_sql_placeholders(placeholders, len(chunk_ids)) - - # SQL injection prevention: - # - Only a validated placeholders string (commas + '?') is interpolated into the query. - # - User-provided values are passed separately via sqlite3 parameters. - # - Category filter is added as a separate parameter - if category: - query = """ - SELECT id, file_path, content, metadata - FROM semantic_chunks - WHERE id IN ({placeholders}) AND category = ? - """.format(placeholders=placeholders) - params = list(chunk_ids) + [category] - else: - query = """ - SELECT id, file_path, content, metadata - FROM semantic_chunks - WHERE id IN ({placeholders}) - """.format(placeholders=placeholders) - params = chunk_ids - - with sqlite3.connect(self.db_path) as conn: - conn.execute("PRAGMA mmap_size = 30000000000") - rows = conn.execute(query, params).fetchall() - - # Build ID -> row mapping - id_to_row = {r[0]: r for r in rows} - - results = [] - for chunk_id, score in zip(chunk_ids, scores): - row = id_to_row.get(chunk_id) - if not row: - continue - - _, file_path, content, metadata_json = row - metadata = json.loads(metadata_json) if metadata_json else {} - - # Build excerpt (short preview) - excerpt = content[:200] + "..." 
if len(content) > 200 else content - - # Extract symbol information from metadata - symbol_name = metadata.get("symbol_name") - symbol_kind = metadata.get("symbol_kind") - start_line = metadata.get("start_line") - end_line = metadata.get("end_line") - - # Build Symbol object if we have symbol info - symbol = None - if symbol_name and symbol_kind and start_line and end_line: - try: - from codexlens.entities import Symbol - symbol = Symbol( - name=symbol_name, - kind=symbol_kind, - range=(start_line, end_line) - ) - except Exception: - pass - - results.append(SearchResult( - path=file_path, - score=score, - excerpt=excerpt, - content=content if return_full_content else None, - symbol=symbol, - metadata=metadata, - start_line=start_line, - end_line=end_line, - symbol_name=symbol_name, - symbol_kind=symbol_kind, - )) - - return results - - def count_chunks(self) -> int: - """Count total chunks in store.""" - with sqlite3.connect(self.db_path) as conn: - row = conn.execute("SELECT COUNT(*) FROM semantic_chunks").fetchone() - return row[0] if row else 0 - - def get_all_chunks(self) -> List[SemanticChunk]: - """Get all chunks from the store. - - Returns: - List of SemanticChunk objects with id and content. 
- """ - with sqlite3.connect(self.db_path) as conn: - conn.row_factory = sqlite3.Row - rows = conn.execute( - "SELECT id, file_path, content, metadata FROM semantic_chunks" - ).fetchall() - - chunks = [] - for row in rows: - chunks.append(SemanticChunk( - id=row["id"], - content=row["content"], - file_path=row["file_path"], - metadata=json.loads(row["metadata"]) if row["metadata"] else None, - )) - return chunks - - def clear_cache(self) -> None: - """Manually clear the embedding cache.""" - self._invalidate_cache() - - @property - def ann_available(self) -> bool: - """Check if ANN index is available and ready.""" - return ( - HNSWLIB_AVAILABLE - and self._ann_index is not None - and self._ann_index.is_loaded - ) - - @property - def ann_count(self) -> int: - """Get number of vectors in ANN index.""" - if self._ann_index is not None: - return self._ann_index.count() - return 0 - - def get_model_config(self) -> Optional[Dict[str, Any]]: - """Get the model configuration used for embeddings in this store. - - Returns: - Dictionary with model_profile, model_name, embedding_dim, backend, or None if not set. - """ - with sqlite3.connect(self.db_path) as conn: - row = conn.execute( - "SELECT model_profile, model_name, embedding_dim, backend, created_at, updated_at " - "FROM embeddings_config WHERE id = 1" - ).fetchone() - if row: - return { - "model_profile": row[0], - "model_name": row[1], - "embedding_dim": row[2], - "backend": row[3], - "created_at": row[4], - "updated_at": row[5], - } - return None - - def set_model_config( - self, model_profile: str, model_name: str, embedding_dim: int, backend: str = 'fastembed' - ) -> None: - """Set the model configuration for embeddings in this store. - - This should be called when generating new embeddings. If a different - model was previously used, this will update the configuration. - - Args: - model_profile: Model profile name (fast, code, minilm, etc.) 
- model_name: Full model name (e.g., jinaai/jina-embeddings-v2-base-code) - embedding_dim: Embedding dimension (e.g., 768) - backend: Backend used for embeddings (fastembed or litellm, default: fastembed) - """ - with sqlite3.connect(self.db_path) as conn: - conn.execute( - """ - INSERT INTO embeddings_config (id, model_profile, model_name, embedding_dim, backend) - VALUES (1, ?, ?, ?, ?) - ON CONFLICT(id) DO UPDATE SET - model_profile = excluded.model_profile, - model_name = excluded.model_name, - embedding_dim = excluded.embedding_dim, - backend = excluded.backend, - updated_at = CURRENT_TIMESTAMP - """, - (model_profile, model_name, embedding_dim, backend) - ) - conn.commit() - - def check_model_compatibility( - self, model_profile: str, model_name: str, embedding_dim: int - ) -> Tuple[bool, Optional[str]]: - """Check if the given model is compatible with existing embeddings. - - Args: - model_profile: Model profile to check - model_name: Model name to check - embedding_dim: Embedding dimension to check - - Returns: - Tuple of (is_compatible, warning_message). - is_compatible is True if no existing config or configs match. - warning_message is a user-friendly message if incompatible. - """ - existing = self.get_model_config() - if existing is None: - return True, None - - # Check dimension first (most critical) - if existing["embedding_dim"] != embedding_dim: - return False, ( - f"Dimension mismatch: existing embeddings use {existing['embedding_dim']}d " - f"({existing['model_profile']}), but requested model uses {embedding_dim}d " - f"({model_profile}). Use --force to regenerate all embeddings." - ) - - # Check model (different models with same dimension may have different semantic spaces) - if existing["model_profile"] != model_profile: - return False, ( - f"Model mismatch: existing embeddings use '{existing['model_profile']}' " - f"({existing['model_name']}), but requested '{model_profile}' " - f"({model_name}). Use --force to regenerate all embeddings." 
- ) - - return True, None - - def close(self) -> None: - """Close the vector store and release resources. - - This ensures SQLite connections are closed and ANN index is cleared, - allowing temporary files to be deleted on Windows. - """ - with self._cache_lock: - self._embedding_matrix = None - self._embedding_norms = None - self._chunk_ids = None - - with self._ann_write_lock: - self._ann_index = None - - def __enter__(self) -> "VectorStore": - """Context manager entry.""" - return self - - def __exit__(self, exc_type, exc_val, exc_tb) -> None: - """Context manager exit - close resources.""" - self.close() diff --git a/codex-lens/src/codexlens/storage/__init__.py b/codex-lens/src/codexlens/storage/__init__.py deleted file mode 100644 index 815bc961..00000000 --- a/codex-lens/src/codexlens/storage/__init__.py +++ /dev/null @@ -1,32 +0,0 @@ -"""Storage backends for CodexLens.""" - -from __future__ import annotations - -from .sqlite_store import SQLiteStore -from .path_mapper import PathMapper -from .registry import RegistryStore, ProjectInfo, DirMapping -from .dir_index import DirIndexStore, SubdirLink, FileEntry -from .index_tree import IndexTreeBuilder, BuildResult, DirBuildResult -from .vector_meta_store import VectorMetadataStore - -__all__ = [ - # Legacy (workspace-local) - "SQLiteStore", - # Path mapping - "PathMapper", - # Global registry - "RegistryStore", - "ProjectInfo", - "DirMapping", - # Directory index - "DirIndexStore", - "SubdirLink", - "FileEntry", - # Tree builder - "IndexTreeBuilder", - "BuildResult", - "DirBuildResult", - # Vector metadata - "VectorMetadataStore", -] - diff --git a/codex-lens/src/codexlens/storage/deepwiki_models.py b/codex-lens/src/codexlens/storage/deepwiki_models.py deleted file mode 100644 index e86665c8..00000000 --- a/codex-lens/src/codexlens/storage/deepwiki_models.py +++ /dev/null @@ -1,120 +0,0 @@ -"""Pydantic models for DeepWiki index storage. 
- -DeepWiki stores mappings between source files, symbols, and generated documentation -for the DeepWiki documentation generation system. -""" - -from __future__ import annotations - -from datetime import datetime -from typing import Any, List, Optional, Tuple - -from pydantic import BaseModel, Field, field_validator - - -class DeepWikiSymbol(BaseModel): - """A symbol record in the DeepWiki index. - - Maps a code symbol to its generated documentation file and anchor. - """ - - id: Optional[int] = Field(default=None, description="Database row ID") - name: str = Field(..., min_length=1, description="Symbol name (function, class, etc.)") - type: str = Field(..., min_length=1, description="Symbol type (function, class, method, variable)") - source_file: str = Field(..., min_length=1, description="Path to source file containing the symbol") - doc_file: str = Field(..., min_length=1, description="Path to generated documentation file") - anchor: str = Field(..., min_length=1, description="HTML anchor ID for linking to specific section") - line_range: Tuple[int, int] = Field( - ..., - description="(start_line, end_line) in source file, 1-based inclusive" - ) - created_at: Optional[datetime] = Field(default=None, description="Record creation timestamp") - updated_at: Optional[datetime] = Field(default=None, description="Record update timestamp") - staleness_score: float = Field(default=0.0, ge=0.0, le=1.0, description="Staleness score (0.0=fresh, 1.0=stale)") - last_checked_commit: Optional[str] = Field(default=None, description="Git commit hash at last freshness check") - last_checked_at: Optional[float] = Field(default=None, description="Timestamp of last freshness check") - staleness_factors: Optional[dict[str, Any]] = Field(default=None, description="JSON factors contributing to staleness score") - - @field_validator("line_range") - @classmethod - def validate_line_range(cls, value: Tuple[int, int]) -> Tuple[int, int]: - """Validate line range is proper tuple with start 
<= end.""" - if len(value) != 2: - raise ValueError("line_range must be a (start_line, end_line) tuple") - start_line, end_line = value - if start_line < 1 or end_line < 1: - raise ValueError("line_range lines must be >= 1") - if end_line < start_line: - raise ValueError("end_line must be >= start_line") - return value - - @field_validator("name", "type", "source_file", "doc_file", "anchor") - @classmethod - def strip_and_validate_nonempty(cls, value: str) -> str: - """Strip whitespace and validate non-empty.""" - cleaned = value.strip() - if not cleaned: - raise ValueError("value cannot be blank") - return cleaned - - -class DeepWikiDoc(BaseModel): - """A documentation file record in the DeepWiki index. - - Tracks generated documentation files and their associated symbols. - """ - - id: Optional[int] = Field(default=None, description="Database row ID") - path: str = Field(..., min_length=1, description="Path to documentation file") - content_hash: str = Field(..., min_length=1, description="SHA256 hash of file content for change detection") - symbols: List[str] = Field( - default_factory=list, - description="List of symbol names documented in this file" - ) - generated_at: datetime = Field( - default_factory=datetime.utcnow, - description="Timestamp when documentation was generated" - ) - llm_tool: Optional[str] = Field( - default=None, - description="LLM tool used to generate documentation (gemini/qwen)" - ) - - @field_validator("path", "content_hash") - @classmethod - def strip_and_validate_nonempty(cls, value: str) -> str: - """Strip whitespace and validate non-empty.""" - cleaned = value.strip() - if not cleaned: - raise ValueError("value cannot be blank") - return cleaned - - -class DeepWikiFile(BaseModel): - """A source file record in the DeepWiki index. - - Tracks indexed source files and their content hashes for incremental updates. 
- """ - - id: Optional[int] = Field(default=None, description="Database row ID") - path: str = Field(..., min_length=1, description="Path to source file") - content_hash: str = Field(..., min_length=1, description="SHA256 hash of file content") - last_indexed: datetime = Field( - default_factory=datetime.utcnow, - description="Timestamp when file was last indexed" - ) - symbols_count: int = Field(default=0, ge=0, description="Number of symbols indexed from this file") - docs_generated: bool = Field(default=False, description="Whether documentation has been generated") - staleness_score: float = Field(default=0.0, ge=0.0, le=1.0, description="Staleness score (0.0=fresh, 1.0=stale)") - last_checked_commit: Optional[str] = Field(default=None, description="Git commit hash at last freshness check") - last_checked_at: Optional[float] = Field(default=None, description="Timestamp of last freshness check") - staleness_factors: Optional[dict[str, Any]] = Field(default=None, description="JSON factors contributing to staleness score") - - @field_validator("path", "content_hash") - @classmethod - def strip_and_validate_nonempty(cls, value: str) -> str: - """Strip whitespace and validate non-empty.""" - cleaned = value.strip() - if not cleaned: - raise ValueError("value cannot be blank") - return cleaned diff --git a/codex-lens/src/codexlens/storage/deepwiki_store.py b/codex-lens/src/codexlens/storage/deepwiki_store.py deleted file mode 100644 index dc7bd32b..00000000 --- a/codex-lens/src/codexlens/storage/deepwiki_store.py +++ /dev/null @@ -1,1404 +0,0 @@ -"""DeepWiki SQLite storage for documentation index. - -Stores mappings between source files, code symbols, and generated documentation -for the DeepWiki documentation generation system. 
- -Schema: -- deepwiki_files: Tracked source files with content hashes -- deepwiki_docs: Generated documentation files -- deepwiki_symbols: Symbol-to-documentation mappings -""" - -from __future__ import annotations - -import hashlib -import json -import logging -import math # noqa: F401 - used in calculate_staleness_score -import sqlite3 -import threading -import time -from datetime import datetime -from pathlib import Path -from typing import Any, Dict, List, Optional - -from codexlens.errors import StorageError -from codexlens.storage.deepwiki_models import DeepWikiDoc, DeepWikiFile, DeepWikiSymbol - -logger = logging.getLogger(__name__) - - -class DeepWikiStore: - """SQLite storage for DeepWiki documentation index. - - Provides: - - File tracking with content hashes for incremental updates - - Symbol-to-documentation mappings for navigation - - Documentation file metadata tracking - - Thread-safe with connection pooling and WAL mode. - """ - - DEFAULT_DB_PATH = Path.home() / ".codexlens" / "deepwiki_index.db" - SCHEMA_VERSION = 2 - - def __init__(self, db_path: Path | None = None) -> None: - """Initialize DeepWiki store. - - Args: - db_path: Path to SQLite database file. Uses default if None. - """ - self.db_path = (db_path or self.DEFAULT_DB_PATH).resolve() - self._lock = threading.RLock() - self._local = threading.local() - self._pool_lock = threading.Lock() - self._pool: Dict[int, sqlite3.Connection] = {} - self._pool_generation = 0 - - def _get_connection(self) -> sqlite3.Connection: - """Get or create a thread-local database connection. - - Each thread gets its own connection with WAL mode enabled. 
- """ - thread_id = threading.get_ident() - if getattr(self._local, "generation", None) == self._pool_generation: - conn = getattr(self._local, "conn", None) - if conn is not None: - return conn - - with self._pool_lock: - conn = self._pool.get(thread_id) - if conn is None: - conn = sqlite3.connect(self.db_path, check_same_thread=False) - conn.row_factory = sqlite3.Row - conn.execute("PRAGMA journal_mode=WAL") - conn.execute("PRAGMA synchronous=NORMAL") - conn.execute("PRAGMA foreign_keys=ON") - self._pool[thread_id] = conn - - self._local.conn = conn - self._local.generation = self._pool_generation - return conn - - def close(self) -> None: - """Close all pooled connections.""" - with self._lock: - with self._pool_lock: - for conn in self._pool.values(): - conn.close() - self._pool.clear() - self._pool_generation += 1 - - if hasattr(self._local, "conn"): - self._local.conn = None - if hasattr(self._local, "generation"): - self._local.generation = self._pool_generation - - def __enter__(self) -> DeepWikiStore: - self.initialize() - return self - - def __exit__(self, exc_type: object, exc: object, tb: object) -> None: - self.close() - - def initialize(self) -> None: - """Create database and schema if not exists.""" - with self._lock: - self.db_path.parent.mkdir(parents=True, exist_ok=True) - conn = self._get_connection() - self._create_schema(conn) - - def _create_schema(self, conn: sqlite3.Connection) -> None: - """Create DeepWiki database schema.""" - try: - # Schema version tracking - conn.execute( - """ - CREATE TABLE IF NOT EXISTS deepwiki_schema ( - version INTEGER PRIMARY KEY, - applied_at REAL - ) - """ - ) - - # Files table: track indexed source files - conn.execute( - """ - CREATE TABLE IF NOT EXISTS deepwiki_files ( - id INTEGER PRIMARY KEY, - path TEXT UNIQUE NOT NULL, - content_hash TEXT NOT NULL, - last_indexed REAL NOT NULL, - symbols_count INTEGER DEFAULT 0, - docs_generated INTEGER DEFAULT 0, - staleness_score REAL DEFAULT 0.0, - last_checked_commit 
TEXT, - last_checked_at REAL, - staleness_factors TEXT - ) - """ - ) - conn.execute( - "CREATE INDEX IF NOT EXISTS idx_deepwiki_files_path ON deepwiki_files(path)" - ) - conn.execute( - "CREATE INDEX IF NOT EXISTS idx_deepwiki_files_hash ON deepwiki_files(content_hash)" - ) - - # Docs table: track generated documentation files - conn.execute( - """ - CREATE TABLE IF NOT EXISTS deepwiki_docs ( - id INTEGER PRIMARY KEY, - path TEXT UNIQUE NOT NULL, - content_hash TEXT NOT NULL, - symbols TEXT DEFAULT '[]', - generated_at REAL NOT NULL, - llm_tool TEXT - ) - """ - ) - conn.execute( - "CREATE INDEX IF NOT EXISTS idx_deepwiki_docs_path ON deepwiki_docs(path)" - ) - - # Symbols table: map source symbols to documentation - conn.execute( - """ - CREATE TABLE IF NOT EXISTS deepwiki_symbols ( - id INTEGER PRIMARY KEY, - name TEXT NOT NULL, - type TEXT NOT NULL, - source_file TEXT NOT NULL, - doc_file TEXT NOT NULL, - anchor TEXT NOT NULL, - start_line INTEGER NOT NULL, - end_line INTEGER NOT NULL, - created_at REAL, - updated_at REAL, - staleness_score REAL DEFAULT 0.0, - last_checked_commit TEXT, - last_checked_at REAL, - staleness_factors TEXT, - UNIQUE(name, source_file) - ) - """ - ) - conn.execute( - "CREATE INDEX IF NOT EXISTS idx_deepwiki_symbols_name ON deepwiki_symbols(name)" - ) - conn.execute( - "CREATE INDEX IF NOT EXISTS idx_deepwiki_symbols_source ON deepwiki_symbols(source_file)" - ) - conn.execute( - "CREATE INDEX IF NOT EXISTS idx_deepwiki_symbols_doc ON deepwiki_symbols(doc_file)" - ) - - # Generation progress table for LLM document generation tracking - conn.execute( - """ - CREATE TABLE IF NOT EXISTS generation_progress ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - symbol_key TEXT NOT NULL UNIQUE, - file_path TEXT NOT NULL, - symbol_name TEXT NOT NULL, - symbol_type TEXT NOT NULL, - layer INTEGER NOT NULL, - source_hash TEXT NOT NULL, - status TEXT NOT NULL DEFAULT 'pending', - attempts INTEGER DEFAULT 0, - last_tool TEXT, - last_error TEXT, - generated_at 
REAL, - created_at REAL, - updated_at REAL - ) - """ - ) - conn.execute( - "CREATE INDEX IF NOT EXISTS idx_progress_status ON generation_progress(status)" - ) - conn.execute( - "CREATE INDEX IF NOT EXISTS idx_progress_file ON generation_progress(file_path)" - ) - conn.execute( - "CREATE INDEX IF NOT EXISTS idx_progress_hash ON generation_progress(source_hash)" - ) - - # Record schema version - conn.execute( - """ - INSERT OR IGNORE INTO deepwiki_schema(version, applied_at) - VALUES(?, ?) - """, - (self.SCHEMA_VERSION, time.time()), - ) - - # Schema v2 migration: add staleness columns - staleness_columns = [ - ("deepwiki_files", "staleness_score", "REAL DEFAULT 0.0"), - ("deepwiki_files", "last_checked_commit", "TEXT"), - ("deepwiki_files", "last_checked_at", "REAL"), - ("deepwiki_files", "staleness_factors", "TEXT"), - ("deepwiki_symbols", "staleness_score", "REAL DEFAULT 0.0"), - ("deepwiki_symbols", "last_checked_commit", "TEXT"), - ("deepwiki_symbols", "last_checked_at", "REAL"), - ("deepwiki_symbols", "staleness_factors", "TEXT"), - ] - for table, col, col_type in staleness_columns: - try: - conn.execute(f"ALTER TABLE {table} ADD COLUMN {col} {col_type}") - except sqlite3.OperationalError: - pass # Column already exists - - # Legacy migration: some earlier DeepWiki DBs stored timestamps as TEXT (ISO strings). - # better-sqlite3 + JS code expects numeric (REAL) seconds, so ensure timestamp columns - # have REAL affinity by rebuilding affected tables when needed. - self._migrate_text_timestamps_to_real(conn) - - conn.commit() - except sqlite3.DatabaseError as exc: - raise StorageError( - f"Failed to initialize DeepWiki schema: {exc}", - db_path=str(self.db_path), - operation="initialize", - ) from exc - - def _normalize_path(self, path: str | Path) -> str: - """Normalize path for storage (forward slashes). - - Args: - path: Path to normalize. - - Returns: - Normalized path string with forward slashes. 
- """ - return str(Path(path).resolve()).replace("\\", "/") - - def _migrate_text_timestamps_to_real(self, conn: sqlite3.Connection) -> None: - """Migrate legacy TEXT timestamp columns to REAL affinity. - - SQLite's type system is dynamic, but column affinity influences how values are stored and - returned. Older DeepWiki databases used TEXT timestamps (often ISO strings). The current - schema uses REAL epoch seconds. When we detect TEXT affinity on timestamp columns, we - rebuild the table with REAL columns and convert existing values during copy. - """ - - self._rebuild_table_with_timestamp_conversion( - conn, - table="deepwiki_files", - create_sql=""" - CREATE TABLE deepwiki_files ( - id INTEGER PRIMARY KEY, - path TEXT UNIQUE NOT NULL, - content_hash TEXT NOT NULL, - last_indexed REAL NOT NULL, - symbols_count INTEGER DEFAULT 0, - docs_generated INTEGER DEFAULT 0, - staleness_score REAL DEFAULT 0.0, - last_checked_commit TEXT, - last_checked_at REAL, - staleness_factors TEXT - ) - """, - timestamp_columns={"last_indexed", "last_checked_at"}, - required_timestamp_columns={"last_indexed"}, - ) - conn.execute( - "CREATE INDEX IF NOT EXISTS idx_deepwiki_files_path ON deepwiki_files(path)" - ) - conn.execute( - "CREATE INDEX IF NOT EXISTS idx_deepwiki_files_hash ON deepwiki_files(content_hash)" - ) - - self._rebuild_table_with_timestamp_conversion( - conn, - table="deepwiki_docs", - create_sql=""" - CREATE TABLE deepwiki_docs ( - id INTEGER PRIMARY KEY, - path TEXT UNIQUE NOT NULL, - content_hash TEXT NOT NULL, - symbols TEXT DEFAULT '[]', - generated_at REAL NOT NULL, - llm_tool TEXT - ) - """, - timestamp_columns={"generated_at"}, - required_timestamp_columns={"generated_at"}, - ) - conn.execute("CREATE INDEX IF NOT EXISTS idx_deepwiki_docs_path ON deepwiki_docs(path)") - - self._rebuild_table_with_timestamp_conversion( - conn, - table="deepwiki_symbols", - create_sql=""" - CREATE TABLE deepwiki_symbols ( - id INTEGER PRIMARY KEY, - name TEXT NOT NULL, - type TEXT 
NOT NULL, - source_file TEXT NOT NULL, - doc_file TEXT NOT NULL, - anchor TEXT NOT NULL, - start_line INTEGER NOT NULL, - end_line INTEGER NOT NULL, - created_at REAL, - updated_at REAL, - staleness_score REAL DEFAULT 0.0, - last_checked_commit TEXT, - last_checked_at REAL, - staleness_factors TEXT, - UNIQUE(name, source_file) - ) - """, - timestamp_columns={"created_at", "updated_at", "last_checked_at"}, - required_timestamp_columns=set(), - ) - conn.execute( - "CREATE INDEX IF NOT EXISTS idx_deepwiki_symbols_name ON deepwiki_symbols(name)" - ) - conn.execute( - "CREATE INDEX IF NOT EXISTS idx_deepwiki_symbols_source ON deepwiki_symbols(source_file)" - ) - conn.execute( - "CREATE INDEX IF NOT EXISTS idx_deepwiki_symbols_doc ON deepwiki_symbols(doc_file)" - ) - - self._rebuild_table_with_timestamp_conversion( - conn, - table="generation_progress", - create_sql=""" - CREATE TABLE generation_progress ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - symbol_key TEXT NOT NULL UNIQUE, - file_path TEXT NOT NULL, - symbol_name TEXT NOT NULL, - symbol_type TEXT NOT NULL, - layer INTEGER NOT NULL, - source_hash TEXT NOT NULL, - status TEXT NOT NULL DEFAULT 'pending', - attempts INTEGER DEFAULT 0, - last_tool TEXT, - last_error TEXT, - generated_at REAL, - created_at REAL, - updated_at REAL - ) - """, - timestamp_columns={"generated_at", "created_at", "updated_at"}, - required_timestamp_columns=set(), - ) - conn.execute( - "CREATE INDEX IF NOT EXISTS idx_progress_status ON generation_progress(status)" - ) - conn.execute( - "CREATE INDEX IF NOT EXISTS idx_progress_file ON generation_progress(file_path)" - ) - conn.execute( - "CREATE INDEX IF NOT EXISTS idx_progress_hash ON generation_progress(source_hash)" - ) - - def _rebuild_table_with_timestamp_conversion( - self, - conn: sqlite3.Connection, - *, - table: str, - create_sql: str, - timestamp_columns: set[str], - required_timestamp_columns: set[str], - ) -> None: - info = conn.execute(f"PRAGMA table_info({table})").fetchall() - if 
not info: - return - - declared_types = { - row["name"]: str(row["type"] or "").strip().upper() for row in info - } - needs_migration = any( - declared_types.get(col) == "TEXT" for col in timestamp_columns if col in declared_types - ) - if not needs_migration: - return - - old_table = f"{table}__old_ts" - conn.execute(f"ALTER TABLE {table} RENAME TO {old_table}") - conn.execute(create_sql) - - old_cols = [ - r["name"] - for r in conn.execute(f"PRAGMA table_info({old_table})").fetchall() - ] - new_cols = [r["name"] for r in conn.execute(f"PRAGMA table_info({table})").fetchall()] - common_cols = [c for c in new_cols if c in old_cols] - - select_exprs: list[str] = [] - for col in common_cols: - if col in timestamp_columns: - expr = self._sql_timestamp_to_real(col) - if col in required_timestamp_columns: - expr = f"COALESCE({expr}, CAST(strftime('%s','now') AS REAL))" - select_exprs.append(f"{expr} AS {col}") - else: - select_exprs.append(col) - - cols_sql = ", ".join(common_cols) - select_sql = ", ".join(select_exprs) - conn.execute( - f"INSERT INTO {table} ({cols_sql}) SELECT {select_sql} FROM {old_table}" - ) - conn.execute(f"DROP TABLE {old_table}") - - def _sql_timestamp_to_real(self, col: str) -> str: - # Convert various timestamp representations to epoch seconds (REAL). - # - numeric types: keep as REAL - # - numeric strings: CAST to REAL - # - ISO datetime strings: strftime('%s', ...) to epoch seconds - return f"""( - CASE - WHEN {col} IS NULL THEN NULL - WHEN typeof({col}) IN ('integer', 'real') THEN CAST({col} AS REAL) - WHEN trim({col}) GLOB '[0-9]*' THEN CAST({col} AS REAL) - ELSE CAST(strftime('%s', replace(substr({col}, 1, 19), 'T', ' ')) AS REAL) - END - )""" - - # === File Operations === - - def add_file( - self, - file_path: str | Path, - content_hash: str, - symbols_count: int = 0, - docs_generated: bool = False, - ) -> DeepWikiFile: - """Add or update a tracked source file. - - Args: - file_path: Path to the source file. 
- content_hash: SHA256 hash of file content. - symbols_count: Number of symbols indexed from this file. - docs_generated: Whether documentation has been generated. - - Returns: - DeepWikiFile record. - """ - with self._lock: - conn = self._get_connection() - path_str = self._normalize_path(file_path) - now = time.time() - - conn.execute( - """ - INSERT INTO deepwiki_files(path, content_hash, last_indexed, symbols_count, docs_generated) - VALUES(?, ?, ?, ?, ?) - ON CONFLICT(path) DO UPDATE SET - content_hash=excluded.content_hash, - last_indexed=excluded.last_indexed, - symbols_count=excluded.symbols_count, - docs_generated=excluded.docs_generated - """, - (path_str, content_hash, now, symbols_count, 1 if docs_generated else 0), - ) - conn.commit() - - row = conn.execute( - "SELECT * FROM deepwiki_files WHERE path=?", (path_str,) - ).fetchone() - - if not row: - raise StorageError( - f"Failed to add file: {file_path}", - db_path=str(self.db_path), - operation="add_file", - ) - - return self._row_to_deepwiki_file(row) - - def get_file(self, file_path: str | Path) -> Optional[DeepWikiFile]: - """Get a tracked file by path. - - Args: - file_path: Path to the source file. - - Returns: - DeepWikiFile if found, None otherwise. - """ - with self._lock: - conn = self._get_connection() - path_str = self._normalize_path(file_path) - row = conn.execute( - "SELECT * FROM deepwiki_files WHERE path=?", (path_str,) - ).fetchone() - return self._row_to_deepwiki_file(row) if row else None - - def get_file_hash(self, file_path: str | Path) -> Optional[str]: - """Get content hash for a file. - - Used for incremental update detection. - - Args: - file_path: Path to the source file. - - Returns: - SHA256 content hash if file is tracked, None otherwise. 
- """ - with self._lock: - conn = self._get_connection() - path_str = self._normalize_path(file_path) - row = conn.execute( - "SELECT content_hash FROM deepwiki_files WHERE path=?", (path_str,) - ).fetchone() - return row["content_hash"] if row else None - - def update_file_hash(self, file_path: str | Path, content_hash: str) -> None: - """Update content hash for a tracked file. - - Args: - file_path: Path to the source file. - content_hash: New SHA256 hash of file content. - """ - with self._lock: - conn = self._get_connection() - path_str = self._normalize_path(file_path) - now = time.time() - - conn.execute( - """ - UPDATE deepwiki_files - SET content_hash=?, last_indexed=? - WHERE path=? - """, - (content_hash, now, path_str), - ) - conn.commit() - - def update_file_staleness( - self, - file_path: str | Path, - staleness_score: float, - commit: str | None = None, - factors: Dict[str, Any] | None = None, - ) -> None: - """Update staleness data for a tracked file. - - Args: - file_path: Path to the source file. - staleness_score: Staleness score (0.0-1.0). - commit: Git commit hash at check time. - factors: Dict of factors contributing to the score. - """ - with self._lock: - conn = self._get_connection() - path_str = self._normalize_path(file_path) - now = time.time() - factors_json = json.dumps(factors) if factors else None - - conn.execute( - """ - UPDATE deepwiki_files - SET staleness_score=?, last_checked_commit=?, last_checked_at=?, staleness_factors=? - WHERE path=? - """, - (staleness_score, commit, now, factors_json, path_str), - ) - conn.commit() - - def update_symbol_staleness( - self, - name: str, - source_file: str | Path, - staleness_score: float, - commit: str | None = None, - factors: Dict[str, Any] | None = None, - ) -> None: - """Update staleness data for a symbol. - - Args: - name: Symbol name. - source_file: Path to the source file. - staleness_score: Staleness score (0.0-1.0). - commit: Git commit hash at check time. 
- factors: Dict of factors contributing to the score. - """ - with self._lock: - conn = self._get_connection() - path_str = self._normalize_path(source_file) - now = time.time() - factors_json = json.dumps(factors) if factors else None - - conn.execute( - """ - UPDATE deepwiki_symbols - SET staleness_score=?, last_checked_commit=?, last_checked_at=?, staleness_factors=? - WHERE name=? AND source_file=? - """, - (staleness_score, commit, now, factors_json, name, path_str), - ) - conn.commit() - - def remove_file(self, file_path: str | Path) -> bool: - """Remove a tracked file and its associated symbols. - - Args: - file_path: Path to the source file. - - Returns: - True if file was removed, False if not found. - """ - with self._lock: - conn = self._get_connection() - path_str = self._normalize_path(file_path) - - row = conn.execute( - "SELECT id FROM deepwiki_files WHERE path=?", (path_str,) - ).fetchone() - - if not row: - return False - - # Delete associated symbols first - conn.execute("DELETE FROM deepwiki_symbols WHERE source_file=?", (path_str,)) - conn.execute("DELETE FROM deepwiki_files WHERE path=?", (path_str,)) - conn.commit() - return True - - def list_files( - self, needs_docs: bool = False, limit: int = 1000 - ) -> List[DeepWikiFile]: - """List tracked files. - - Args: - needs_docs: If True, only return files that need documentation generated. - limit: Maximum number of files to return. - - Returns: - List of DeepWikiFile records. - """ - with self._lock: - conn = self._get_connection() - - if needs_docs: - rows = conn.execute( - """ - SELECT * FROM deepwiki_files - WHERE docs_generated = 0 - ORDER BY last_indexed DESC - LIMIT ? - """, - (limit,), - ).fetchall() - else: - rows = conn.execute( - """ - SELECT * FROM deepwiki_files - ORDER BY last_indexed DESC - LIMIT ? - """, - (limit,), - ).fetchall() - - return [self._row_to_deepwiki_file(row) for row in rows] - - def get_stats(self) -> Dict[str, int]: - """Get statistics about the DeepWiki index. 
- - Returns: - Dictionary with counts of files, symbols, and docs. - """ - with self._lock: - conn = self._get_connection() - - files_count = conn.execute( - "SELECT COUNT(*) as count FROM deepwiki_files" - ).fetchone()["count"] - - symbols_count = conn.execute( - "SELECT COUNT(*) as count FROM deepwiki_symbols" - ).fetchone()["count"] - - docs_count = conn.execute( - "SELECT COUNT(*) as count FROM deepwiki_docs" - ).fetchone()["count"] - - return { - "files_count": files_count, - "symbols_count": symbols_count, - "docs_count": docs_count, - } - - # === Symbol Operations === - - def add_symbol(self, symbol: DeepWikiSymbol) -> DeepWikiSymbol: - """Add or update a symbol in the index. - - Args: - symbol: DeepWikiSymbol to add. - - Returns: - DeepWikiSymbol with ID populated. - """ - with self._lock: - conn = self._get_connection() - source_file = self._normalize_path(symbol.source_file) - doc_file = self._normalize_path(symbol.doc_file) - now = time.time() - - conn.execute( - """ - INSERT INTO deepwiki_symbols( - name, type, source_file, doc_file, anchor, - start_line, end_line, created_at, updated_at - ) - VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?) - ON CONFLICT(name, source_file) DO UPDATE SET - type=excluded.type, - doc_file=excluded.doc_file, - anchor=excluded.anchor, - start_line=excluded.start_line, - end_line=excluded.end_line, - updated_at=excluded.updated_at - """, - ( - symbol.name, - symbol.type, - source_file, - doc_file, - symbol.anchor, - symbol.line_range[0], - symbol.line_range[1], - now, - now, - ), - ) - conn.commit() - - row = conn.execute( - """ - SELECT * FROM deepwiki_symbols - WHERE name=? AND source_file=? 
- """, - (symbol.name, source_file), - ).fetchone() - - if not row: - raise StorageError( - f"Failed to add symbol: {symbol.name}", - db_path=str(self.db_path), - operation="add_symbol", - ) - - return self._row_to_deepwiki_symbol(row) - - def get_symbols_for_file(self, file_path: str | Path) -> List[DeepWikiSymbol]: - """Get all symbols for a source file. - - Args: - file_path: Path to the source file. - - Returns: - List of DeepWikiSymbol records for the file. - """ - with self._lock: - conn = self._get_connection() - path_str = self._normalize_path(file_path) - rows = conn.execute( - """ - SELECT * FROM deepwiki_symbols - WHERE source_file=? - ORDER BY start_line - """, - (path_str,), - ).fetchall() - return [self._row_to_deepwiki_symbol(row) for row in rows] - - def get_symbol(self, name: str, source_file: str | Path) -> Optional[DeepWikiSymbol]: - """Get a specific symbol by name and source file. - - Args: - name: Symbol name. - source_file: Path to the source file. - - Returns: - DeepWikiSymbol if found, None otherwise. - """ - with self._lock: - conn = self._get_connection() - path_str = self._normalize_path(source_file) - row = conn.execute( - """ - SELECT * FROM deepwiki_symbols - WHERE name=? AND source_file=? - """, - (name, path_str), - ).fetchone() - return self._row_to_deepwiki_symbol(row) if row else None - - def search_symbols(self, query: str, limit: int = 50) -> List[DeepWikiSymbol]: - """Search symbols by name. - - Args: - query: Search query (supports LIKE pattern). - limit: Maximum number of results. - - Returns: - List of matching DeepWikiSymbol records. - """ - with self._lock: - conn = self._get_connection() - pattern = f"%{query}%" - rows = conn.execute( - """ - SELECT * FROM deepwiki_symbols - WHERE name LIKE ? - ORDER BY name - LIMIT ? 
- """, - (pattern, limit), - ).fetchall() - return [self._row_to_deepwiki_symbol(row) for row in rows] - - def delete_symbols_for_file(self, file_path: str | Path) -> int: - """Delete all symbols for a source file. - - Args: - file_path: Path to the source file. - - Returns: - Number of symbols deleted. - """ - with self._lock: - conn = self._get_connection() - path_str = self._normalize_path(file_path) - cursor = conn.execute( - "DELETE FROM deepwiki_symbols WHERE source_file=?", (path_str,) - ) - conn.commit() - return cursor.rowcount - - # === Doc Operations === - - def add_doc(self, doc: DeepWikiDoc) -> DeepWikiDoc: - """Add or update a documentation file record. - - Args: - doc: DeepWikiDoc to add. - - Returns: - DeepWikiDoc with ID populated. - """ - with self._lock: - conn = self._get_connection() - path_str = self._normalize_path(doc.path) - symbols_json = json.dumps(doc.symbols) - now = time.time() - - conn.execute( - """ - INSERT INTO deepwiki_docs(path, content_hash, symbols, generated_at, llm_tool) - VALUES(?, ?, ?, ?, ?) - ON CONFLICT(path) DO UPDATE SET - content_hash=excluded.content_hash, - symbols=excluded.symbols, - generated_at=excluded.generated_at, - llm_tool=excluded.llm_tool - """, - (path_str, doc.content_hash, symbols_json, now, doc.llm_tool), - ) - conn.commit() - - row = conn.execute( - "SELECT * FROM deepwiki_docs WHERE path=?", (path_str,) - ).fetchone() - - if not row: - raise StorageError( - f"Failed to add doc: {doc.path}", - db_path=str(self.db_path), - operation="add_doc", - ) - - return self._row_to_deepwiki_doc(row) - - def get_doc(self, doc_path: str | Path) -> Optional[DeepWikiDoc]: - """Get a documentation file by path. - - Args: - doc_path: Path to the documentation file. - - Returns: - DeepWikiDoc if found, None otherwise. 
- """ - with self._lock: - conn = self._get_connection() - path_str = self._normalize_path(doc_path) - row = conn.execute( - "SELECT * FROM deepwiki_docs WHERE path=?", (path_str,) - ).fetchone() - return self._row_to_deepwiki_doc(row) if row else None - - def list_docs(self, limit: int = 1000) -> List[DeepWikiDoc]: - """List all documentation files. - - Args: - limit: Maximum number of docs to return. - - Returns: - List of DeepWikiDoc records. - """ - with self._lock: - conn = self._get_connection() - rows = conn.execute( - """ - SELECT * FROM deepwiki_docs - ORDER BY generated_at DESC - LIMIT ? - """, - (limit,), - ).fetchall() - return [self._row_to_deepwiki_doc(row) for row in rows] - - def delete_doc(self, doc_path: str | Path) -> bool: - """Delete a documentation file record. - - Args: - doc_path: Path to the documentation file. - - Returns: - True if deleted, False if not found. - """ - with self._lock: - conn = self._get_connection() - path_str = self._normalize_path(doc_path) - - row = conn.execute( - "SELECT id FROM deepwiki_docs WHERE path=?", (path_str,) - ).fetchone() - - if not row: - return False - - conn.execute("DELETE FROM deepwiki_docs WHERE path=?", (path_str,)) - conn.commit() - return True - - # === Utility Methods === - - def compute_file_hash(self, file_path: str | Path) -> str: - """Compute SHA256 hash of a file's content. - - Args: - file_path: Path to the file. - - Returns: - SHA256 hash string. 
- """ - sha256 = hashlib.sha256() - path = Path(file_path) - if not path.exists(): - raise FileNotFoundError(f"File not found: {file_path}") - - with open(path, "rb") as f: - for chunk in iter(lambda: f.read(8192), b""): - sha256.update(chunk) - - return sha256.hexdigest() - - @staticmethod - def calculate_staleness_score( - days_since_update: float, - commits_since: int = 0, - files_changed: int = 0, - lines_changed: int = 0, - proportion_changed: float = 0.0, - is_deleted: bool = False, - weights: tuple[float, float, float] = (0.1, 0.4, 0.5), - decay_k: float = 0.05, - ) -> float: - """Calculate staleness score using three-factor formula. - - S = min(1.0, w_t * T + w_c * C + w_s * M) - - Args: - days_since_update: Days since last documentation update. - commits_since: Number of commits since last check. - files_changed: Number of files changed. - lines_changed: Total lines changed. - proportion_changed: Proportion of symbol body changed (0.0-1.0). - is_deleted: Whether the symbol was deleted. - weights: (w_t, w_c, w_s) weights for time, churn, symbol factors. - decay_k: Time decay constant (default 0.05, ~14 days to 50%). - - Returns: - Staleness score between 0.0 and 1.0. - """ - # Deleted symbols are maximally stale - if is_deleted: - return 1.0 - - w_t, w_c, w_s = weights - - # T: Time decay factor - T = 1 - math.exp(-decay_k * max(0, days_since_update)) - - # C: Code churn factor (sigmoid normalization) - churn_raw = ( - math.log1p(commits_since) - + math.log1p(files_changed) - + math.log1p(lines_changed) - ) - C = 1 / (1 + math.exp(-churn_raw + 3)) # sigmoid centered at 3 - - # M: Symbol modification factor - M = min(1.0, max(0.0, proportion_changed)) - - return min(1.0, w_t * T + w_c * C + w_s * M) - - def get_stale_files( - self, files: list[dict[str, str]] - ) -> list[dict[str, str]]: - """Check which files have stale documentation by comparing hashes. - - Args: - files: List of dicts with 'path' and 'hash' keys. 
- - Returns: - List of file dicts where stored hash differs from provided hash. - """ - with self._lock: - conn = self._get_connection() - if not files: - return [] - - # Build lookup: normalized_path -> original file dict - lookup: dict[str, dict[str, str]] = {} - normalized: list[str] = [] - for f in files: - path_str = self._normalize_path(f["path"]) - lookup[path_str] = f - normalized.append(path_str) - - placeholders = ",".join("?" * len(normalized)) - rows = conn.execute( - f"SELECT path, content_hash FROM deepwiki_files WHERE path IN ({placeholders})", - normalized, - ).fetchall() - - stored: dict[str, str] = {row["path"]: row["content_hash"] for row in rows} - - stale = [] - for path_str, f in lookup.items(): - stored_hash = stored.get(path_str) - if stored_hash is None: - stale.append({"path": f["path"], "stored_hash": None, "current_hash": f["hash"]}) - elif stored_hash != f["hash"]: - stale.append({"path": f["path"], "stored_hash": stored_hash, "current_hash": f["hash"]}) - - return stale - - def get_symbols_for_paths( - self, paths: list[str | Path] - ) -> dict[str, list[DeepWikiSymbol]]: - """Get all symbols for multiple source files. - - Args: - paths: List of source file paths. - - Returns: - Dict mapping normalized path to list of DeepWikiSymbol records. - """ - with self._lock: - conn = self._get_connection() - result: dict[str, list[DeepWikiSymbol]] = {} - - if not paths: - return result - - normalized = [self._normalize_path(p) for p in paths] - placeholders = ",".join("?" * len(normalized)) - rows = conn.execute( - f""" - SELECT * FROM deepwiki_symbols - WHERE source_file IN ({placeholders}) - ORDER BY source_file, start_line - """, - normalized, - ).fetchall() - - for row in rows: - sf = row["source_file"] - result.setdefault(sf, []).append( - self._row_to_deepwiki_symbol(row) - ) - - return result - - def stats(self) -> Dict[str, Any]: - """Get storage statistics. - - Returns: - Dict with counts and metadata. 
- """ - with self._lock: - conn = self._get_connection() - file_count = conn.execute( - "SELECT COUNT(*) AS c FROM deepwiki_files" - ).fetchone()["c"] - symbol_count = conn.execute( - "SELECT COUNT(*) AS c FROM deepwiki_symbols" - ).fetchone()["c"] - doc_count = conn.execute( - "SELECT COUNT(*) AS c FROM deepwiki_docs" - ).fetchone()["c"] - files_needing_docs = conn.execute( - "SELECT COUNT(*) AS c FROM deepwiki_files WHERE docs_generated = 0" - ).fetchone()["c"] - - return { - "files": int(file_count), - "symbols": int(symbol_count), - "docs": int(doc_count), - "files_needing_docs": int(files_needing_docs), - "db_path": str(self.db_path), - } - - # === Generation Progress Operations === - - def get_progress(self, symbol_key: str) -> Optional[Dict[str, Any]]: - """Get generation progress for a symbol. - - Args: - symbol_key: Unique symbol identifier (file_path:symbol_name:line_start). - - Returns: - Progress record dict if found, None otherwise. - """ - with self._lock: - conn = self._get_connection() - row = conn.execute( - "SELECT * FROM generation_progress WHERE symbol_key=?", - (symbol_key,), - ).fetchone() - return dict(row) if row else None - - def update_progress(self, symbol_key: str, data: Dict[str, Any]) -> None: - """Update or create generation progress for a symbol. - - Args: - symbol_key: Unique symbol identifier (file_path:symbol_name:line_start). - data: Dict with fields to update (file_path, symbol_name, symbol_type, - layer, source_hash, status, attempts, last_tool, last_error, generated_at). - """ - with self._lock: - conn = self._get_connection() - now = time.time() - - # Build update query dynamically - fields = list(data.keys()) - placeholders = ["?"] * len(fields) - values = [data[f] for f in fields] - - conn.execute( - f""" - INSERT INTO generation_progress(symbol_key, {', '.join(fields)}, created_at, updated_at) - VALUES(?, {', '.join(placeholders)}, ?, ?) 
- ON CONFLICT(symbol_key) DO UPDATE SET - {', '.join(f'{f}=excluded.{f}' for f in fields)}, - updated_at=excluded.updated_at - """, - [symbol_key] + values + [now, now], - ) - conn.commit() - - def mark_completed(self, symbol_key: str, tool: str) -> None: - """Mark a symbol's documentation as completed. - - Args: - symbol_key: Unique symbol identifier. - tool: The LLM tool that generated the documentation. - """ - with self._lock: - conn = self._get_connection() - now = time.time() - - conn.execute( - """ - UPDATE generation_progress - SET status='completed', last_tool=?, generated_at=?, updated_at=? - WHERE symbol_key=? - """, - (tool, now, now, symbol_key), - ) - conn.commit() - - def mark_failed(self, symbol_key: str, error: str, tool: str | None = None) -> None: - """Mark a symbol's documentation generation as failed. - - Args: - symbol_key: Unique symbol identifier. - error: Error message describing the failure. - tool: The LLM tool that was used (optional). - """ - with self._lock: - conn = self._get_connection() - now = time.time() - - if tool: - conn.execute( - """ - UPDATE generation_progress - SET status='failed', last_error=?, last_tool=?, - attempts=attempts+1, updated_at=? - WHERE symbol_key=? - """, - (error, tool, now, symbol_key), - ) - else: - conn.execute( - """ - UPDATE generation_progress - SET status='failed', last_error=?, attempts=attempts+1, updated_at=? - WHERE symbol_key=? - """, - (error, now, symbol_key), - ) - conn.commit() - - def get_pending_symbols(self, limit: int = 1000) -> List[Dict[str, Any]]: - """Get all symbols with pending or failed status for retry. - - Args: - limit: Maximum number of records to return. - - Returns: - List of progress records with pending or failed status. - """ - with self._lock: - conn = self._get_connection() - rows = conn.execute( - """ - SELECT * FROM generation_progress - WHERE status IN ('pending', 'failed') - ORDER BY updated_at ASC - LIMIT ? 
- """, - (limit,), - ).fetchall() - return [dict(row) for row in rows] - - def get_completed_symbol_keys(self) -> set: - """Get set of all completed symbol keys for orphan detection. - - Returns: - Set of symbol_key strings for completed symbols. - """ - with self._lock: - conn = self._get_connection() - rows = conn.execute( - "SELECT symbol_key FROM generation_progress WHERE status='completed'" - ).fetchall() - return {row["symbol_key"] for row in rows} - - def delete_progress(self, symbol_keys: List[str]) -> int: - """Delete progress records for orphaned symbols. - - Args: - symbol_keys: List of symbol keys to delete. - - Returns: - Number of records deleted. - """ - if not symbol_keys: - return 0 - - with self._lock: - conn = self._get_connection() - placeholders = ",".join("?" * len(symbol_keys)) - cursor = conn.execute( - f"DELETE FROM generation_progress WHERE symbol_key IN ({placeholders})", - symbol_keys, - ) - conn.commit() - return cursor.rowcount - - # === Row Conversion Methods === - - def _row_to_deepwiki_file(self, row: sqlite3.Row) -> DeepWikiFile: - """Convert database row to DeepWikiFile.""" - staleness_factors = None - try: - factors_str = row["staleness_factors"] - if factors_str: - staleness_factors = json.loads(factors_str) - except (KeyError, IndexError): - pass - - return DeepWikiFile( - id=int(row["id"]), - path=row["path"], - content_hash=row["content_hash"], - last_indexed=datetime.fromtimestamp(row["last_indexed"]) - if row["last_indexed"] - else datetime.utcnow(), - symbols_count=int(row["symbols_count"]) if row["symbols_count"] else 0, - docs_generated=bool(row["docs_generated"]), - staleness_score=float(row["staleness_score"]) if row["staleness_score"] else 0.0, - last_checked_commit=row["last_checked_commit"] if "last_checked_commit" in row.keys() else None, - last_checked_at=row["last_checked_at"] if "last_checked_at" in row.keys() else None, - staleness_factors=staleness_factors, - ) - - def _row_to_deepwiki_symbol(self, row: 
sqlite3.Row) -> DeepWikiSymbol: - """Convert database row to DeepWikiSymbol.""" - created_at = None - if row["created_at"]: - created_at = datetime.fromtimestamp(row["created_at"]) - - updated_at = None - if row["updated_at"]: - updated_at = datetime.fromtimestamp(row["updated_at"]) - - staleness_factors = None - try: - factors_str = row["staleness_factors"] - if factors_str: - staleness_factors = json.loads(factors_str) - except (KeyError, IndexError): - pass - - return DeepWikiSymbol( - id=int(row["id"]), - name=row["name"], - type=row["type"], - source_file=row["source_file"], - doc_file=row["doc_file"], - anchor=row["anchor"], - line_range=(int(row["start_line"]), int(row["end_line"])), - created_at=created_at, - updated_at=updated_at, - staleness_score=float(row["staleness_score"]) if row["staleness_score"] else 0.0, - last_checked_commit=row["last_checked_commit"] if "last_checked_commit" in row.keys() else None, - last_checked_at=row["last_checked_at"] if "last_checked_at" in row.keys() else None, - staleness_factors=staleness_factors, - ) - - def _row_to_deepwiki_doc(self, row: sqlite3.Row) -> DeepWikiDoc: - """Convert database row to DeepWikiDoc.""" - symbols = [] - if row["symbols"]: - try: - symbols = json.loads(row["symbols"]) - except json.JSONDecodeError: - pass - - generated_at = datetime.utcnow() - if row["generated_at"]: - generated_at = datetime.fromtimestamp(row["generated_at"]) - - return DeepWikiDoc( - id=int(row["id"]), - path=row["path"], - content_hash=row["content_hash"], - symbols=symbols, - generated_at=generated_at, - llm_tool=row["llm_tool"], - ) diff --git a/codex-lens/src/codexlens/storage/dir_index.py b/codex-lens/src/codexlens/storage/dir_index.py deleted file mode 100644 index ee9e11c5..00000000 --- a/codex-lens/src/codexlens/storage/dir_index.py +++ /dev/null @@ -1,2358 +0,0 @@ -"""Single-directory index storage with hierarchical linking. 
- -Each directory maintains its own _index.db with: -- Files in the current directory -- Links to subdirectory indexes -- Full-text search via FTS5 -- Symbol table for code navigation -""" - -from __future__ import annotations - -import logging -import hashlib -import re -import sqlite3 -import threading -import time -from dataclasses import dataclass -from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple - -from codexlens.config import Config -from codexlens.entities import CodeRelationship, SearchResult, Symbol -from codexlens.errors import StorageError -from codexlens.storage.global_index import GlobalSymbolIndex - - -@dataclass -class SubdirLink: - """Link to a subdirectory's index database.""" - - id: int - name: str - index_path: Path - files_count: int - last_updated: float - - -@dataclass -class FileEntry: - """Metadata for an indexed file in current directory.""" - - id: int - name: str - full_path: Path - language: str - mtime: float - line_count: int - - -class DirIndexStore: - """Single-directory index storage with hierarchical subdirectory linking. - - Each directory has an independent _index.db containing: - - Files table: Files in this directory only - - Subdirs table: Links to child directory indexes - - Symbols table: Code symbols from files - - FTS5 index: Full-text search on file content - - Thread-safe operations with WAL mode enabled. - """ - - # Schema version for migration tracking - # Increment this when schema changes require migration - SCHEMA_VERSION = 8 - - def __init__( - self, - db_path: str | Path, - *, - config: Config | None = None, - global_index: GlobalSymbolIndex | None = None, - ) -> None: - """Initialize directory index store. 
- - Args: - db_path: Path to _index.db file for this directory - """ - self.db_path = Path(db_path).resolve() - self._lock = threading.RLock() - self._conn: Optional[sqlite3.Connection] = None - self.logger = logging.getLogger(__name__) - self._config = config - self._global_index = global_index - - def initialize(self) -> None: - """Create database and schema if not exists.""" - with self._lock: - self.db_path.parent.mkdir(parents=True, exist_ok=True) - conn = self._get_connection() - - # Check current schema version - current_version = self._get_schema_version(conn) - - # Fail gracefully if database is from a newer version - if current_version > self.SCHEMA_VERSION: - raise StorageError( - f"Database schema version {current_version} is newer than " - f"supported version {self.SCHEMA_VERSION}. " - f"Please update the application or use a compatible database.", - db_path=str(self.db_path), - operation="initialize", - details={ - "current_version": current_version, - "supported_version": self.SCHEMA_VERSION - } - ) - - # Create or migrate schema - if current_version == 0: - # New database - create schema directly - self._create_schema(conn) - self._create_fts_triggers(conn) - self._set_schema_version(conn, self.SCHEMA_VERSION) - elif current_version < self.SCHEMA_VERSION: - # Existing database - apply migrations - self._apply_migrations(conn, current_version) - self._set_schema_version(conn, self.SCHEMA_VERSION) - - conn.commit() - - def _get_schema_version(self, conn: sqlite3.Connection) -> int: - """Get current schema version from database.""" - try: - row = conn.execute("PRAGMA user_version").fetchone() - return row[0] if row else 0 - except Exception: - return 0 - - def _set_schema_version(self, conn: sqlite3.Connection, version: int) -> None: - """Set schema version in database.""" - conn.execute(f"PRAGMA user_version = {version}") - - def _apply_migrations(self, conn: sqlite3.Connection, from_version: int) -> None: - """Apply schema migrations from current 
version to latest. - - Args: - conn: Database connection - from_version: Current schema version - """ - # Migration v0/v1 -> v2: Add 'name' column to files table - if from_version < 2: - self._migrate_v2_add_name_column(conn) - - # Migration v2 -> v4: Add dual FTS tables (exact + fuzzy) - if from_version < 4: - from codexlens.storage.migrations.migration_004_dual_fts import upgrade - upgrade(conn) - - # Migration v4 -> v5: Remove unused/redundant fields - if from_version < 5: - from codexlens.storage.migrations.migration_005_cleanup_unused_fields import upgrade - upgrade(conn) - - # Migration v5 -> v6: Ensure relationship tables/indexes exist - if from_version < 6: - from codexlens.storage.migrations.migration_006_enhance_relationships import upgrade - upgrade(conn) - - # Migration v6 -> v7: Add graph neighbor cache for search expansion - if from_version < 7: - from codexlens.storage.migrations.migration_007_add_graph_neighbors import upgrade - upgrade(conn) - - # Migration v7 -> v8: Add Merkle hashes for incremental change detection - if from_version < 8: - from codexlens.storage.migrations.migration_008_add_merkle_hashes import upgrade - upgrade(conn) - - def close(self) -> None: - """Close database connection.""" - with self._lock: - if self._conn is not None: - try: - self._conn.close() - except Exception: - pass - finally: - self._conn = None - - def __enter__(self) -> DirIndexStore: - """Context manager entry.""" - self.initialize() - return self - - def __exit__(self, exc_type: object, exc: object, tb: object) -> None: - """Context manager exit.""" - self.close() - - # === File Operations === - - def add_file( - self, - name: str, - full_path: str | Path, - content: str, - language: str, - symbols: Optional[List[Symbol]] = None, - relationships: Optional[List[CodeRelationship]] = None, - ) -> int: - """Add or update a file in the current directory index. 
- - Args: - name: Filename without path - full_path: Complete source file path - content: File content for indexing - language: Programming language identifier - symbols: List of Symbol objects from the file - relationships: Optional list of CodeRelationship edges from this file - - Returns: - Database file_id - - Raises: - StorageError: If database operations fail - """ - with self._lock: - conn = self._get_connection() - full_path_str = str(Path(full_path).resolve()) - mtime = Path(full_path_str).stat().st_mtime if Path(full_path_str).exists() else None - line_count = content.count('\n') + 1 - - try: - conn.execute( - """ - INSERT INTO files(name, full_path, language, content, mtime, line_count) - VALUES(?, ?, ?, ?, ?, ?) - ON CONFLICT(full_path) DO UPDATE SET - name=excluded.name, - language=excluded.language, - content=excluded.content, - mtime=excluded.mtime, - line_count=excluded.line_count - """, - (name, full_path_str, language, content, mtime, line_count), - ) - - row = conn.execute("SELECT id FROM files WHERE full_path=?", (full_path_str,)).fetchone() - if not row: - raise StorageError(f"Failed to retrieve file_id for {full_path_str}") - - file_id = int(row["id"]) - - # Replace symbols - conn.execute("DELETE FROM symbols WHERE file_id=?", (file_id,)) - if symbols: - # Insert symbols without token_count and symbol_type - symbol_rows = [] - for s in symbols: - symbol_rows.append( - (file_id, s.name, s.kind, s.range[0], s.range[1]) - ) - - conn.executemany( - """ - INSERT INTO symbols(file_id, name, kind, start_line, end_line) - VALUES(?, ?, ?, ?, ?) 
- """, - symbol_rows, - ) - - self._save_merkle_hash(conn, file_id=file_id, content=content) - self._save_relationships(conn, file_id=file_id, relationships=relationships) - conn.commit() - self._maybe_update_global_symbols(full_path_str, symbols or []) - return file_id - - except sqlite3.DatabaseError as exc: - conn.rollback() - raise StorageError(f"Failed to add file {name}: {exc}") from exc - - def save_relationships(self, file_id: int, relationships: List[CodeRelationship]) -> None: - """Save relationships for an already-indexed file. - - Args: - file_id: Database file id - relationships: Relationship edges to persist - """ - if not relationships: - return - with self._lock: - conn = self._get_connection() - self._save_relationships(conn, file_id=file_id, relationships=relationships) - conn.commit() - - def _save_relationships( - self, - conn: sqlite3.Connection, - file_id: int, - relationships: Optional[List[CodeRelationship]], - ) -> None: - if not relationships: - return - - rows = conn.execute( - "SELECT id, name FROM symbols WHERE file_id=? 
ORDER BY start_line, id", - (file_id,), - ).fetchall() - - name_to_id: Dict[str, int] = {} - for row in rows: - name = row["name"] - if name not in name_to_id: - name_to_id[name] = int(row["id"]) - - if not name_to_id: - return - - rel_rows: List[Tuple[int, str, str, int, Optional[str]]] = [] - seen: set[tuple[int, str, str, int, Optional[str]]] = set() - - for rel in relationships: - source_id = name_to_id.get(rel.source_symbol) - if source_id is None: - continue - - target = (rel.target_symbol or "").strip() - if not target: - continue - - rel_type = rel.relationship_type.value - source_line = int(rel.source_line) - key = (source_id, target, rel_type, source_line, rel.target_file) - if key in seen: - continue - seen.add(key) - - rel_rows.append((source_id, target, rel_type, source_line, rel.target_file)) - - if not rel_rows: - return - - conn.executemany( - """ - INSERT INTO code_relationships( - source_symbol_id, target_qualified_name, - relationship_type, source_line, target_file - ) - VALUES(?, ?, ?, ?, ?) - """, - rel_rows, - ) - - def _save_merkle_hash(self, conn: sqlite3.Connection, file_id: int, content: str) -> None: - """Upsert a SHA-256 content hash for the given file_id (best-effort).""" - try: - digest = hashlib.sha256(content.encode("utf-8", errors="ignore")).hexdigest() - now = time.time() - conn.execute( - """ - INSERT INTO merkle_hashes(file_id, sha256, updated_at) - VALUES(?, ?, ?) - ON CONFLICT(file_id) DO UPDATE SET - sha256=excluded.sha256, - updated_at=excluded.updated_at - """, - (file_id, digest, now), - ) - except sqlite3.Error: - return - - def add_files_batch( - self, files: List[Tuple[str, Path, str, str, Optional[List[Symbol]]]] - ) -> int: - """Add multiple files in a single transaction. 
- - Args: - files: List of (name, full_path, content, language, symbols) tuples - - Returns: - Number of files added - - Raises: - StorageError: If batch operation fails - """ - with self._lock: - conn = self._get_connection() - count = 0 - - try: - conn.execute("BEGIN") - - for name, full_path, content, language, symbols in files: - full_path_str = str(Path(full_path).resolve()) - mtime = Path(full_path_str).stat().st_mtime if Path(full_path_str).exists() else None - line_count = content.count('\n') + 1 - - conn.execute( - """ - INSERT INTO files(name, full_path, language, content, mtime, line_count) - VALUES(?, ?, ?, ?, ?, ?) - ON CONFLICT(full_path) DO UPDATE SET - name=excluded.name, - language=excluded.language, - content=excluded.content, - mtime=excluded.mtime, - line_count=excluded.line_count - """, - (name, full_path_str, language, content, mtime, line_count), - ) - - row = conn.execute("SELECT id FROM files WHERE full_path=?", (full_path_str,)).fetchone() - if not row: - raise StorageError(f"Failed to retrieve file_id for {full_path_str}") - - file_id = int(row["id"]) - count += 1 - - conn.execute("DELETE FROM symbols WHERE file_id=?", (file_id,)) - if symbols: - # Insert symbols - symbol_rows = [] - for s in symbols: - symbol_rows.append( - (file_id, s.name, s.kind, s.range[0], s.range[1]) - ) - - conn.executemany( - """ - INSERT INTO symbols(file_id, name, kind, start_line, end_line) - VALUES(?, ?, ?, ?, ?) - """, - symbol_rows, - ) - - self._save_merkle_hash(conn, file_id=file_id, content=content) - - conn.commit() - return count - - except sqlite3.DatabaseError as exc: - conn.rollback() - raise StorageError(f"Batch insert failed: {exc}") from exc - - def remove_file(self, full_path: str | Path) -> bool: - """Remove a file from the index. 
- - Args: - full_path: Complete source file path - - Returns: - True if file was removed, False if not found - """ - with self._lock: - conn = self._get_connection() - full_path_str = str(Path(full_path).resolve()) - - row = conn.execute("SELECT id FROM files WHERE full_path=?", (full_path_str,)).fetchone() - if not row: - return False - - file_id = int(row["id"]) - conn.execute("DELETE FROM files WHERE id=?", (file_id,)) - conn.commit() - self._maybe_delete_global_symbols(full_path_str) - return True - - def get_file(self, full_path: str | Path) -> Optional[FileEntry]: - """Get file metadata. - - Args: - full_path: Complete source file path - - Returns: - FileEntry if found, None otherwise - """ - with self._lock: - conn = self._get_connection() - full_path_str = str(Path(full_path).resolve()) - - row = conn.execute( - """ - SELECT id, name, full_path, language, mtime, line_count - FROM files WHERE full_path=? - """, - (full_path_str,), - ).fetchone() - - if not row: - return None - - return FileEntry( - id=int(row["id"]), - name=row["name"], - full_path=Path(row["full_path"]), - language=row["language"], - mtime=float(row["mtime"]) if row["mtime"] else 0.0, - line_count=int(row["line_count"]) if row["line_count"] else 0, - ) - - def get_file_mtime(self, full_path: str | Path) -> Optional[float]: - """Get stored modification time for a file. - - Args: - full_path: Complete source file path - - Returns: - Modification time as float, or None if not found - """ - with self._lock: - conn = self._get_connection() - full_path_str = str(Path(full_path).resolve()) - - row = conn.execute( - "SELECT mtime FROM files WHERE full_path=?", (full_path_str,) - ).fetchone() - - return float(row["mtime"]) if row and row["mtime"] else None - - def needs_reindex(self, full_path: str | Path) -> bool: - """Check if a file needs reindexing. - - Default behavior uses mtime comparison (with 1ms tolerance). 
- - When `Config.enable_merkle_detection` is enabled and Merkle metadata is - available, uses SHA-256 content hash comparison (with mtime as a fast - path to avoid hashing unchanged files). - - Args: - full_path: Complete source file path - - Returns: - True if file should be reindexed (new, modified, or missing from index) - """ - full_path_obj = Path(full_path).resolve() - if not full_path_obj.exists(): - return False # File doesn't exist, skip indexing - - # Get current filesystem mtime - try: - current_mtime = full_path_obj.stat().st_mtime - except OSError: - return False # Can't read file stats, skip - - MTIME_TOLERANCE = 0.001 - - # Fast path: mtime-only mode (default / backward-compatible) - if self._config is None or not getattr(self._config, "enable_merkle_detection", False): - stored_mtime = self.get_file_mtime(full_path_obj) - if stored_mtime is None: - return True - return abs(current_mtime - stored_mtime) > MTIME_TOLERANCE - - full_path_str = str(full_path_obj) - - # Hash-based change detection (best-effort, falls back to mtime when metadata missing) - with self._lock: - conn = self._get_connection() - try: - row = conn.execute( - """ - SELECT f.id AS file_id, f.mtime AS mtime, mh.sha256 AS sha256 - FROM files f - LEFT JOIN merkle_hashes mh ON mh.file_id = f.id - WHERE f.full_path=? - """, - (full_path_str,), - ).fetchone() - except sqlite3.Error: - row = None - - if row is None: - return True - - stored_mtime = float(row["mtime"]) if row["mtime"] else None - stored_hash = row["sha256"] if row["sha256"] else None - file_id = int(row["file_id"]) - - # Missing Merkle data: fall back to mtime - if stored_hash is None: - if stored_mtime is None: - return True - return abs(current_mtime - stored_mtime) > MTIME_TOLERANCE - - # If mtime is unchanged within tolerance, assume unchanged without hashing. 
- if stored_mtime is not None and abs(current_mtime - stored_mtime) <= MTIME_TOLERANCE: - return False - - try: - current_text = full_path_obj.read_text(encoding="utf-8", errors="ignore") - current_hash = hashlib.sha256(current_text.encode("utf-8", errors="ignore")).hexdigest() - except OSError: - return False - - if current_hash == stored_hash: - # Content unchanged, but mtime drifted: update stored mtime to avoid repeated hashing. - with self._lock: - conn = self._get_connection() - conn.execute("UPDATE files SET mtime=? WHERE id=?", (current_mtime, file_id)) - conn.commit() - return False - - return True - - def get_merkle_root_hash(self) -> Optional[str]: - """Return the stored Merkle root hash for this directory index (if present).""" - with self._lock: - conn = self._get_connection() - try: - row = conn.execute( - "SELECT root_hash FROM merkle_state WHERE id=1" - ).fetchone() - except sqlite3.Error: - return None - - return row["root_hash"] if row and row["root_hash"] else None - - def update_merkle_root(self) -> Optional[str]: - """Compute and persist the Merkle root hash for this directory index. 
- - The root hash includes: - - Direct file hashes from `merkle_hashes` - - Direct subdirectory root hashes (read from child `_index.db` files) - """ - if self._config is None or not getattr(self._config, "enable_merkle_detection", False): - return None - - with self._lock: - conn = self._get_connection() - try: - file_rows = conn.execute( - """ - SELECT f.name AS name, mh.sha256 AS sha256 - FROM files f - LEFT JOIN merkle_hashes mh ON mh.file_id = f.id - ORDER BY f.name - """ - ).fetchall() - - subdir_rows = conn.execute( - "SELECT name, index_path FROM subdirs ORDER BY name" - ).fetchall() - except sqlite3.Error as exc: - self.logger.debug("Failed to compute merkle root: %s", exc) - return None - - items: List[str] = [] - - for row in file_rows: - name = row["name"] - sha = (row["sha256"] or "").strip() - items.append(f"f:{name}:{sha}") - - def read_child_root(index_path: str) -> str: - try: - with sqlite3.connect(index_path) as child_conn: - child_conn.row_factory = sqlite3.Row - child_row = child_conn.execute( - "SELECT root_hash FROM merkle_state WHERE id=1" - ).fetchone() - return child_row["root_hash"] if child_row and child_row["root_hash"] else "" - except Exception: - return "" - - for row in subdir_rows: - name = row["name"] - index_path = row["index_path"] - child_hash = read_child_root(index_path) if index_path else "" - items.append(f"d:{name}:{child_hash}") - - root_hash = hashlib.sha256("\n".join(items).encode("utf-8", errors="ignore")).hexdigest() - now = time.time() - - with self._lock: - conn = self._get_connection() - try: - conn.execute( - """ - INSERT INTO merkle_state(id, root_hash, updated_at) - VALUES(1, ?, ?) 
- ON CONFLICT(id) DO UPDATE SET - root_hash=excluded.root_hash, - updated_at=excluded.updated_at - """, - (root_hash, now), - ) - conn.commit() - except sqlite3.Error as exc: - self.logger.debug("Failed to persist merkle root: %s", exc) - return None - - return root_hash - - def add_file_incremental( - self, - name: str, - full_path: str | Path, - content: str, - language: str, - symbols: Optional[List[Symbol]] = None, - relationships: Optional[List[CodeRelationship]] = None, - ) -> Optional[int]: - """Add or update a file only if it has changed (incremental indexing). - - Checks mtime before indexing to skip unchanged files. - - Args: - name: Filename without path - full_path: Complete source file path - content: File content for indexing - language: Programming language identifier - symbols: List of Symbol objects from the file - relationships: Optional list of CodeRelationship edges from this file - - Returns: - Database file_id if indexed, None if skipped (unchanged) - - Raises: - StorageError: If database operations fail - """ - # Check if reindexing is needed - if not self.needs_reindex(full_path): - return None # Skip unchanged file - - # File changed or new, perform full indexing - return self.add_file(name, full_path, content, language, symbols, relationships) - - def cleanup_deleted_files(self, source_dir: Path) -> int: - """Remove indexed files that no longer exist in the source directory. - - Scans the source directory and removes database entries for deleted files. 
- - Args: - source_dir: Source directory to scan - - Returns: - Number of deleted file entries removed - - Raises: - StorageError: If cleanup operations fail - """ - with self._lock: - conn = self._get_connection() - source_dir = source_dir.resolve() - - try: - # Get all indexed file paths - rows = conn.execute("SELECT full_path FROM files").fetchall() - indexed_paths = {row["full_path"] for row in rows} - - # Build set of existing files in source directory - existing_paths = set() - for file_path in source_dir.rglob("*"): - if file_path.is_file(): - existing_paths.add(str(file_path.resolve())) - - # Find orphaned entries (indexed but no longer exist) - deleted_paths = indexed_paths - existing_paths - - # Remove orphaned entries - deleted_count = 0 - for deleted_path in deleted_paths: - conn.execute("DELETE FROM files WHERE full_path=?", (deleted_path,)) - deleted_count += 1 - self._maybe_delete_global_symbols(deleted_path) - - if deleted_count > 0: - conn.commit() - - return deleted_count - - except Exception as exc: - conn.rollback() - raise StorageError(f"Failed to cleanup deleted files: {exc}") from exc - - def list_files(self) -> List[FileEntry]: - """List all files in current directory. - - Returns: - List of FileEntry objects - """ - with self._lock: - conn = self._get_connection() - rows = conn.execute( - """ - SELECT id, name, full_path, language, mtime, line_count - FROM files - ORDER BY name - """ - ).fetchall() - - return [ - FileEntry( - id=int(row["id"]), - name=row["name"], - full_path=Path(row["full_path"]), - language=row["language"], - mtime=float(row["mtime"]) if row["mtime"] else 0.0, - line_count=int(row["line_count"]) if row["line_count"] else 0, - ) - for row in rows - ] - - def file_count(self) -> int: - """Get number of files in current directory. 
- - Returns: - File count - """ - with self._lock: - conn = self._get_connection() - row = conn.execute("SELECT COUNT(*) AS c FROM files").fetchone() - return int(row["c"]) if row else 0 - - # === Semantic Metadata === - - def add_semantic_metadata( - self, - file_id: int, - summary: str, - keywords: List[str], - purpose: str, - llm_tool: str - ) -> None: - """Add or update semantic metadata for a file. - - Args: - file_id: File ID from files table - summary: LLM-generated summary - keywords: List of keywords - purpose: Purpose/role of the file - llm_tool: Tool used to generate metadata (gemini/qwen) - """ - with self._lock: - conn = self._get_connection() - - import time - - generated_at = time.time() - - # Write to semantic_metadata table (without keywords column) - conn.execute( - """ - INSERT INTO semantic_metadata(file_id, summary, purpose, llm_tool, generated_at) - VALUES(?, ?, ?, ?, ?) - ON CONFLICT(file_id) DO UPDATE SET - summary=excluded.summary, - purpose=excluded.purpose, - llm_tool=excluded.llm_tool, - generated_at=excluded.generated_at - """, - (file_id, summary, purpose, llm_tool, generated_at), - ) - - # Write to normalized keywords tables for optimized search - # First, remove existing keyword associations - conn.execute("DELETE FROM file_keywords WHERE file_id = ?", (file_id,)) - - # Then add new keywords - for keyword in keywords: - keyword = keyword.strip() - if not keyword: - continue - - # Insert keyword if it doesn't exist - conn.execute( - "INSERT OR IGNORE INTO keywords(keyword) VALUES(?)", - (keyword,) - ) - - # Get keyword_id - row = conn.execute( - "SELECT id FROM keywords WHERE keyword = ?", - (keyword,) - ).fetchone() - - if row: - keyword_id = row["id"] - # Link file to keyword - conn.execute( - "INSERT OR IGNORE INTO file_keywords(file_id, keyword_id) VALUES(?, ?)", - (file_id, keyword_id) - ) - - conn.commit() - - def get_semantic_metadata(self, file_id: int) -> Optional[Dict[str, Any]]: - """Get semantic metadata for a file. 
- - Args: - file_id: File ID from files table - - Returns: - Dict with summary, keywords, purpose, llm_tool, generated_at, or None if not found - """ - with self._lock: - conn = self._get_connection() - - # Get semantic metadata (without keywords column) - row = conn.execute( - """ - SELECT summary, purpose, llm_tool, generated_at - FROM semantic_metadata WHERE file_id=? - """, - (file_id,), - ).fetchone() - - if not row: - return None - - # Get keywords from normalized file_keywords table - keyword_rows = conn.execute( - """ - SELECT k.keyword - FROM file_keywords fk - JOIN keywords k ON fk.keyword_id = k.id - WHERE fk.file_id = ? - ORDER BY k.keyword - """, - (file_id,), - ).fetchall() - - keywords = [kw["keyword"] for kw in keyword_rows] - - return { - "summary": row["summary"], - "keywords": keywords, - "purpose": row["purpose"], - "llm_tool": row["llm_tool"], - "generated_at": float(row["generated_at"]) if row["generated_at"] else 0.0, - } - - def get_files_without_semantic(self) -> List[FileEntry]: - """Get all files that don't have semantic metadata. - - Returns: - List of FileEntry objects without semantic metadata - """ - with self._lock: - conn = self._get_connection() - - rows = conn.execute( - """ - SELECT f.id, f.name, f.full_path, f.language, f.mtime, f.line_count - FROM files f - LEFT JOIN semantic_metadata sm ON f.id = sm.file_id - WHERE sm.id IS NULL - ORDER BY f.name - """ - ).fetchall() - - return [ - FileEntry( - id=int(row["id"]), - name=row["name"], - full_path=Path(row["full_path"]), - language=row["language"], - mtime=float(row["mtime"]) if row["mtime"] else 0.0, - line_count=int(row["line_count"]) if row["line_count"] else 0, - ) - for row in rows - ] - - def search_semantic_keywords(self, keyword: str, use_normalized: bool = True) -> List[Tuple[FileEntry, List[str]]]: - """Search files by semantic keywords. 
- - Args: - keyword: Keyword to search for (case-insensitive) - use_normalized: Use optimized normalized tables (default: True) - - Returns: - List of (FileEntry, keywords) tuples where keyword matches - """ - with self._lock: - conn = self._get_connection() - - if use_normalized: - # Optimized query using normalized tables with indexed lookup - # Use prefix search (keyword%) for better index utilization - keyword_pattern = f"{keyword}%" - - rows = conn.execute( - """ - SELECT f.id, f.name, f.full_path, f.language, f.mtime, f.line_count, - GROUP_CONCAT(k.keyword, ',') as keywords - FROM files f - JOIN file_keywords fk ON f.id = fk.file_id - JOIN keywords k ON fk.keyword_id = k.id - WHERE k.keyword LIKE ? COLLATE NOCASE - GROUP BY f.id, f.name, f.full_path, f.language, f.mtime, f.line_count - ORDER BY f.name - """, - (keyword_pattern,), - ).fetchall() - - results = [] - for row in rows: - file_entry = FileEntry( - id=int(row["id"]), - name=row["name"], - full_path=Path(row["full_path"]), - language=row["language"], - mtime=float(row["mtime"]) if row["mtime"] else 0.0, - line_count=int(row["line_count"]) if row["line_count"] else 0, - ) - keywords = row["keywords"].split(',') if row["keywords"] else [] - results.append((file_entry, keywords)) - - return results - - else: - # Fallback using normalized tables with contains matching (slower but more flexible) - keyword_pattern = f"%{keyword}%" - - rows = conn.execute( - """ - SELECT f.id, f.name, f.full_path, f.language, f.mtime, f.line_count, - GROUP_CONCAT(k.keyword, ',') as keywords - FROM files f - JOIN file_keywords fk ON f.id = fk.file_id - JOIN keywords k ON fk.keyword_id = k.id - WHERE k.keyword LIKE ? 
COLLATE NOCASE - GROUP BY f.id, f.name, f.full_path, f.language, f.mtime, f.line_count - ORDER BY f.name - """, - (keyword_pattern,), - ).fetchall() - - results = [] - for row in rows: - file_entry = FileEntry( - id=int(row["id"]), - name=row["name"], - full_path=Path(row["full_path"]), - language=row["language"], - mtime=float(row["mtime"]) if row["mtime"] else 0.0, - line_count=int(row["line_count"]) if row["line_count"] else 0, - ) - keywords = row["keywords"].split(',') if row["keywords"] else [] - results.append((file_entry, keywords)) - - return results - - def list_semantic_metadata( - self, - offset: int = 0, - limit: int = 50, - llm_tool: Optional[str] = None, - ) -> Tuple[List[Dict[str, Any]], int]: - """List all semantic metadata with file information. - - Args: - offset: Number of records to skip (for pagination) - limit: Maximum records to return (max 100) - llm_tool: Optional filter by LLM tool used - - Returns: - Tuple of (list of metadata dicts, total count) - """ - with self._lock: - conn = self._get_connection() - - # Query semantic metadata without keywords column - base_query = """ - SELECT f.id as file_id, f.name as file_name, f.full_path, - f.language, f.line_count, - sm.summary, sm.purpose, - sm.llm_tool, sm.generated_at - FROM files f - JOIN semantic_metadata sm ON f.id = sm.file_id - """ - count_query = """ - SELECT COUNT(*) as total - FROM files f - JOIN semantic_metadata sm ON f.id = sm.file_id - """ - - params: List[Any] = [] - if llm_tool: - base_query += " WHERE sm.llm_tool = ?" - count_query += " WHERE sm.llm_tool = ?" - params.append(llm_tool) - - base_query += " ORDER BY sm.generated_at DESC LIMIT ? OFFSET ?" 
- params.extend([min(limit, 100), offset]) - - count_params = [llm_tool] if llm_tool else [] - total_row = conn.execute(count_query, count_params).fetchone() - total = int(total_row["total"]) if total_row else 0 - - rows = conn.execute(base_query, params).fetchall() - - results = [] - for row in rows: - file_id = int(row["file_id"]) - - # Get keywords from normalized file_keywords table - keyword_rows = conn.execute( - """ - SELECT k.keyword - FROM file_keywords fk - JOIN keywords k ON fk.keyword_id = k.id - WHERE fk.file_id = ? - ORDER BY k.keyword - """, - (file_id,), - ).fetchall() - - keywords = [kw["keyword"] for kw in keyword_rows] - - results.append({ - "file_id": file_id, - "file_name": row["file_name"], - "full_path": row["full_path"], - "language": row["language"], - "line_count": int(row["line_count"]) if row["line_count"] else 0, - "summary": row["summary"], - "keywords": keywords, - "purpose": row["purpose"], - "llm_tool": row["llm_tool"], - "generated_at": float(row["generated_at"]) if row["generated_at"] else 0.0, - }) - - return results, total - - # === Subdirectory Links === - - def register_subdir( - self, - name: str, - index_path: str | Path, - files_count: int = 0, - direct_files: int = 0, - ) -> None: - """Register or update a subdirectory link. - - Args: - name: Subdirectory name - index_path: Path to subdirectory's _index.db - files_count: Total files recursively - direct_files: Deprecated parameter (no longer used) - """ - with self._lock: - conn = self._get_connection() - index_path_str = str(Path(index_path).resolve()) - - import time - last_updated = time.time() - - # Note: direct_files parameter is deprecated but kept for backward compatibility - conn.execute( - """ - INSERT INTO subdirs(name, index_path, files_count, last_updated) - VALUES(?, ?, ?, ?) 
- ON CONFLICT(name) DO UPDATE SET - index_path=excluded.index_path, - files_count=excluded.files_count, - last_updated=excluded.last_updated - """, - (name, index_path_str, files_count, last_updated), - ) - conn.commit() - - def unregister_subdir(self, name: str) -> bool: - """Remove a subdirectory link. - - Args: - name: Subdirectory name - - Returns: - True if removed, False if not found - """ - with self._lock: - conn = self._get_connection() - row = conn.execute("SELECT id FROM subdirs WHERE name=?", (name,)).fetchone() - if not row: - return False - - conn.execute("DELETE FROM subdirs WHERE name=?", (name,)) - conn.commit() - return True - - def get_subdirs(self) -> List[SubdirLink]: - """Get all subdirectory links. - - Returns: - List of SubdirLink objects - """ - with self._lock: - conn = self._get_connection() - rows = conn.execute( - """ - SELECT id, name, index_path, files_count, last_updated - FROM subdirs - ORDER BY name - """ - ).fetchall() - - return [ - SubdirLink( - id=int(row["id"]), - name=row["name"], - index_path=Path(row["index_path"]), - files_count=int(row["files_count"]) if row["files_count"] else 0, - last_updated=float(row["last_updated"]) if row["last_updated"] else 0.0, - ) - for row in rows - ] - - def get_subdir(self, name: str) -> Optional[SubdirLink]: - """Get a specific subdirectory link. - - Args: - name: Subdirectory name - - Returns: - SubdirLink if found, None otherwise - """ - with self._lock: - conn = self._get_connection() - row = conn.execute( - """ - SELECT id, name, index_path, files_count, last_updated - FROM subdirs WHERE name=? 
- """, - (name,), - ).fetchone() - - if not row: - return None - - return SubdirLink( - id=int(row["id"]), - name=row["name"], - index_path=Path(row["index_path"]), - files_count=int(row["files_count"]) if row["files_count"] else 0, - last_updated=float(row["last_updated"]) if row["last_updated"] else 0.0, - ) - - def update_subdir_stats( - self, name: str, files_count: int, direct_files: Optional[int] = None - ) -> None: - """Update subdirectory statistics. - - Args: - name: Subdirectory name - files_count: Total files recursively - direct_files: Deprecated parameter (no longer used) - """ - with self._lock: - conn = self._get_connection() - import time - last_updated = time.time() - - # Note: direct_files parameter is deprecated but kept for backward compatibility - conn.execute( - """ - UPDATE subdirs - SET files_count=?, last_updated=? - WHERE name=? - """, - (files_count, last_updated, name), - ) - conn.commit() - - # === Search === - - @staticmethod - def _enhance_fts_query(query: str) -> str: - """Enhance FTS5 query to support prefix matching for simple queries. - - For simple single-word or multi-word queries without FTS5 operators, - automatically adds prefix wildcard (*) to enable partial matching. 
- - Examples: - "loadPack" -> "loadPack*" - "load package" -> "load* package*" - "load*" -> "load*" (already has wildcard, unchanged) - "NOT test" -> "NOT test" (has FTS operator, unchanged) - - Args: - query: Original FTS5 query string - - Returns: - Enhanced query string with prefix wildcards for simple queries - """ - # Don't modify if query already contains FTS5 operators or wildcards - if any(op in query.upper() for op in [' AND ', ' OR ', ' NOT ', ' NEAR ', '*', '"']): - return query - - # For simple queries, add prefix wildcard to each word - words = query.split() - enhanced_words = [f"{word}*" if not word.endswith('*') else word for word in words] - return ' '.join(enhanced_words) - - def _find_match_lines(self, content: str, query: str) -> List[int]: - """Find line numbers where query terms match. - - Args: - content: File content - query: Search query (FTS5 format) - - Returns: - List of 1-based line numbers containing matches - """ - # Extract search terms from FTS query (remove operators) - terms = re.findall(r'["\']([^"\']+)["\']|(\w+)', query) - search_terms = [t[0] or t[1] for t in terms if t[0] or t[1]] - # Filter out FTS operators - fts_operators = {'AND', 'OR', 'NOT', 'NEAR'} - search_terms = [t for t in search_terms if t.upper() not in fts_operators] - - if not search_terms: - return [1] # Default to first line - - lines = content.split('\n') - match_lines = [] - - for i, line in enumerate(lines, 1): - line_lower = line.lower() - for term in search_terms: - # Handle wildcard suffix - term_clean = term.rstrip('*').lower() - if term_clean and term_clean in line_lower: - match_lines.append(i) - break - - return match_lines if match_lines else [1] - - def _find_containing_symbol( - self, conn: sqlite3.Connection, file_id: int, line_num: int - ) -> Optional[Tuple[int, int, str, str]]: - """Find the symbol that contains the given line number. 
- - Args: - conn: Database connection - file_id: File ID in database - line_num: 1-based line number - - Returns: - Tuple of (start_line, end_line, symbol_name, symbol_kind) or None - """ - row = conn.execute( - """ - SELECT start_line, end_line, name, kind - FROM symbols - WHERE file_id = ? AND start_line <= ? AND end_line >= ? - ORDER BY (end_line - start_line) ASC - LIMIT 1 - """, - (file_id, line_num, line_num), - ).fetchone() - - if row: - return (row["start_line"], row["end_line"], row["name"], row["kind"]) - return None - - def _extract_code_block( - self, - content: str, - start_line: int, - end_line: int, - match_line: Optional[int] = None, - context_lines: int = 5, - ) -> Tuple[str, int, int]: - """Extract code block from content. - - If start_line/end_line are provided (from symbol), use them. - Otherwise, extract context around match_line. - - Args: - content: Full file content - start_line: 1-based start line (from symbol or calculated) - end_line: 1-based end line (from symbol or calculated) - match_line: 1-based line where match occurred (for context extraction) - context_lines: Number of lines before/after match when no symbol - - Returns: - Tuple of (code_block, actual_start_line, actual_end_line) - """ - lines = content.split('\n') - total_lines = len(lines) - - # Clamp to valid range - start_line = max(1, start_line) - end_line = min(total_lines, end_line) - - # Extract block (convert to 0-based index) - block_lines = lines[start_line - 1:end_line] - block_content = '\n'.join(block_lines) - - return block_content, start_line, end_line - - def _batch_fetch_symbols( - self, conn: sqlite3.Connection, file_ids: List[int] - ) -> Dict[int, List[Tuple[int, int, str, str]]]: - """Batch fetch all symbols for multiple files in a single query. 
- - Args: - conn: Database connection - file_ids: List of file IDs to fetch symbols for - - Returns: - Dictionary mapping file_id to list of (start_line, end_line, name, kind) tuples - """ - if not file_ids: - return {} - - # Build placeholder string for IN clause - placeholders = ','.join('?' for _ in file_ids) - rows = conn.execute( - f""" - SELECT file_id, start_line, end_line, name, kind - FROM symbols - WHERE file_id IN ({placeholders}) - ORDER BY file_id, (end_line - start_line) ASC - """, - file_ids, - ).fetchall() - - # Organize symbols by file_id - symbols_by_file: Dict[int, List[Tuple[int, int, str, str]]] = {fid: [] for fid in file_ids} - for row in rows: - symbols_by_file[row["file_id"]].append( - (row["start_line"], row["end_line"], row["name"], row["kind"]) - ) - return symbols_by_file - - def _find_containing_symbol_from_cache( - self, symbols: List[Tuple[int, int, str, str]], line_num: int - ) -> Optional[Tuple[int, int, str, str]]: - """Find the smallest symbol containing the given line number from cached symbols. - - Args: - symbols: List of (start_line, end_line, name, kind) tuples, sorted by size - line_num: 1-based line number - - Returns: - Tuple of (start_line, end_line, symbol_name, symbol_kind) or None - """ - for start_line, end_line, name, kind in symbols: - if start_line <= line_num <= end_line: - return (start_line, end_line, name, kind) - return None - - def _generate_centered_excerpt( - self, content: str, match_line: int, start_line: int, end_line: int, max_chars: int = 200 - ) -> str: - """Generate excerpt centered around the match line. 
- - Args: - content: Full file content - match_line: 1-based line where match occurred - start_line: 1-based start line of the code block - end_line: 1-based end line of the code block - max_chars: Maximum characters for excerpt - - Returns: - Excerpt string centered around the match - """ - lines = content.split('\n') - total_lines = len(lines) - - # Ensure match_line is within bounds - match_line = max(1, min(match_line, total_lines)) - - # Calculate context window (2 lines before, 2 lines after the match) - ctx_start = max(start_line, match_line - 2) - ctx_end = min(end_line, match_line + 2) - - # Extract and join lines - excerpt_lines = lines[ctx_start - 1:ctx_end] - excerpt = '\n'.join(excerpt_lines) - - # Truncate if too long - if len(excerpt) > max_chars: - excerpt = excerpt[:max_chars] + "..." - - return excerpt - - def _search_internal( - self, - query: str, - fts_table: str, - limit: int = 20, - return_full_content: bool = False, - context_lines: int = 10, - ) -> List[SearchResult]: - """Internal unified search implementation for all FTS modes. - - Optimizations: - - Fast path: Direct FTS query with snippet() for location-only results - - Full content path: Batch fetch symbols to eliminate N+1 queries - - Centered excerpt generation for better context - - Args: - query: FTS5 query string - fts_table: FTS table name ('files_fts_exact' or 'files_fts_fuzzy') - limit: Maximum results to return - return_full_content: If True, include full code block in content field - context_lines: Lines of context when no symbol contains the match - - Returns: - List of SearchResult objects - """ - with self._lock: - conn = self._get_connection() - - # Fast path: location-only results (no content processing) - if not return_full_content: - try: - rows = conn.execute( - f""" - SELECT rowid, full_path, bm25({fts_table}) AS rank, - snippet({fts_table}, 2, '', '', '...', 30) AS excerpt - FROM {fts_table} - WHERE {fts_table} MATCH ? - ORDER BY rank - LIMIT ? 
- """, - (query, limit), - ).fetchall() - except sqlite3.DatabaseError as exc: - raise StorageError(f"FTS search failed: {exc}") from exc - - results: List[SearchResult] = [] - for row in rows: - rank = float(row["rank"]) if row["rank"] is not None else 0.0 - score = abs(rank) if rank < 0 else 0.0 - results.append( - SearchResult( - path=row["full_path"], - score=score, - excerpt=row["excerpt"], - ) - ) - return results - - # Full content path with batch optimization - # Step 1: Get file_ids and ranks (lightweight query) - try: - id_rows = conn.execute( - f""" - SELECT rowid AS file_id, bm25({fts_table}) AS rank - FROM {fts_table} - WHERE {fts_table} MATCH ? - ORDER BY rank - LIMIT ? - """, - (query, limit), - ).fetchall() - except sqlite3.DatabaseError as exc: - raise StorageError(f"FTS search failed: {exc}") from exc - - if not id_rows: - return [] - - file_ids = [row["file_id"] for row in id_rows] - ranks_by_id = {row["file_id"]: row["rank"] for row in id_rows} - - # Step 2: Batch fetch all symbols for matched files (eliminates N+1) - symbols_by_file = self._batch_fetch_symbols(conn, file_ids) - - # Step 3: Process each file on-demand (reduces memory) - results: List[SearchResult] = [] - for file_id in file_ids: - # Fetch file content on-demand - file_row = conn.execute( - "SELECT full_path, content FROM files WHERE id = ?", - (file_id,), - ).fetchone() - - if not file_row: - continue - - file_path = file_row["full_path"] - content = file_row["content"] or "" - rank = ranks_by_id.get(file_id, 0.0) - score = abs(rank) if rank < 0 else 0.0 - - # Find matching lines - match_lines = self._find_match_lines(content, query) - first_match_line = match_lines[0] if match_lines else 1 - - # Find symbol from cached symbols (no extra SQL query) - file_symbols = symbols_by_file.get(file_id, []) - symbol_info = self._find_containing_symbol_from_cache(file_symbols, first_match_line) - - if symbol_info: - start_line, end_line, symbol_name, symbol_kind = symbol_info - else: - # 
No symbol found, use context around match - lines = content.split('\n') - total_lines = len(lines) - start_line = max(1, first_match_line - context_lines) - end_line = min(total_lines, first_match_line + context_lines) - symbol_name = None - symbol_kind = None - - # Extract code block - block_content, start_line, end_line = self._extract_code_block( - content, start_line, end_line - ) - - # Generate centered excerpt (improved quality) - excerpt = self._generate_centered_excerpt( - content, first_match_line, start_line, end_line - ) - - results.append( - SearchResult( - path=file_path, - score=score, - excerpt=excerpt, - content=block_content, - start_line=start_line, - end_line=end_line, - symbol_name=symbol_name, - symbol_kind=symbol_kind, - ) - ) - return results - - - def search_fts( - self, - query: str, - limit: int = 20, - enhance_query: bool = False, - return_full_content: bool = False, - context_lines: int = 10, - ) -> List[SearchResult]: - """Full-text search in current directory files. - - Uses files_fts_exact (unicode61 tokenizer) for exact token matching. - For fuzzy/substring search, use search_fts_fuzzy() instead. - - Best Practice (from industry analysis of Codanna/Code-Index-MCP): - - Default: Respects exact user input without modification - - Users can manually add wildcards (e.g., "loadPack*") for prefix matching - - Automatic enhancement (enhance_query=True) is NOT recommended as it can - violate user intent and bring unwanted noise in results - - Args: - query: FTS5 query string - limit: Maximum results to return - enhance_query: If True, automatically add prefix wildcards for simple queries. - Default False to respect exact user input. - return_full_content: If True, include full code block in content field. - Default False for fast location-only results. 
- context_lines: Lines of context when no symbol contains the match - - Returns: - List of SearchResult objects (location-only by default, with content if requested) - - Raises: - StorageError: If FTS search fails - """ - final_query = self._enhance_fts_query(query) if enhance_query else query - return self._search_internal( - query=final_query, - fts_table='files_fts_exact', - limit=limit, - return_full_content=return_full_content, - context_lines=context_lines, - ) - - def search_fts_exact( - self, - query: str, - limit: int = 20, - return_full_content: bool = False, - context_lines: int = 10, - ) -> List[SearchResult]: - """Full-text search using exact token matching. - - Args: - query: FTS5 query string - limit: Maximum results to return - return_full_content: If True, include full code block in content field. - Default False for fast location-only results. - context_lines: Lines of context when no symbol contains the match - - Returns: - List of SearchResult objects (location-only by default, with content if requested) - - Raises: - StorageError: If FTS search fails - """ - return self._search_internal( - query=query, - fts_table='files_fts_exact', - limit=limit, - return_full_content=return_full_content, - context_lines=context_lines, - ) - - def search_fts_fuzzy( - self, - query: str, - limit: int = 20, - return_full_content: bool = False, - context_lines: int = 10, - ) -> List[SearchResult]: - """Full-text search using fuzzy/substring matching. - - Args: - query: FTS5 query string - limit: Maximum results to return - return_full_content: If True, include full code block in content field. - Default False for fast location-only results. 
- context_lines: Lines of context when no symbol contains the match - - Returns: - List of SearchResult objects (location-only by default, with content if requested) - - Raises: - StorageError: If FTS search fails - """ - return self._search_internal( - query=query, - fts_table='files_fts_fuzzy', - limit=limit, - return_full_content=return_full_content, - context_lines=context_lines, - ) - - def search_files_only(self, query: str, limit: int = 20) -> List[str]: - """Fast FTS search returning only file paths (no snippet generation). - - Optimized for when only file paths are needed, skipping expensive - snippet() function call. - - Args: - query: FTS5 query string - limit: Maximum results to return - - Returns: - List of file paths as strings - - Raises: - StorageError: If FTS search fails - """ - with self._lock: - conn = self._get_connection() - try: - rows = conn.execute( - """ - SELECT full_path - FROM files_fts - WHERE files_fts MATCH ? - ORDER BY bm25(files_fts) - LIMIT ? - """, - (query, limit), - ).fetchall() - except sqlite3.DatabaseError as exc: - raise StorageError(f"FTS search failed: {exc}") from exc - - return [row["full_path"] for row in rows] - - def search_symbols( - self, name: str, kind: Optional[str] = None, limit: int = 50, prefix_mode: bool = True - ) -> List[Symbol]: - """Search symbols by name pattern. - - Args: - name: Symbol name pattern - kind: Optional symbol kind filter - limit: Maximum results to return - prefix_mode: If True, use prefix search (faster with index); - If False, use substring search (slower) - - Returns: - List of Symbol objects - """ - # Prefix search is much faster as it can use index - if prefix_mode: - pattern = f"{name}%" - else: - pattern = f"%{name}%" - - with self._lock: - conn = self._get_connection() - if kind: - rows = conn.execute( - """ - SELECT s.name, s.kind, s.start_line, s.end_line, f.full_path - FROM symbols s - JOIN files f ON s.file_id = f.id - WHERE s.name LIKE ? AND s.kind=? 
- ORDER BY s.name - LIMIT ? - """, - (pattern, kind, limit), - ).fetchall() - else: - rows = conn.execute( - """ - SELECT s.name, s.kind, s.start_line, s.end_line, f.full_path - FROM symbols s - JOIN files f ON s.file_id = f.id - WHERE s.name LIKE ? - ORDER BY s.name - LIMIT ? - """, - (pattern, limit), - ).fetchall() - - return [ - Symbol( - name=row["name"], - kind=row["kind"], - range=(row["start_line"], row["end_line"]), - file=row["full_path"], - ) - for row in rows - ] - - def get_file_symbols(self, file_path: str | Path) -> List[Symbol]: - """Get all symbols in a specific file, sorted by start_line. - - Args: - file_path: Full path to the file - - Returns: - List of Symbol objects sorted by start_line - """ - file_path_str = str(Path(file_path).resolve()) - - with self._lock: - conn = self._get_connection() - # First get the file_id - file_row = conn.execute( - "SELECT id FROM files WHERE full_path=?", - (file_path_str,), - ).fetchone() - - if not file_row: - return [] - - file_id = int(file_row["id"]) - - rows = conn.execute( - """ - SELECT s.name, s.kind, s.start_line, s.end_line - FROM symbols s - WHERE s.file_id=? - ORDER BY s.start_line - """, - (file_id,), - ).fetchall() - - return [ - Symbol( - name=row["name"], - kind=row["kind"], - range=(row["start_line"], row["end_line"]), - file=file_path_str, - ) - for row in rows - ] - - def get_outgoing_calls( - self, - file_path: str | Path, - symbol_name: Optional[str] = None, - ) -> List[Tuple[str, str, int, Optional[str]]]: - """Get outgoing calls from symbols in a file. - - Queries code_relationships table for calls originating from symbols - in the specified file. - - Args: - file_path: Full path to the source file - symbol_name: Optional symbol name to filter by. If None, returns - calls from all symbols in the file. 
- - Returns: - List of tuples: (target_name, relationship_type, source_line, target_file) - - target_name: Qualified name of the call target - - relationship_type: Type of relationship (e.g., "calls", "imports") - - source_line: Line number where the call occurs - - target_file: Target file path (may be None if unknown) - """ - file_path_str = str(Path(file_path).resolve()) - - with self._lock: - conn = self._get_connection() - # First get the file_id - file_row = conn.execute( - "SELECT id FROM files WHERE full_path=?", - (file_path_str,), - ).fetchone() - - if not file_row: - return [] - - file_id = int(file_row["id"]) - - if symbol_name: - rows = conn.execute( - """ - SELECT cr.target_qualified_name, cr.relationship_type, - cr.source_line, cr.target_file - FROM code_relationships cr - JOIN symbols s ON s.id = cr.source_symbol_id - WHERE s.file_id=? AND s.name=? - ORDER BY cr.source_line - """, - (file_id, symbol_name), - ).fetchall() - else: - rows = conn.execute( - """ - SELECT cr.target_qualified_name, cr.relationship_type, - cr.source_line, cr.target_file - FROM code_relationships cr - JOIN symbols s ON s.id = cr.source_symbol_id - WHERE s.file_id=? - ORDER BY cr.source_line - """, - (file_id,), - ).fetchall() - - return [ - ( - row["target_qualified_name"], - row["relationship_type"], - int(row["source_line"]), - row["target_file"], - ) - for row in rows - ] - - def get_incoming_calls( - self, - target_name: str, - limit: int = 100, - ) -> List[Tuple[str, str, int, str]]: - """Get incoming calls/references to a target symbol. - - Queries code_relationships table for references to the specified - target symbol name. - - Args: - target_name: Name of the target symbol to find references for. - Matches against target_qualified_name (exact match, - suffix match, or contains match). 
- limit: Maximum number of results to return - - Returns: - List of tuples: (source_symbol_name, relationship_type, source_line, source_file) - - source_symbol_name: Name of the calling symbol - - relationship_type: Type of relationship (e.g., "calls", "imports") - - source_line: Line number where the call occurs - - source_file: Full path to the source file - """ - with self._lock: - conn = self._get_connection() - rows = conn.execute( - """ - SELECT s.name AS source_name, cr.relationship_type, - cr.source_line, f.full_path AS source_file - FROM code_relationships cr - JOIN symbols s ON s.id = cr.source_symbol_id - JOIN files f ON f.id = s.file_id - WHERE cr.target_qualified_name = ? - OR cr.target_qualified_name LIKE ? - OR cr.target_qualified_name LIKE ? - ORDER BY f.full_path, cr.source_line - LIMIT ? - """, - ( - target_name, - f"%.{target_name}", - f"%{target_name}", - limit, - ), - ).fetchall() - - return [ - ( - row["source_name"], - row["relationship_type"], - int(row["source_line"]), - row["source_file"], - ) - for row in rows - ] - - # === Statistics === - - def stats(self) -> Dict[str, Any]: - """Get current directory statistics. 
- - Returns: - Dictionary containing: - - files: Number of files in this directory - - symbols: Number of symbols - - subdirs: Number of subdirectories - - total_files: Total files including subdirectories - - languages: Dictionary of language counts - """ - with self._lock: - conn = self._get_connection() - - file_count = conn.execute("SELECT COUNT(*) AS c FROM files").fetchone()["c"] - symbol_count = conn.execute("SELECT COUNT(*) AS c FROM symbols").fetchone()["c"] - subdir_count = conn.execute("SELECT COUNT(*) AS c FROM subdirs").fetchone()["c"] - - total_files_row = conn.execute( - "SELECT COALESCE(SUM(files_count), 0) AS total FROM subdirs" - ).fetchone() - total_files = int(file_count) + int(total_files_row["total"] if total_files_row else 0) - - lang_rows = conn.execute( - "SELECT language, COUNT(*) AS c FROM files GROUP BY language ORDER BY c DESC" - ).fetchall() - languages = {row["language"]: int(row["c"]) for row in lang_rows} - - return { - "files": int(file_count), - "symbols": int(symbol_count), - "subdirs": int(subdir_count), - "total_files": total_files, - "languages": languages, - } - - # === Internal Methods === - - def _get_connection(self) -> sqlite3.Connection: - """Get or create database connection with proper configuration. 
- - Returns: - sqlite3.Connection with WAL mode and foreign keys enabled - """ - if self._conn is None: - self._conn = sqlite3.connect(str(self.db_path), check_same_thread=False) - self._conn.row_factory = sqlite3.Row - self._conn.execute("PRAGMA journal_mode=WAL") - self._conn.execute("PRAGMA synchronous=NORMAL") - self._conn.execute("PRAGMA foreign_keys=ON") - # Memory-mapped I/O for faster reads (30GB limit) - self._conn.execute("PRAGMA mmap_size=30000000000") - return self._conn - - def _maybe_update_global_symbols(self, file_path: str, symbols: List[Symbol]) -> None: - if self._global_index is None: - return - if self._config is not None and not getattr(self._config, "global_symbol_index_enabled", True): - return - try: - self._global_index.update_file_symbols( - file_path=file_path, - symbols=symbols, - index_path=str(self.db_path), - ) - except Exception as exc: - # Global index is an optimization; local directory index remains authoritative. - self.logger.debug("Global symbol index update failed for %s: %s", file_path, exc) - - def _maybe_delete_global_symbols(self, file_path: str) -> None: - if self._global_index is None: - return - if self._config is not None and not getattr(self._config, "global_symbol_index_enabled", True): - return - try: - self._global_index.delete_file_symbols(file_path) - except Exception as exc: - self.logger.debug("Global symbol index delete failed for %s: %s", file_path, exc) - - def _create_schema(self, conn: sqlite3.Connection) -> None: - """Create database schema. 
- - Args: - conn: Database connection - - Raises: - StorageError: If schema creation fails - """ - try: - # Files table - conn.execute( - """ - CREATE TABLE IF NOT EXISTS files ( - id INTEGER PRIMARY KEY, - name TEXT NOT NULL, - full_path TEXT UNIQUE NOT NULL, - language TEXT, - content TEXT, - mtime REAL, - line_count INTEGER - ) - """ - ) - - # Subdirectories table (v5: removed direct_files) - conn.execute( - """ - CREATE TABLE IF NOT EXISTS subdirs ( - id INTEGER PRIMARY KEY, - name TEXT NOT NULL UNIQUE, - index_path TEXT NOT NULL, - files_count INTEGER DEFAULT 0, - last_updated REAL - ) - """ - ) - - # Symbols table with token metadata - conn.execute( - """ - CREATE TABLE IF NOT EXISTS symbols ( - id INTEGER PRIMARY KEY, - file_id INTEGER REFERENCES files(id) ON DELETE CASCADE, - name TEXT NOT NULL, - kind TEXT NOT NULL, - start_line INTEGER, - end_line INTEGER - ) - """ - ) - - # Dual FTS5 external content tables for exact and fuzzy matching - # files_fts_exact: unicode61 tokenizer for exact token matching - # files_fts_fuzzy: trigram tokenizer (or extended unicode61) for substring/fuzzy matching - from codexlens.storage.sqlite_utils import check_trigram_support - - has_trigram = check_trigram_support(conn) - fuzzy_tokenizer = "trigram" if has_trigram else "unicode61 tokenchars '_-.'" - - # Exact FTS table with unicode61 tokenizer - # Note: tokenchars includes '.' 
to properly tokenize qualified names like PortRole.FLOW - conn.execute( - """ - CREATE VIRTUAL TABLE IF NOT EXISTS files_fts_exact USING fts5( - name, full_path UNINDEXED, content, - content='files', - content_rowid='id', - tokenize="unicode61 tokenchars '_-.'" - ) - """ - ) - - # Fuzzy FTS table with trigram or extended unicode61 tokenizer - conn.execute( - f""" - CREATE VIRTUAL TABLE IF NOT EXISTS files_fts_fuzzy USING fts5( - name, full_path UNINDEXED, content, - content='files', - content_rowid='id', - tokenize="{fuzzy_tokenizer}" - ) - """ - ) - - # Semantic metadata table (v5: removed keywords column) - conn.execute( - """ - CREATE TABLE IF NOT EXISTS semantic_metadata ( - id INTEGER PRIMARY KEY, - file_id INTEGER UNIQUE REFERENCES files(id) ON DELETE CASCADE, - summary TEXT, - purpose TEXT, - llm_tool TEXT, - generated_at REAL - ) - """ - ) - - # Normalized keywords tables for performance - conn.execute( - """ - CREATE TABLE IF NOT EXISTS keywords ( - id INTEGER PRIMARY KEY, - keyword TEXT NOT NULL UNIQUE - ) - """ - ) - - conn.execute( - """ - CREATE TABLE IF NOT EXISTS file_keywords ( - file_id INTEGER NOT NULL, - keyword_id INTEGER NOT NULL, - PRIMARY KEY (file_id, keyword_id), - FOREIGN KEY (file_id) REFERENCES files (id) ON DELETE CASCADE, - FOREIGN KEY (keyword_id) REFERENCES keywords (id) ON DELETE CASCADE - ) - """ - ) - - # Code relationships table for graph visualization - conn.execute( - """ - CREATE TABLE IF NOT EXISTS code_relationships ( - id INTEGER PRIMARY KEY, - source_symbol_id INTEGER NOT NULL, - target_qualified_name TEXT NOT NULL, - relationship_type TEXT NOT NULL, - source_line INTEGER NOT NULL, - target_file TEXT, - FOREIGN KEY (source_symbol_id) REFERENCES symbols (id) ON DELETE CASCADE - ) - """ - ) - - # Precomputed graph neighbors cache for search expansion (v7) - conn.execute( - """ - CREATE TABLE IF NOT EXISTS graph_neighbors ( - source_symbol_id INTEGER NOT NULL REFERENCES symbols(id) ON DELETE CASCADE, - neighbor_symbol_id 
INTEGER NOT NULL REFERENCES symbols(id) ON DELETE CASCADE, - relationship_depth INTEGER NOT NULL, - PRIMARY KEY (source_symbol_id, neighbor_symbol_id) - ) - """ - ) - - # Merkle hashes for incremental change detection (v8) - conn.execute( - """ - CREATE TABLE IF NOT EXISTS merkle_hashes ( - file_id INTEGER PRIMARY KEY REFERENCES files(id) ON DELETE CASCADE, - sha256 TEXT NOT NULL, - updated_at REAL - ) - """ - ) - - conn.execute( - """ - CREATE TABLE IF NOT EXISTS merkle_state ( - id INTEGER PRIMARY KEY CHECK (id = 1), - root_hash TEXT, - updated_at REAL - ) - """ - ) - - # Indexes (v5: removed idx_symbols_type) - conn.execute("CREATE INDEX IF NOT EXISTS idx_files_name ON files(name)") - conn.execute("CREATE INDEX IF NOT EXISTS idx_files_path ON files(full_path)") - conn.execute("CREATE INDEX IF NOT EXISTS idx_subdirs_name ON subdirs(name)") - conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name)") - conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_file ON symbols(file_id)") - conn.execute("CREATE INDEX IF NOT EXISTS idx_semantic_file ON semantic_metadata(file_id)") - conn.execute("CREATE INDEX IF NOT EXISTS idx_keywords_keyword ON keywords(keyword)") - conn.execute("CREATE INDEX IF NOT EXISTS idx_file_keywords_file_id ON file_keywords(file_id)") - conn.execute("CREATE INDEX IF NOT EXISTS idx_file_keywords_keyword_id ON file_keywords(keyword_id)") - conn.execute("CREATE INDEX IF NOT EXISTS idx_rel_source ON code_relationships(source_symbol_id)") - conn.execute("CREATE INDEX IF NOT EXISTS idx_rel_target ON code_relationships(target_qualified_name)") - conn.execute("CREATE INDEX IF NOT EXISTS idx_rel_type ON code_relationships(relationship_type)") - conn.execute( - "CREATE INDEX IF NOT EXISTS idx_graph_neighbors_source_depth " - "ON graph_neighbors(source_symbol_id, relationship_depth)" - ) - conn.execute( - "CREATE INDEX IF NOT EXISTS idx_graph_neighbors_neighbor " - "ON graph_neighbors(neighbor_symbol_id)" - ) - - except 
sqlite3.DatabaseError as exc: - raise StorageError(f"Failed to create schema: {exc}") from exc - - def _migrate_v2_add_name_column(self, conn: sqlite3.Connection) -> None: - """Migration v2: Add 'name' column to files table. - - Required for FTS5 external content table. - - Args: - conn: Database connection - """ - # Check if files table exists and has columns - cursor = conn.execute("PRAGMA table_info(files)") - files_columns = {row[1] for row in cursor.fetchall()} - - if not files_columns: - return # No files table yet, will be created fresh - - # Skip if 'name' column already exists - if "name" in files_columns: - return - - # Add 'name' column with default value - conn.execute("ALTER TABLE files ADD COLUMN name TEXT NOT NULL DEFAULT ''") - - # Populate 'name' column from full_path using pathlib for robustness - rows = conn.execute("SELECT id, full_path FROM files WHERE name = ''").fetchall() - for row in rows: - file_id = row[0] - full_path = row[1] - # Use pathlib.Path.name for cross-platform compatibility - name = Path(full_path).name if full_path else "" - conn.execute("UPDATE files SET name = ? WHERE id = ?", (name, file_id)) - - def _create_fts_triggers(self, conn: sqlite3.Connection) -> None: - """Create FTS5 external content triggers for dual FTS tables. - - Creates synchronized triggers for both files_fts_exact and files_fts_fuzzy tables. 
- - Args: - conn: Database connection - """ - # Insert triggers for files_fts_exact - conn.execute( - """ - CREATE TRIGGER IF NOT EXISTS files_exact_ai AFTER INSERT ON files BEGIN - INSERT INTO files_fts_exact(rowid, name, full_path, content) - VALUES(new.id, new.name, new.full_path, new.content); - END - """ - ) - - # Delete trigger for files_fts_exact - conn.execute( - """ - CREATE TRIGGER IF NOT EXISTS files_exact_ad AFTER DELETE ON files BEGIN - INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content) - VALUES('delete', old.id, old.name, old.full_path, old.content); - END - """ - ) - - # Update trigger for files_fts_exact - conn.execute( - """ - CREATE TRIGGER IF NOT EXISTS files_exact_au AFTER UPDATE ON files BEGIN - INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content) - VALUES('delete', old.id, old.name, old.full_path, old.content); - INSERT INTO files_fts_exact(rowid, name, full_path, content) - VALUES(new.id, new.name, new.full_path, new.content); - END - """ - ) - - # Insert trigger for files_fts_fuzzy - conn.execute( - """ - CREATE TRIGGER IF NOT EXISTS files_fuzzy_ai AFTER INSERT ON files BEGIN - INSERT INTO files_fts_fuzzy(rowid, name, full_path, content) - VALUES(new.id, new.name, new.full_path, new.content); - END - """ - ) - - # Delete trigger for files_fts_fuzzy - conn.execute( - """ - CREATE TRIGGER IF NOT EXISTS files_fuzzy_ad AFTER DELETE ON files BEGIN - INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content) - VALUES('delete', old.id, old.name, old.full_path, old.content); - END - """ - ) - - # Update trigger for files_fts_fuzzy - conn.execute( - """ - CREATE TRIGGER IF NOT EXISTS files_fuzzy_au AFTER UPDATE ON files BEGIN - INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content) - VALUES('delete', old.id, old.name, old.full_path, old.content); - INSERT INTO files_fts_fuzzy(rowid, name, full_path, content) - VALUES(new.id, new.name, new.full_path, 
new.content); - END - """ - ) diff --git a/codex-lens/src/codexlens/storage/file_cache.py b/codex-lens/src/codexlens/storage/file_cache.py deleted file mode 100644 index b43613d1..00000000 --- a/codex-lens/src/codexlens/storage/file_cache.py +++ /dev/null @@ -1,32 +0,0 @@ -"""Simple filesystem cache helpers.""" - -from __future__ import annotations - -from dataclasses import dataclass -from pathlib import Path -from typing import Optional - - -@dataclass -class FileCache: - """Caches file mtimes for incremental indexing.""" - - cache_path: Path - - def load_mtime(self, path: Path) -> Optional[float]: - try: - key = self._key_for(path) - record = (self.cache_path / key).read_text(encoding="utf-8") - return float(record) - except Exception: - return None - - def store_mtime(self, path: Path, mtime: float) -> None: - self.cache_path.mkdir(parents=True, exist_ok=True) - key = self._key_for(path) - (self.cache_path / key).write_text(str(mtime), encoding="utf-8") - - def _key_for(self, path: Path) -> str: - safe = str(path).replace(":", "_").replace("\\", "_").replace("/", "_") - return f"{safe}.mtime" - diff --git a/codex-lens/src/codexlens/storage/global_index.py b/codex-lens/src/codexlens/storage/global_index.py deleted file mode 100644 index b2d9a453..00000000 --- a/codex-lens/src/codexlens/storage/global_index.py +++ /dev/null @@ -1,618 +0,0 @@ -"""Global cross-directory symbol index for fast lookups. - -Stores symbols for an entire project in a single SQLite database so symbol search -does not require traversing every directory _index.db. - -This index is updated incrementally during file indexing (delete+insert per file) -to avoid expensive batch rebuilds. 
-""" - -from __future__ import annotations - -import logging -import sqlite3 -import threading -from pathlib import Path -from typing import List, Optional, Tuple - -from codexlens.entities import CodeRelationship, Symbol -from codexlens.errors import StorageError - - -class GlobalSymbolIndex: - """Project-wide symbol index with incremental updates.""" - - SCHEMA_VERSION = 2 - DEFAULT_DB_NAME = "_global_symbols.db" - - def __init__(self, db_path: str | Path, project_id: int) -> None: - self.db_path = Path(db_path).resolve() - self.project_id = int(project_id) - self._lock = threading.RLock() - self._conn: Optional[sqlite3.Connection] = None - self.logger = logging.getLogger(__name__) - - def initialize(self) -> None: - """Create database and schema if not exists.""" - with self._lock: - self.db_path.parent.mkdir(parents=True, exist_ok=True) - conn = self._get_connection() - - current_version = self._get_schema_version(conn) - if current_version > self.SCHEMA_VERSION: - raise StorageError( - f"Database schema version {current_version} is newer than " - f"supported version {self.SCHEMA_VERSION}. 
" - f"Please update the application or use a compatible database.", - db_path=str(self.db_path), - operation="initialize", - details={ - "current_version": current_version, - "supported_version": self.SCHEMA_VERSION, - }, - ) - - if current_version == 0: - self._create_schema(conn) - self._set_schema_version(conn, self.SCHEMA_VERSION) - elif current_version < self.SCHEMA_VERSION: - self._apply_migrations(conn, current_version) - self._set_schema_version(conn, self.SCHEMA_VERSION) - - conn.commit() - - def close(self) -> None: - """Close database connection.""" - with self._lock: - if self._conn is not None: - try: - self._conn.close() - except Exception: - pass - finally: - self._conn = None - - def __enter__(self) -> "GlobalSymbolIndex": - self.initialize() - return self - - def __exit__(self, exc_type: object, exc: object, tb: object) -> None: - self.close() - - def add_symbol(self, symbol: Symbol, file_path: str | Path, index_path: str | Path) -> None: - """Insert a single symbol (idempotent) for incremental updates.""" - file_path_str = str(Path(file_path).resolve()) - index_path_str = str(Path(index_path).resolve()) - - with self._lock: - conn = self._get_connection() - try: - conn.execute( - """ - INSERT INTO global_symbols( - project_id, symbol_name, symbol_kind, - file_path, start_line, end_line, index_path - ) - VALUES(?, ?, ?, ?, ?, ?, ?) 
- ON CONFLICT( - project_id, symbol_name, symbol_kind, - file_path, start_line, end_line - ) - DO UPDATE SET - index_path=excluded.index_path - """, - ( - self.project_id, - symbol.name, - symbol.kind, - file_path_str, - symbol.range[0], - symbol.range[1], - index_path_str, - ), - ) - conn.commit() - except sqlite3.DatabaseError as exc: - conn.rollback() - raise StorageError( - f"Failed to add symbol {symbol.name}: {exc}", - db_path=str(self.db_path), - operation="add_symbol", - ) from exc - - def update_file_symbols( - self, - file_path: str | Path, - symbols: List[Symbol], - index_path: str | Path | None = None, - ) -> None: - """Replace all symbols for a file atomically (delete + insert).""" - file_path_str = str(Path(file_path).resolve()) - - index_path_str: Optional[str] - if index_path is not None: - index_path_str = str(Path(index_path).resolve()) - else: - index_path_str = self._get_existing_index_path(file_path_str) - - with self._lock: - conn = self._get_connection() - try: - conn.execute("BEGIN") - conn.execute( - "DELETE FROM global_symbols WHERE project_id=? AND file_path=?", - (self.project_id, file_path_str), - ) - - if symbols: - if not index_path_str: - raise StorageError( - "index_path is required when inserting symbols for a new file", - db_path=str(self.db_path), - operation="update_file_symbols", - details={"file_path": file_path_str}, - ) - - rows = [ - ( - self.project_id, - s.name, - s.kind, - file_path_str, - s.range[0], - s.range[1], - index_path_str, - ) - for s in symbols - ] - conn.executemany( - """ - INSERT INTO global_symbols( - project_id, symbol_name, symbol_kind, - file_path, start_line, end_line, index_path - ) - VALUES(?, ?, ?, ?, ?, ?, ?) 
- ON CONFLICT( - project_id, symbol_name, symbol_kind, - file_path, start_line, end_line - ) - DO UPDATE SET - index_path=excluded.index_path - """, - rows, - ) - - conn.commit() - except sqlite3.DatabaseError as exc: - conn.rollback() - raise StorageError( - f"Failed to update symbols for {file_path_str}: {exc}", - db_path=str(self.db_path), - operation="update_file_symbols", - ) from exc - - def delete_file_symbols(self, file_path: str | Path) -> int: - """Remove all symbols for a file. Returns number of rows deleted.""" - file_path_str = str(Path(file_path).resolve()) - with self._lock: - conn = self._get_connection() - try: - cur = conn.execute( - "DELETE FROM global_symbols WHERE project_id=? AND file_path=?", - (self.project_id, file_path_str), - ) - conn.commit() - return int(cur.rowcount or 0) - except sqlite3.DatabaseError as exc: - conn.rollback() - raise StorageError( - f"Failed to delete symbols for {file_path_str}: {exc}", - db_path=str(self.db_path), - operation="delete_file_symbols", - ) from exc - - def search( - self, - name: str, - kind: Optional[str] = None, - limit: int = 50, - prefix_mode: bool = True, - ) -> List[Symbol]: - """Search symbols and return full Symbol objects.""" - if prefix_mode: - pattern = f"{name}%" - else: - pattern = f"%{name}%" - - with self._lock: - conn = self._get_connection() - if kind: - rows = conn.execute( - """ - SELECT symbol_name, symbol_kind, file_path, start_line, end_line - FROM global_symbols - WHERE project_id=? AND symbol_name LIKE ? AND symbol_kind=? - ORDER BY symbol_name - LIMIT ? - """, - (self.project_id, pattern, kind, limit), - ).fetchall() - else: - rows = conn.execute( - """ - SELECT symbol_name, symbol_kind, file_path, start_line, end_line - FROM global_symbols - WHERE project_id=? AND symbol_name LIKE ? - ORDER BY symbol_name - LIMIT ? 
- """, - (self.project_id, pattern, limit), - ).fetchall() - - return [ - Symbol( - name=row["symbol_name"], - kind=row["symbol_kind"], - range=(row["start_line"], row["end_line"]), - file=row["file_path"], - ) - for row in rows - ] - - def search_symbols( - self, - name: str, - kind: Optional[str] = None, - limit: int = 50, - prefix_mode: bool = True, - ) -> List[Tuple[str, Tuple[int, int]]]: - """Search symbols and return only (file_path, (start_line, end_line)).""" - symbols = self.search(name=name, kind=kind, limit=limit, prefix_mode=prefix_mode) - return [(s.file or "", s.range) for s in symbols] - - def get_file_symbols(self, file_path: str | Path) -> List[Symbol]: - """Get all symbols in a specific file, sorted by start_line. - - Args: - file_path: Full path to the file - - Returns: - List of Symbol objects sorted by start_line - """ - file_path_str = str(Path(file_path).resolve()) - - with self._lock: - conn = self._get_connection() - rows = conn.execute( - """ - SELECT symbol_name, symbol_kind, file_path, start_line, end_line - FROM global_symbols - WHERE project_id=? AND file_path=? - ORDER BY start_line - """, - (self.project_id, file_path_str), - ).fetchall() - - return [ - Symbol( - name=row["symbol_name"], - kind=row["symbol_kind"], - range=(row["start_line"], row["end_line"]), - file=row["file_path"], - ) - for row in rows - ] - - # ------------------------------------------------------------------ - # Relationship CRUD - # ------------------------------------------------------------------ - - def update_file_relationships( - self, - file_path: str | Path, - relationships: List[CodeRelationship], - ) -> None: - """Replace all relationships for a file atomically (delete + insert). - - Uses the same delete-then-insert pattern as ``update_file_symbols``. - The *target_qualified_name* stored in the DB is built from - ``target_file`` (when available) and ``target_symbol`` so that - cross-directory lookups work correctly. 
- """ - file_path_str = str(Path(file_path).resolve()) - - with self._lock: - conn = self._get_connection() - try: - conn.execute("BEGIN") - conn.execute( - "DELETE FROM global_relationships WHERE project_id=? AND source_file=?", - (self.project_id, file_path_str), - ) - - if relationships: - rows = [ - ( - self.project_id, - file_path_str, - rel.source_symbol, - self._build_qualified_name(rel), - rel.relationship_type.value, - rel.source_line, - ) - for rel in relationships - ] - conn.executemany( - """ - INSERT INTO global_relationships( - project_id, source_file, source_symbol, - target_qualified_name, relationship_type, source_line - ) - VALUES(?, ?, ?, ?, ?, ?) - """, - rows, - ) - - conn.commit() - except sqlite3.DatabaseError as exc: - conn.rollback() - raise StorageError( - f"Failed to update relationships for {file_path_str}: {exc}", - db_path=str(self.db_path), - operation="update_file_relationships", - ) from exc - - def query_by_target( - self, - target_name: str, - limit: int = 50, - prefix_mode: bool = True, - ) -> List[Tuple[str, str, str, int]]: - """Query relationships by target_qualified_name. - - Returns list of ``(source_file, source_symbol, relationship_type, source_line)``. - When *prefix_mode* is True the target_name is matched as a prefix; - otherwise an exact match is required. - """ - if prefix_mode: - pattern = f"{target_name}%" - else: - pattern = target_name - - with self._lock: - conn = self._get_connection() - if prefix_mode: - rows = conn.execute( - """ - SELECT source_file, source_symbol, relationship_type, source_line - FROM global_relationships - WHERE project_id=? AND target_qualified_name LIKE ? - ORDER BY source_file, source_line - LIMIT ? - """, - (self.project_id, pattern, limit), - ).fetchall() - else: - rows = conn.execute( - """ - SELECT source_file, source_symbol, relationship_type, source_line - FROM global_relationships - WHERE project_id=? AND target_qualified_name=? - ORDER BY source_file, source_line - LIMIT ? 
- """, - (self.project_id, pattern, limit), - ).fetchall() - - return [ - ( - row["source_file"], - row["source_symbol"], - row["relationship_type"], - row["source_line"], - ) - for row in rows - ] - - def query_relationships_for_symbols( - self, - symbol_names: List[str], - limit: int = 100, - ) -> List[sqlite3.Row]: - """Query all relationships involving any of *symbol_names*. - - Matches against both ``source_symbol`` and ``target_qualified_name`` - (the target column is checked with a LIKE ``%name%`` pattern so that - qualified names like ``mod.ClassName`` still match ``ClassName``). - """ - if not symbol_names: - return [] - - with self._lock: - conn = self._get_connection() - # Build WHERE clause: (source_symbol IN (...)) OR (target LIKE ...) - source_placeholders = ",".join("?" for _ in symbol_names) - target_clauses = " OR ".join( - "target_qualified_name LIKE ?" for _ in symbol_names - ) - target_patterns = [f"%{name}" for name in symbol_names] - - sql = f""" - SELECT id, project_id, source_file, source_symbol, - target_qualified_name, relationship_type, source_line - FROM global_relationships - WHERE project_id=? - AND ( - source_symbol IN ({source_placeholders}) - OR ({target_clauses}) - ) - ORDER BY source_file, source_line - LIMIT ? - """ - params: list = [self.project_id, *symbol_names, *target_patterns, limit] - return conn.execute(sql, params).fetchall() - - def delete_file_relationships(self, file_path: str | Path) -> int: - """Remove all relationships for a file. Returns number of rows deleted.""" - file_path_str = str(Path(file_path).resolve()) - with self._lock: - conn = self._get_connection() - try: - cur = conn.execute( - "DELETE FROM global_relationships WHERE project_id=? 
AND source_file=?", - (self.project_id, file_path_str), - ) - conn.commit() - return int(cur.rowcount or 0) - except sqlite3.DatabaseError as exc: - conn.rollback() - raise StorageError( - f"Failed to delete relationships for {file_path_str}: {exc}", - db_path=str(self.db_path), - operation="delete_file_relationships", - ) from exc - - @staticmethod - def _build_qualified_name(rel: CodeRelationship) -> str: - """Build a qualified name from a CodeRelationship. - - Format: ``::`` when target_file is known, - otherwise just ````. - """ - if rel.target_file: - return f"{rel.target_file}::{rel.target_symbol}" - return rel.target_symbol - - def _get_existing_index_path(self, file_path_str: str) -> Optional[str]: - with self._lock: - conn = self._get_connection() - row = conn.execute( - """ - SELECT index_path - FROM global_symbols - WHERE project_id=? AND file_path=? - LIMIT 1 - """, - (self.project_id, file_path_str), - ).fetchone() - return str(row["index_path"]) if row else None - - def _get_schema_version(self, conn: sqlite3.Connection) -> int: - try: - row = conn.execute("PRAGMA user_version").fetchone() - return int(row[0]) if row else 0 - except Exception: - return 0 - - def _set_schema_version(self, conn: sqlite3.Connection, version: int) -> None: - conn.execute(f"PRAGMA user_version = {int(version)}") - - def _apply_migrations(self, conn: sqlite3.Connection, from_version: int) -> None: - if from_version < 2: - self._migrate_v1_to_v2(conn) - - def _migrate_v1_to_v2(self, conn: sqlite3.Connection) -> None: - """Add global_relationships table for v1 -> v2 migration.""" - try: - self._create_relationships_schema(conn) - except sqlite3.DatabaseError as exc: - raise StorageError( - f"Failed to migrate schema from v1 to v2: {exc}", - db_path=str(self.db_path), - operation="_migrate_v1_to_v2", - ) from exc - - def _get_connection(self) -> sqlite3.Connection: - if self._conn is None: - self._conn = sqlite3.connect(str(self.db_path), check_same_thread=False) - 
self._conn.row_factory = sqlite3.Row - self._conn.execute("PRAGMA journal_mode=WAL") - self._conn.execute("PRAGMA synchronous=NORMAL") - self._conn.execute("PRAGMA foreign_keys=ON") - self._conn.execute("PRAGMA mmap_size=30000000000") - return self._conn - - def _create_schema(self, conn: sqlite3.Connection) -> None: - try: - conn.execute( - """ - CREATE TABLE IF NOT EXISTS global_symbols ( - id INTEGER PRIMARY KEY, - project_id INTEGER NOT NULL, - symbol_name TEXT NOT NULL, - symbol_kind TEXT NOT NULL, - file_path TEXT NOT NULL, - start_line INTEGER, - end_line INTEGER, - index_path TEXT NOT NULL, - UNIQUE( - project_id, symbol_name, symbol_kind, - file_path, start_line, end_line - ) - ) - """ - ) - - # Required by optimization spec. - conn.execute( - """ - CREATE INDEX IF NOT EXISTS idx_global_symbols_name_kind - ON global_symbols(symbol_name, symbol_kind) - """ - ) - # Used by common queries (project-scoped name lookups). - conn.execute( - """ - CREATE INDEX IF NOT EXISTS idx_global_symbols_project_name_kind - ON global_symbols(project_id, symbol_name, symbol_kind) - """ - ) - conn.execute( - """ - CREATE INDEX IF NOT EXISTS idx_global_symbols_project_file - ON global_symbols(project_id, file_path) - """ - ) - conn.execute( - """ - CREATE INDEX IF NOT EXISTS idx_global_symbols_project_index_path - ON global_symbols(project_id, index_path) - """ - ) - - self._create_relationships_schema(conn) - except sqlite3.DatabaseError as exc: - raise StorageError( - f"Failed to initialize global symbol schema: {exc}", - db_path=str(self.db_path), - operation="_create_schema", - ) from exc - - def _create_relationships_schema(self, conn: sqlite3.Connection) -> None: - """Create the global_relationships table and indexes (idempotent).""" - conn.execute( - """ - CREATE TABLE IF NOT EXISTS global_relationships ( - id INTEGER PRIMARY KEY, - project_id INTEGER NOT NULL, - source_file TEXT NOT NULL, - source_symbol TEXT NOT NULL, - target_qualified_name TEXT NOT NULL, - 
relationship_type TEXT NOT NULL, - source_line INTEGER NOT NULL - ) - """ - ) - conn.execute( - """ - CREATE INDEX IF NOT EXISTS idx_global_rel_project_target - ON global_relationships(project_id, target_qualified_name) - """ - ) - conn.execute( - """ - CREATE INDEX IF NOT EXISTS idx_global_rel_project_source - ON global_relationships(project_id, source_file) - """ - ) - diff --git a/codex-lens/src/codexlens/storage/index_filters.py b/codex-lens/src/codexlens/storage/index_filters.py deleted file mode 100644 index 4f4a163f..00000000 --- a/codex-lens/src/codexlens/storage/index_filters.py +++ /dev/null @@ -1,47 +0,0 @@ -from __future__ import annotations - -from pathlib import Path -from typing import Iterable, List, Optional, Set - -from codexlens.storage.index_tree import DEFAULT_IGNORE_DIRS - - -EXTRA_IGNORED_INDEX_DIRS = frozenset({".workflow"}) -IGNORED_INDEX_DIRS = frozenset({name.casefold() for name in DEFAULT_IGNORE_DIRS | set(EXTRA_IGNORED_INDEX_DIRS)}) - - -def is_ignored_index_path( - index_path: Path, - scan_root: Path, - *, - ignored_dir_names: Optional[Set[str]] = None, -) -> bool: - """Return True when an index lives under an ignored/generated subtree.""" - - ignored = ( - {name.casefold() for name in ignored_dir_names} - if ignored_dir_names is not None - else IGNORED_INDEX_DIRS - ) - - try: - relative_parts = index_path.resolve().relative_to(scan_root.resolve()).parts[:-1] - except ValueError: - return False - - return any(part.casefold() in ignored for part in relative_parts) - - -def filter_index_paths( - index_paths: Iterable[Path], - scan_root: Path, - *, - ignored_dir_names: Optional[Set[str]] = None, -) -> List[Path]: - """Filter out discovered indexes that belong to ignored/generated subtrees.""" - - return [ - path - for path in index_paths - if not is_ignored_index_path(path, scan_root, ignored_dir_names=ignored_dir_names) - ] diff --git a/codex-lens/src/codexlens/storage/index_tree.py b/codex-lens/src/codexlens/storage/index_tree.py 
deleted file mode 100644 index 0a7f7894..00000000 --- a/codex-lens/src/codexlens/storage/index_tree.py +++ /dev/null @@ -1,1320 +0,0 @@ -"""Hierarchical index tree builder for CodexLens. - -Constructs a bottom-up directory index tree with parallel processing support. -Each directory maintains its own _index.db with files and subdirectory links. -""" - -from __future__ import annotations - -import fnmatch -import logging -import os -import re -import sqlite3 -import time -from concurrent.futures import ProcessPoolExecutor, as_completed -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List, Optional, Set, Tuple - -from codexlens.config import Config -from codexlens.parsers.factory import ParserFactory -from codexlens.storage.dir_index import DirIndexStore -from codexlens.storage.global_index import GlobalSymbolIndex -from codexlens.storage.path_mapper import PathMapper -from codexlens.storage.registry import ProjectInfo, RegistryStore - - -DEFAULT_IGNORE_DIRS: Set[str] = { - ".git", - ".svn", - ".hg", - ".venv", - "venv", - "env", - "node_modules", - "bower_components", - "__pycache__", - ".pytest_cache", - ".mypy_cache", - ".ruff_cache", - ".npm", - ".yarn", - ".codexlens", - ".idea", - ".vscode", - ".vs", - ".eclipse", - "dist", - "build", - "out", - "target", - "bin", - "obj", - "_build", - "coverage", - "htmlcov", - ".cache", - ".parcel-cache", - ".turbo", - ".next", - ".nuxt", - "logs", - "tmp", - "temp", -} - - -@dataclass -class BuildResult: - """Complete build operation result.""" - - project_id: int - source_root: Path - index_root: Path - total_files: int - total_dirs: int - errors: List[str] - - -@dataclass -class DirBuildResult: - """Single directory build result.""" - - source_path: Path - index_path: Path - files_count: int - symbols_count: int - subdirs: List[str] # Subdirectory names - error: Optional[str] = None - - -class IndexTreeBuilder: - """Hierarchical index tree builder with parallel processing. 
- - Builds directory indexes bottom-up to enable proper subdirectory linking. - Each directory gets its own _index.db containing: - - Files in that directory - - Links to child directory indexes - - Symbols and FTS5 search - - Attributes: - registry: Global project registry - mapper: Path mapping between source and index - config: CodexLens configuration - parser_factory: Parser factory for symbol extraction - logger: Logger instance - IGNORE_DIRS: Set of directory names to skip during indexing - """ - - # Directories to skip during indexing - IGNORE_DIRS: Set[str] = DEFAULT_IGNORE_DIRS - - def __init__( - self, registry: RegistryStore, mapper: PathMapper, config: Config = None, incremental: bool = True - ): - """Initialize the index tree builder. - - Args: - registry: Global registry store for project tracking - mapper: Path mapper for source to index conversions - config: CodexLens configuration (uses defaults if None) - incremental: Enable incremental indexing (default True) - """ - self.registry = registry - self.mapper = mapper - self.config = config or Config.load() - self.parser_factory = ParserFactory(self.config) - self.logger = logging.getLogger(__name__) - self.incremental = incremental - self.ignore_patterns = self._resolve_ignore_patterns() - self.extension_filters = self._resolve_extension_filters() - - def _resolve_ignore_patterns(self) -> Tuple[str, ...]: - configured_patterns = getattr(self.config, "ignore_patterns", None) - raw_patterns = configured_patterns if configured_patterns else list(DEFAULT_IGNORE_DIRS) - cleaned: List[str] = [] - for item in raw_patterns: - pattern = str(item).strip().replace('\\', '/').rstrip('/') - if pattern: - cleaned.append(pattern) - return tuple(dict.fromkeys(cleaned)) - - def _resolve_extension_filters(self) -> Tuple[str, ...]: - configured_filters = getattr(self.config, "extension_filters", None) - if not configured_filters: - return tuple() - - cleaned: List[str] = [] - for item in configured_filters: - pattern 
= str(item).strip().replace('\\', '/').rstrip('/') - if pattern: - cleaned.append(pattern) - return tuple(dict.fromkeys(cleaned)) - - def _is_ignored_dir(self, dir_path: Path, source_root: Optional[Path] = None) -> bool: - name = dir_path.name - if name.startswith('.'): - return True - - rel_path: Optional[str] = None - if source_root is not None: - try: - rel_path = dir_path.relative_to(source_root).as_posix() - except ValueError: - rel_path = None - - for pattern in self.ignore_patterns: - if pattern == name or fnmatch.fnmatch(name, pattern): - return True - if rel_path and (pattern == rel_path or fnmatch.fnmatch(rel_path, pattern)): - return True - - return False - - def _is_filtered_file(self, file_path: Path, source_root: Optional[Path] = None) -> bool: - if not self.extension_filters: - return False - - rel_path: Optional[str] = None - if source_root is not None: - try: - rel_path = file_path.relative_to(source_root).as_posix() - except ValueError: - rel_path = None - - for pattern in self.extension_filters: - if pattern == file_path.name or fnmatch.fnmatch(file_path.name, pattern): - return True - if rel_path and (pattern == rel_path or fnmatch.fnmatch(rel_path, pattern)): - return True - - return False - - def build( - self, - source_root: Path, - languages: List[str] = None, - workers: int = None, - force_full: bool = False, - ) -> BuildResult: - """Build complete index tree for a project. - - Process: - 1. Register project in registry - 2. Collect all directories grouped by depth - 3. Build indexes bottom-up (deepest first) - 4. Link subdirectories to parents - 5. Update project statistics - 6. 
Cleanup deleted files (if incremental mode) - - Args: - source_root: Project root directory to index - languages: Optional list of language IDs to limit indexing - workers: Number of parallel worker processes - force_full: Force full reindex (override incremental mode) - - Returns: - BuildResult with statistics and errors - - Raises: - ValueError: If source_root doesn't exist - """ - source_root = source_root.resolve() - if not source_root.exists(): - raise ValueError(f"Source root does not exist: {source_root}") - - # Auto-detect optimal worker count if not specified - if workers is None: - workers = min(os.cpu_count() or 4, 16) # Cap at 16 workers - self.logger.debug("Auto-detected %d workers for parallel indexing", workers) - - # Override incremental mode if force_full is True - use_incremental = self.incremental and not force_full - if force_full: - self.logger.info("Building index tree for %s (FULL reindex)", source_root) - else: - self.logger.info("Building index tree for %s (incremental=%s)", source_root, use_incremental) - - # Register project - index_root = self.mapper.source_to_index_dir(source_root) - project_info = self.registry.register_project(source_root, index_root) - global_index_db_path = index_root / GlobalSymbolIndex.DEFAULT_DB_NAME - - global_index: GlobalSymbolIndex | None = None - if self.config.global_symbol_index_enabled: - global_index = GlobalSymbolIndex(global_index_db_path, project_id=project_info.id) - global_index.initialize() - - # Report progress: discovering files (5%) - print("Discovering files...", flush=True) - - # Collect directories by depth - dirs_by_depth = self._collect_dirs_by_depth(source_root, languages) - - if force_full: - pruned_dirs = self._prune_stale_project_dirs( - project_id=project_info.id, - source_root=source_root, - dirs_by_depth=dirs_by_depth, - ) - if pruned_dirs: - self.logger.info( - "Pruned %d stale directory mappings before full rebuild", - len(pruned_dirs), - ) - - if not dirs_by_depth: - 
self.logger.warning("No indexable directories found in %s", source_root) - if global_index is not None: - global_index.close() - return BuildResult( - project_id=project_info.id, - source_root=source_root, - index_root=index_root, - total_files=0, - total_dirs=0, - errors=["No indexable directories found"], - ) - - # Calculate total directories for progress tracking - total_dirs_to_process = sum(len(dirs) for dirs in dirs_by_depth.values()) - processed_dirs = 0 - - # Report progress: building index (10%) - print("Building index...", flush=True) - - total_files = 0 - total_dirs = 0 - all_errors: List[str] = [] - all_results: List[DirBuildResult] = [] # Store all results for subdir linking - - # Build bottom-up (highest depth first) - max_depth = max(dirs_by_depth.keys()) - for depth in range(max_depth, -1, -1): - if depth not in dirs_by_depth: - continue - - dirs = dirs_by_depth[depth] - self.logger.info("Building %d directories at depth %d", len(dirs), depth) - - # Build directories at this level in parallel - results = self._build_level_parallel( - dirs, - languages, - workers, - source_root=source_root, - project_id=project_info.id, - global_index_db_path=global_index_db_path, - ) - all_results.extend(results) - - # Process results - for result in results: - if result.error: - all_errors.append(f"{result.source_path}: {result.error}") - processed_dirs += 1 - continue - - total_files += result.files_count - total_dirs += 1 - processed_dirs += 1 - - # Report progress for each processed directory (10-80%) - # Use "Processing file" format for frontend parser compatibility - progress_percent = 10 + int((processed_dirs / total_dirs_to_process) * 70) - print(f"Processing file {processed_dirs}/{total_dirs_to_process}: {result.source_path.name}", flush=True) - - # Register directory in registry - self.registry.register_dir( - project_id=project_info.id, - source_path=result.source_path, - index_path=result.index_path, - 
depth=self.mapper.get_relative_depth(result.source_path, source_root), - files_count=result.files_count, - ) - - # Report progress: linking subdirectories (80%) - print("Linking subdirectories...", flush=True) - - # After building all directories, link subdirectories to parents - # This needs to happen after all indexes exist - for result in all_results: - if result.error: - continue - # Link children to this directory - self._link_children_to_parent(result.source_path, all_results) - - # Cleanup deleted files if in incremental mode - if use_incremental: - # Report progress: cleaning up (90%) - print("Cleaning up deleted files...", flush=True) - self.logger.info("Cleaning up deleted files...") - total_deleted = 0 - for result in all_results: - if result.error: - continue - try: - with DirIndexStore(result.index_path, config=self.config, global_index=global_index) as store: - deleted_count = store.cleanup_deleted_files(result.source_path) - if deleted_count > 0: - _compute_graph_neighbors(store, logger=self.logger) - store.update_merkle_root() - total_deleted += deleted_count - if deleted_count > 0: - self.logger.debug("Removed %d deleted files from %s", deleted_count, result.source_path) - except Exception as exc: - self.logger.warning("Cleanup failed for %s: %s", result.source_path, exc) - - if total_deleted > 0: - self.logger.info("Removed %d deleted files from index", total_deleted) - - # Report progress: finalizing (95%) - print("Finalizing...", flush=True) - - # Update project statistics - self.registry.update_project_stats(source_root, total_files, total_dirs) - - # Report completion (100%) - print(f"Indexed {total_files} files", flush=True) - - self.logger.info( - "Index build complete: %d files, %d directories, %d errors", - total_files, - total_dirs, - len(all_errors), - ) - - if global_index is not None: - global_index.close() - - return BuildResult( - project_id=project_info.id, - source_root=source_root, - index_root=index_root, - 
total_files=total_files, - total_dirs=total_dirs, - errors=all_errors, - ) - - def update_subtree( - self, - source_path: Path, - languages: List[str] = None, - workers: int = None, - ) -> BuildResult: - """Incrementally update a subtree. - - Rebuilds indexes for the specified directory and all subdirectories. - Useful for incremental updates when only part of the tree changed. - - Args: - source_path: Root of subtree to update - languages: Optional list of language IDs to limit indexing - workers: Number of parallel worker processes - - Returns: - BuildResult for the subtree - - Raises: - ValueError: If source_path is not indexed - """ - source_path = source_path.resolve() - project_root = self.mapper.get_project_root(source_path) - - # Get project info - project_info = self.registry.get_project(project_root) - if not project_info: - raise ValueError(f"Directory not indexed: {source_path}") - - self.logger.info("Updating subtree at %s", source_path) - - # Use build logic but start from source_path - return self.build(source_path, languages, workers) - - def rebuild_dir(self, source_path: Path) -> DirBuildResult: - """Rebuild index for a single directory. - - Only rebuilds the specified directory, does not touch subdirectories. - Useful for updating a single directory after file changes. 
- - Args: - source_path: Directory to rebuild - - Returns: - DirBuildResult for the directory - """ - source_path = source_path.resolve() - self.logger.info("Rebuilding directory %s", source_path) - project_root = self.mapper.get_project_root(source_path) - project_info = self.registry.get_project(project_root) - if not project_info: - raise ValueError(f"Directory not indexed: {source_path}") - - global_index_db_path = project_info.index_root / GlobalSymbolIndex.DEFAULT_DB_NAME - return self._build_single_dir( - source_path, - languages=None, - source_root=project_root, - project_id=project_info.id, - global_index_db_path=global_index_db_path, - ) - - # === Internal Methods === - - def _prune_stale_project_dirs( - self, - *, - project_id: int, - source_root: Path, - dirs_by_depth: Dict[int, List[Path]], - ) -> List[Path]: - """Remove registry mappings for directories no longer included in the index tree.""" - source_root = source_root.resolve() - valid_dirs: Set[Path] = { - path.resolve() - for paths in dirs_by_depth.values() - for path in paths - } - valid_dirs.add(source_root) - - stale_mappings = [] - for mapping in self.registry.get_project_dirs(project_id): - mapping_path = mapping.source_path.resolve() - if mapping_path in valid_dirs: - continue - try: - mapping_path.relative_to(source_root) - except ValueError: - continue - stale_mappings.append(mapping) - - stale_mappings.sort( - key=lambda mapping: len(mapping.source_path.resolve().relative_to(source_root).parts), - reverse=True, - ) - - pruned_paths: List[Path] = [] - for mapping in stale_mappings: - try: - if self.registry.unregister_dir(mapping.source_path): - pruned_paths.append(mapping.source_path.resolve()) - except Exception as exc: - self.logger.warning( - "Failed to prune stale mapping for %s: %s", - mapping.source_path, - exc, - ) - - return pruned_paths - - def _collect_dirs_by_depth( - self, source_root: Path, languages: List[str] = None - ) -> Dict[int, List[Path]]: - """Collect all indexable 
directories grouped by depth. - - Walks the directory tree and groups directories by their depth - relative to source_root. Depth 0 is the root itself. - - Args: - source_root: Root directory to start from - languages: Optional language filter - - Returns: - Dictionary mapping depth to list of directory paths - Example: {0: [root], 1: [src, tests], 2: [src/api, src/utils]} - """ - source_root = source_root.resolve() - dirs_by_depth: Dict[int, List[Path]] = {} - - # Always include the root directory at depth 0 for chain search entry point - dirs_by_depth[0] = [source_root] - - for root, dirnames, _ in os.walk(source_root): - # Filter out ignored directories - root_path = Path(root) - dirnames[:] = [ - d - for d in dirnames - if not self._is_ignored_dir(root_path / d, source_root) - ] - - root_path = Path(root) - - # Skip root (already added) - if root_path == source_root: - continue - - # Check if this directory should be indexed - if not self._should_index_dir(root_path, languages, source_root=source_root): - continue - - # Calculate depth relative to source_root - try: - depth = len(root_path.relative_to(source_root).parts) - except ValueError: - continue - - if depth not in dirs_by_depth: - dirs_by_depth[depth] = [] - - dirs_by_depth[depth].append(root_path) - - return dirs_by_depth - - def _should_index_dir(self, dir_path: Path, languages: List[str] = None, source_root: Optional[Path] = None) -> bool: - """Check if directory should be indexed. - - A directory is indexed if: - 1. It's not in IGNORE_DIRS - 2. It doesn't start with '.' - 3. It contains at least one supported language file, OR - 4. 
It has subdirectories that contain supported files (transitive) - - Args: - dir_path: Directory to check - languages: Optional language filter - - Returns: - True if directory should be indexed - """ - # Check directory name - if self._is_ignored_dir(dir_path, source_root): - return False - - # Check for supported files in this directory - source_files = self._iter_source_files(dir_path, languages, source_root=source_root) - if len(source_files) > 0: - return True - - # Check if any subdirectory has indexable files (transitive) - # This handles cases like 'src' which has no direct files but has 'src/codexlens' - for item in dir_path.iterdir(): - if not item.is_dir(): - continue - if self._is_ignored_dir(item, source_root): - continue - # Recursively check subdirectories - if self._has_indexable_files_recursive(item, languages, source_root=source_root): - return True - - return False - - def _has_indexable_files_recursive(self, dir_path: Path, languages: List[str] = None, source_root: Optional[Path] = None) -> bool: - """Check if directory or any subdirectory has indexable files. - - Args: - dir_path: Directory to check - languages: Optional language filter - - Returns: - True if directory tree contains indexable files - """ - # Check for supported files in this directory - source_files = self._iter_source_files(dir_path, languages, source_root=source_root) - if len(source_files) > 0: - return True - - # Check subdirectories - try: - for item in dir_path.iterdir(): - if not item.is_dir(): - continue - if self._is_ignored_dir(item, source_root): - continue - if self._has_indexable_files_recursive(item, languages, source_root=source_root): - return True - except PermissionError: - pass - - return False - - def _build_level_parallel( - self, - dirs: List[Path], - languages: List[str], - workers: int, - *, - source_root: Path, - project_id: int, - global_index_db_path: Path, - ) -> List[DirBuildResult]: - """Build multiple directories in parallel. 
- - Uses ProcessPoolExecutor to build directories concurrently. - All directories at the same level are independent and can be - processed in parallel. - - Args: - dirs: List of directories to build - languages: Language filter - workers: Number of worker processes - - Returns: - List of DirBuildResult objects - """ - results: List[DirBuildResult] = [] - - if not dirs: - return results - - # For single directory, avoid overhead of process pool - if len(dirs) == 1: - result = self._build_single_dir( - dirs[0], - languages, - source_root=source_root, - project_id=project_id, - global_index_db_path=global_index_db_path, - ) - return [result] - - # Prepare arguments for worker processes - config_dict = { - "data_dir": str(self.config.data_dir), - "supported_languages": self.config.supported_languages, - "parsing_rules": self.config.parsing_rules, - "global_symbol_index_enabled": self.config.global_symbol_index_enabled, - "static_graph_enabled": self.config.static_graph_enabled, - "static_graph_relationship_types": self.config.static_graph_relationship_types, - "use_astgrep": getattr(self.config, "use_astgrep", False), - "ignore_patterns": list(self.ignore_patterns), - "extension_filters": list(self.extension_filters), - "incremental": bool(self.incremental), - } - - worker_args = [ - ( - dir_path, - self.mapper.source_to_index_db(dir_path), - languages, - config_dict, - int(project_id), - str(global_index_db_path), - str(source_root), - ) - for dir_path in dirs - ] - - # Execute in parallel - with ProcessPoolExecutor(max_workers=workers) as executor: - futures = { - executor.submit(_build_dir_worker, args): args[0] - for args in worker_args - } - - for future in as_completed(futures): - try: - result = future.result() - results.append(result) - except Exception as exc: - dir_path = futures[future] - self.logger.error("Failed to build %s: %s", dir_path, exc) - results.append( - DirBuildResult( - source_path=dir_path, - 
index_path=self.mapper.source_to_index_db(dir_path), - files_count=0, - symbols_count=0, - subdirs=[], - error=str(exc), - ) - ) - - return results - - def _build_single_dir( - self, - dir_path: Path, - languages: List[str] = None, - *, - source_root: Path, - project_id: int, - global_index_db_path: Path, - ) -> DirBuildResult: - """Build index for a single directory. - - Creates _index.db and indexes all files in the directory. - Does not recurse into subdirectories. - - Args: - dir_path: Directory to index - languages: Optional language filter - - Returns: - DirBuildResult with statistics and subdirectory list - """ - dir_path = dir_path.resolve() - index_db_path = self.mapper.source_to_index_db(dir_path) - - global_index: GlobalSymbolIndex | None = None - try: - # Ensure index directory exists - index_db_path.parent.mkdir(parents=True, exist_ok=True) - - if not self.incremental: - _reset_index_db_files(index_db_path) - - # Create directory index - if self.config.global_symbol_index_enabled: - global_index = GlobalSymbolIndex(global_index_db_path, project_id=project_id) - global_index.initialize() - - store = DirIndexStore(index_db_path, config=self.config, global_index=global_index) - store.initialize() - - # Get source files in this directory only - source_files = self._iter_source_files(dir_path, languages, source_root=source_root) - - files_count = 0 - symbols_count = 0 - skipped_count = 0 - - for file_path in source_files: - try: - # Check if file needs reindexing (incremental mode) - if self.incremental and not store.needs_reindex(file_path): - skipped_count += 1 - continue - - # Read and parse file - text = file_path.read_text(encoding="utf-8", errors="ignore") - language_id = self.config.language_for_path(file_path) - if not language_id: - continue - - parser = self.parser_factory.get_parser(language_id) - indexed_file = parser.parse(text, file_path) - - # Add to directory index - store.add_file( - name=file_path.name, - full_path=file_path, - 
content=text, - language=language_id, - symbols=indexed_file.symbols, - relationships=indexed_file.relationships, - ) - - # Write global relationships if enabled - if ( - self.config.static_graph_enabled - and global_index is not None - and indexed_file.relationships - ): - try: - filtered_rels = [ - r for r in indexed_file.relationships - if r.relationship_type.value in self.config.static_graph_relationship_types - ] - if filtered_rels: - global_index.update_file_relationships( - file_path, filtered_rels - ) - except Exception as rel_exc: - self.logger.warning( - "Failed to write global relationships for %s: %s", - file_path, rel_exc, - ) - - files_count += 1 - symbols_count += len(indexed_file.symbols) - - except Exception as exc: - self.logger.debug("Failed to index %s: %s", file_path, exc) - continue - - if files_count > 0: - _compute_graph_neighbors(store, logger=self.logger) - - # Get list of subdirectories - subdirs = [ - d.name - for d in dir_path.iterdir() - if d.is_dir() - and not self._is_ignored_dir(d, source_root=source_root) - ] - - store.update_merkle_root() - store.close() - if global_index is not None: - global_index.close() - - if skipped_count > 0: - self.logger.debug( - "Built %s: %d files indexed, %d skipped (unchanged), %d symbols, %d subdirs", - dir_path, - files_count, - skipped_count, - symbols_count, - len(subdirs), - ) - else: - self.logger.debug( - "Built %s: %d files, %d symbols, %d subdirs", - dir_path, - files_count, - symbols_count, - len(subdirs), - ) - - return DirBuildResult( - source_path=dir_path, - index_path=index_db_path, - files_count=files_count, - symbols_count=symbols_count, - subdirs=subdirs, - ) - - except Exception as exc: - self.logger.error("Failed to build directory %s: %s", dir_path, exc) - if global_index is not None: - try: - global_index.close() - except Exception: - pass - return DirBuildResult( - source_path=dir_path, - index_path=index_db_path, - files_count=0, - symbols_count=0, - subdirs=[], - 
error=str(exc), - ) - - def _link_children_to_parent( - self, parent_path: Path, all_results: List[DirBuildResult] - ) -> None: - """Link child directory indexes to parent's subdirs table. - - Finds all direct children of parent_path in all_results and - registers them as subdirectories in the parent's index. - - Args: - parent_path: Parent directory path - all_results: List of all build results - """ - parent_index_db = self.mapper.source_to_index_db(parent_path) - - try: - with DirIndexStore(parent_index_db, config=self.config) as store: - for result in all_results: - # Only register direct children (parent is one level up) - if result.source_path.parent != parent_path: - continue - - if result.error: - continue - - # Register subdirectory link - store.register_subdir( - name=result.source_path.name, - index_path=result.index_path, - files_count=result.files_count, - direct_files=result.files_count, - ) - self.logger.debug( - "Linked %s to parent %s", - result.source_path.name, - parent_path, - ) - - store.update_merkle_root() - - except Exception as exc: - self.logger.error( - "Failed to link children to %s: %s", parent_path, exc - ) - - def _iter_source_files( - self, dir_path: Path, languages: List[str] = None, source_root: Optional[Path] = None - ) -> List[Path]: - """Iterate source files in directory (non-recursive). - - Returns files in the specified directory that match language filters. - Does not recurse into subdirectories. 
- - Args: - dir_path: Directory to scan - languages: Optional language filter - - Returns: - List of source file paths - """ - files: List[Path] = [] - - if not dir_path.is_dir(): - return files - - for item in dir_path.iterdir(): - if not item.is_file(): - continue - - if item.name.startswith("."): - continue - - if self._is_filtered_file(item, source_root=source_root): - continue - - # Check language support - language_id = self.config.language_for_path(item) - if not language_id: - continue - - # Apply language filter - if languages and language_id not in languages: - continue - - files.append(item) - - return files - - -def _normalize_relationship_target(target: str) -> str: - """Best-effort normalization of a relationship target into a local symbol name.""" - target = (target or "").strip() - if not target: - return "" - - # Drop trailing call parentheses when present (e.g., "foo()" -> "foo"). - if target.endswith("()"): - target = target[:-2] - - # Keep the leaf identifier for common qualified formats. - for sep in ("::", ".", "#"): - if sep in target: - target = target.split(sep)[-1] - - # Strip non-identifier suffix/prefix noise. - target = re.sub(r"^[^A-Za-z0-9_]+", "", target) - target = re.sub(r"[^A-Za-z0-9_]+$", "", target) - return target - - -def _compute_graph_neighbors( - store: DirIndexStore, - *, - max_depth: int = 2, - logger: Optional[logging.Logger] = None, -) -> None: - """Compute and persist N-hop neighbors for all symbols in a directory index.""" - if max_depth <= 0: - return - - log = logger or logging.getLogger(__name__) - - with store._lock: - conn = store._get_connection() - conn.row_factory = sqlite3.Row - - # Ensure schema exists even for older databases pinned to the same user_version. 
- try: - from codexlens.storage.migrations.migration_007_add_graph_neighbors import upgrade - - upgrade(conn) - except Exception as exc: - log.debug("Graph neighbor schema ensure failed: %s", exc) - - cursor = conn.cursor() - - try: - cursor.execute("DELETE FROM graph_neighbors") - except sqlite3.Error: - # Table missing or schema mismatch; skip gracefully. - return - - try: - symbol_rows = cursor.execute( - "SELECT id, file_id, name FROM symbols" - ).fetchall() - rel_rows = cursor.execute( - "SELECT source_symbol_id, target_qualified_name FROM code_relationships" - ).fetchall() - except sqlite3.Error: - return - - if not symbol_rows or not rel_rows: - try: - conn.commit() - except sqlite3.Error: - pass - return - - symbol_file_by_id: Dict[int, int] = {} - symbols_by_file_and_name: Dict[Tuple[int, str], List[int]] = {} - symbols_by_name: Dict[str, List[int]] = {} - - for row in symbol_rows: - symbol_id = int(row["id"]) - file_id = int(row["file_id"]) - name = str(row["name"]) - symbol_file_by_id[symbol_id] = file_id - symbols_by_file_and_name.setdefault((file_id, name), []).append(symbol_id) - symbols_by_name.setdefault(name, []).append(symbol_id) - - adjacency: Dict[int, Set[int]] = {} - - for row in rel_rows: - source_id = int(row["source_symbol_id"]) - target_raw = str(row["target_qualified_name"] or "") - target_name = _normalize_relationship_target(target_raw) - if not target_name: - continue - - source_file_id = symbol_file_by_id.get(source_id) - if source_file_id is None: - continue - - candidate_ids = symbols_by_file_and_name.get((source_file_id, target_name)) - if not candidate_ids: - global_candidates = symbols_by_name.get(target_name, []) - # Only resolve cross-file by name when unambiguous. 
- candidate_ids = global_candidates if len(global_candidates) == 1 else [] - - for target_id in candidate_ids: - if target_id == source_id: - continue - adjacency.setdefault(source_id, set()).add(target_id) - adjacency.setdefault(target_id, set()).add(source_id) - - if not adjacency: - try: - conn.commit() - except sqlite3.Error: - pass - return - - insert_rows: List[Tuple[int, int, int]] = [] - max_depth = min(int(max_depth), 2) - - for source_id, first_hop in adjacency.items(): - if not first_hop: - continue - for neighbor_id in first_hop: - insert_rows.append((source_id, neighbor_id, 1)) - - if max_depth < 2: - continue - - second_hop: Set[int] = set() - for neighbor_id in first_hop: - second_hop.update(adjacency.get(neighbor_id, set())) - - second_hop.discard(source_id) - second_hop.difference_update(first_hop) - - for neighbor_id in second_hop: - insert_rows.append((source_id, neighbor_id, 2)) - - if not insert_rows: - try: - conn.commit() - except sqlite3.Error: - pass - return - - try: - cursor.executemany( - """ - INSERT INTO graph_neighbors( - source_symbol_id, neighbor_symbol_id, relationship_depth - ) - VALUES(?, ?, ?) 
- """, - insert_rows, - ) - conn.commit() - except sqlite3.Error: - return - - -# === Worker Function for ProcessPoolExecutor === - - -def _matches_path_patterns(path: Path, patterns: List[str], source_root: Optional[Path] = None) -> bool: - rel_path: Optional[str] = None - if source_root is not None: - try: - rel_path = path.relative_to(source_root).as_posix() - except ValueError: - rel_path = None - - for pattern in patterns: - normalized = str(pattern).strip().replace('\\', '/').rstrip('/') - if not normalized: - continue - if normalized == path.name or fnmatch.fnmatch(path.name, normalized): - return True - if rel_path and (normalized == rel_path or fnmatch.fnmatch(rel_path, normalized)): - return True - return False - - -def _matches_ignore_patterns(path: Path, patterns: List[str], source_root: Optional[Path] = None) -> bool: - if path.name.startswith('.'): - return True - return _matches_path_patterns(path, patterns, source_root) - - -def _matches_extension_filters(path: Path, patterns: List[str], source_root: Optional[Path] = None) -> bool: - if not patterns: - return False - return _matches_path_patterns(path, patterns, source_root) - - -def _reset_index_db_files(index_db_path: Path) -> None: - """Best-effort removal of a directory index DB and common SQLite sidecars.""" - for suffix in ("", "-wal", "-shm", "-journal"): - target = Path(f"{index_db_path}{suffix}") if suffix else index_db_path - try: - target.unlink() - except FileNotFoundError: - continue - except OSError: - continue - - -def _build_dir_worker(args: tuple) -> DirBuildResult: - """Worker function for parallel directory building. - - Must be at module level for ProcessPoolExecutor pickling. - Reconstructs necessary objects from serializable arguments. 
- - Args: - args: Tuple of (dir_path, index_db_path, languages, config_dict, project_id, global_index_db_path, source_root) - - Returns: - DirBuildResult for the directory - """ - dir_path, index_db_path, languages, config_dict, project_id, global_index_db_path, source_root = args - - # Reconstruct config - config = Config( - data_dir=Path(config_dict["data_dir"]), - supported_languages=config_dict["supported_languages"], - parsing_rules=config_dict["parsing_rules"], - global_symbol_index_enabled=bool(config_dict.get("global_symbol_index_enabled", True)), - static_graph_enabled=bool(config_dict.get("static_graph_enabled", False)), - static_graph_relationship_types=list(config_dict.get("static_graph_relationship_types", ["imports", "inherits"])), - use_astgrep=bool(config_dict.get("use_astgrep", False)), - ignore_patterns=list(config_dict.get("ignore_patterns", [])), - extension_filters=list(config_dict.get("extension_filters", [])), - ) - - parser_factory = ParserFactory(config) - source_root_path = Path(source_root) if source_root else None - - global_index: GlobalSymbolIndex | None = None - try: - # Ensure index directory exists - index_db_path.parent.mkdir(parents=True, exist_ok=True) - - # Create directory index - if config.global_symbol_index_enabled and global_index_db_path: - global_index = GlobalSymbolIndex(Path(global_index_db_path), project_id=int(project_id)) - global_index.initialize() - - if not bool(config_dict.get("incremental", True)): - _reset_index_db_files(index_db_path) - - store = DirIndexStore(index_db_path, config=config, global_index=global_index) - store.initialize() - - files_count = 0 - symbols_count = 0 - - # Index files in this directory - for item in dir_path.iterdir(): - if not item.is_file(): - continue - - if item.name.startswith("."): - continue - - if _matches_extension_filters(item, config.extension_filters, source_root_path): - continue - - language_id = config.language_for_path(item) - if not language_id: - continue - - if 
languages and language_id not in languages: - continue - - try: - text = item.read_text(encoding="utf-8", errors="ignore") - parser = parser_factory.get_parser(language_id) - indexed_file = parser.parse(text, item) - - store.add_file( - name=item.name, - full_path=item, - content=text, - language=language_id, - symbols=indexed_file.symbols, - relationships=indexed_file.relationships, - ) - - # Write global relationships if enabled - if ( - config.static_graph_enabled - and global_index is not None - and indexed_file.relationships - ): - try: - allowed_types = config.static_graph_relationship_types - filtered_rels = [ - r for r in indexed_file.relationships - if r.relationship_type.value in allowed_types - ] - if filtered_rels: - global_index.update_file_relationships( - item, filtered_rels - ) - except Exception: - pass # Don't block indexing - - files_count += 1 - symbols_count += len(indexed_file.symbols) - - except Exception: - continue - - if files_count > 0: - _compute_graph_neighbors(store) - - # Get subdirectories - ignore_patterns = list(config_dict.get("ignore_patterns", [])) or list(DEFAULT_IGNORE_DIRS) - subdirs = [ - d.name - for d in dir_path.iterdir() - if d.is_dir() and not _matches_ignore_patterns(d, ignore_patterns, source_root_path) - ] - - store.update_merkle_root() - store.close() - if global_index is not None: - global_index.close() - - return DirBuildResult( - source_path=dir_path, - index_path=index_db_path, - files_count=files_count, - symbols_count=symbols_count, - subdirs=subdirs, - ) - - except Exception as exc: - if global_index is not None: - try: - global_index.close() - except Exception: - pass - return DirBuildResult( - source_path=dir_path, - index_path=index_db_path, - files_count=0, - symbols_count=0, - subdirs=[], - error=str(exc), - ) diff --git a/codex-lens/src/codexlens/storage/merkle_tree.py b/codex-lens/src/codexlens/storage/merkle_tree.py deleted file mode 100644 index c8c76988..00000000 --- 
a/codex-lens/src/codexlens/storage/merkle_tree.py +++ /dev/null @@ -1,136 +0,0 @@ -"""Merkle tree utilities for change detection. - -This module provides a generic, file-system based Merkle tree implementation -that can be used to efficiently diff directory states. -""" - -from __future__ import annotations - -import hashlib -from dataclasses import dataclass, field -from pathlib import Path -from typing import Dict, Iterable, List, Optional - - -def sha256_bytes(data: bytes) -> str: - return hashlib.sha256(data).hexdigest() - - -def sha256_text(text: str) -> str: - return sha256_bytes(text.encode("utf-8", errors="ignore")) - - -@dataclass -class MerkleNode: - """A Merkle node representing either a file (leaf) or directory (internal).""" - - name: str - rel_path: str - hash: str - is_dir: bool - children: Dict[str, "MerkleNode"] = field(default_factory=dict) - - def iter_files(self) -> Iterable["MerkleNode"]: - if not self.is_dir: - yield self - return - for child in self.children.values(): - yield from child.iter_files() - - -@dataclass -class MerkleTree: - """Merkle tree for a directory snapshot.""" - - root: MerkleNode - - @classmethod - def build_from_directory(cls, root_dir: Path) -> "MerkleTree": - root_dir = Path(root_dir).resolve() - node = cls._build_node(root_dir, base=root_dir) - return cls(root=node) - - @classmethod - def _build_node(cls, path: Path, *, base: Path) -> MerkleNode: - if path.is_file(): - rel = str(path.relative_to(base)).replace("\\", "/") - return MerkleNode( - name=path.name, - rel_path=rel, - hash=sha256_bytes(path.read_bytes()), - is_dir=False, - ) - - if not path.is_dir(): - rel = str(path.relative_to(base)).replace("\\", "/") - return MerkleNode(name=path.name, rel_path=rel, hash="", is_dir=False) - - children: Dict[str, MerkleNode] = {} - for child in sorted(path.iterdir(), key=lambda p: p.name): - child_node = cls._build_node(child, base=base) - children[child_node.name] = child_node - - items = [ - f"{'d' if n.is_dir else 
'f'}:{name}:{n.hash}" - for name, n in sorted(children.items(), key=lambda kv: kv[0]) - ] - dir_hash = sha256_text("\n".join(items)) - - rel_path = "." if path == base else str(path.relative_to(base)).replace("\\", "/") - return MerkleNode( - name="." if path == base else path.name, - rel_path=rel_path, - hash=dir_hash, - is_dir=True, - children=children, - ) - - @staticmethod - def find_changed_files(old: Optional["MerkleTree"], new: Optional["MerkleTree"]) -> List[str]: - """Find changed/added/removed files between two trees. - - Returns: - List of relative file paths (POSIX-style separators). - """ - if old is None and new is None: - return [] - if old is None: - return sorted({n.rel_path for n in new.root.iter_files()}) # type: ignore[union-attr] - if new is None: - return sorted({n.rel_path for n in old.root.iter_files()}) - - changed: set[str] = set() - - def walk(old_node: Optional[MerkleNode], new_node: Optional[MerkleNode]) -> None: - if old_node is None and new_node is None: - return - - if old_node is None and new_node is not None: - changed.update(n.rel_path for n in new_node.iter_files()) - return - - if new_node is None and old_node is not None: - changed.update(n.rel_path for n in old_node.iter_files()) - return - - assert old_node is not None and new_node is not None - - if old_node.hash == new_node.hash: - return - - if not old_node.is_dir and not new_node.is_dir: - changed.add(new_node.rel_path) - return - - if old_node.is_dir != new_node.is_dir: - changed.update(n.rel_path for n in old_node.iter_files()) - changed.update(n.rel_path for n in new_node.iter_files()) - return - - names = set(old_node.children.keys()) | set(new_node.children.keys()) - for name in names: - walk(old_node.children.get(name), new_node.children.get(name)) - - walk(old.root, new.root) - return sorted(changed) - diff --git a/codex-lens/src/codexlens/storage/migration_manager.py b/codex-lens/src/codexlens/storage/migration_manager.py deleted file mode 100644 index 
d8690806..00000000 --- a/codex-lens/src/codexlens/storage/migration_manager.py +++ /dev/null @@ -1,154 +0,0 @@ -""" -Manages database schema migrations. - -This module provides a framework for applying versioned migrations to the SQLite -database. Migrations are discovered from the `codexlens.storage.migrations` -package and applied sequentially. The database schema version is tracked using -the `user_version` pragma. -""" - -import importlib -import logging -import pkgutil -from pathlib import Path -from sqlite3 import Connection -from typing import List, NamedTuple - -log = logging.getLogger(__name__) - - -class Migration(NamedTuple): - """Represents a single database migration.""" - - version: int - name: str - upgrade: callable - - -def discover_migrations() -> List[Migration]: - """ - Discovers and returns a sorted list of database migrations. - - Migrations are expected to be in the `codexlens.storage.migrations` package, - with filenames in the format `migration_XXX_description.py`, where XXX is - the version number. Each migration module must contain an `upgrade` function - that takes a `sqlite3.Connection` object as its argument. - - Returns: - A list of Migration objects, sorted by version. 
- """ - import codexlens.storage.migrations - - migrations = [] - package_path = Path(codexlens.storage.migrations.__file__).parent - - for _, name, _ in pkgutil.iter_modules([str(package_path)]): - if name.startswith("migration_"): - try: - version = int(name.split("_")[1]) - module = importlib.import_module(f"codexlens.storage.migrations.{name}") - if hasattr(module, "upgrade"): - migrations.append( - Migration(version=version, name=name, upgrade=module.upgrade) - ) - else: - log.warning(f"Migration {name} is missing 'upgrade' function.") - except (ValueError, IndexError) as e: - log.warning(f"Could not parse migration name {name}: {e}") - except ImportError as e: - log.warning(f"Could not import migration {name}: {e}") - - migrations.sort(key=lambda m: m.version) - return migrations - - -class MigrationManager: - """ - Manages the application of migrations to a database. - """ - - def __init__(self, db_conn: Connection): - """ - Initializes the MigrationManager. - - Args: - db_conn: The SQLite database connection. - """ - self.db_conn = db_conn - self.migrations = discover_migrations() - - def get_current_version(self) -> int: - """ - Gets the current version of the database schema. - - Returns: - The current schema version number. - """ - return self.db_conn.execute("PRAGMA user_version").fetchone()[0] - - def set_version(self, version: int): - """ - Sets the database schema version. - - Args: - version: The version number to set. - """ - self.db_conn.execute(f"PRAGMA user_version = {version}") - log.info(f"Database schema version set to {version}") - - def apply_migrations(self): - """ - Applies all pending migrations to the database. - - This method checks the current database version and applies all - subsequent migrations in order. Each migration is applied within - a transaction, unless the migration manages its own transactions. 
- """ - current_version = self.get_current_version() - log.info(f"Current database schema version: {current_version}") - - for migration in self.migrations: - if migration.version > current_version: - log.info(f"Applying migration {migration.version}: {migration.name}...") - try: - # Check if a transaction is already in progress - in_transaction = self.db_conn.in_transaction - - # Only start transaction if not already in one - if not in_transaction: - self.db_conn.execute("BEGIN") - - migration.upgrade(self.db_conn) - self.set_version(migration.version) - - # Only commit if we started the transaction and it's still active - if not in_transaction and self.db_conn.in_transaction: - self.db_conn.execute("COMMIT") - - log.info( - f"Successfully applied migration {migration.version}: {migration.name}" - ) - except Exception as e: - log.error( - f"Failed to apply migration {migration.version}: {migration.name}. Error: {e}", - exc_info=True, - ) - # Try to rollback if transaction is active - try: - if self.db_conn.in_transaction: - self.db_conn.execute("ROLLBACK") - except Exception: - pass # Ignore rollback errors - raise - - latest_migration_version = self.migrations[-1].version if self.migrations else 0 - if current_version < latest_migration_version: - # This case can be hit if migrations were applied but the loop was exited - # and set_version was not called for the last one for some reason. - # To be safe, we explicitly set the version to the latest known migration. - final_version = self.get_current_version() - if final_version != latest_migration_version: - log.warning(f"Database version ({final_version}) is not the latest migration version ({latest_migration_version}). 
This may indicate a problem.") - - log.info("All pending migrations applied successfully.") - diff --git a/codex-lens/src/codexlens/storage/migrations/__init__.py b/codex-lens/src/codexlens/storage/migrations/__init__.py deleted file mode 100644 index 06e14729..00000000 --- a/codex-lens/src/codexlens/storage/migrations/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# This file makes the 'migrations' directory a Python package. diff --git a/codex-lens/src/codexlens/storage/migrations/migration_001_normalize_keywords.py b/codex-lens/src/codexlens/storage/migrations/migration_001_normalize_keywords.py deleted file mode 100644 index 97df06fd..00000000 --- a/codex-lens/src/codexlens/storage/migrations/migration_001_normalize_keywords.py +++ /dev/null @@ -1,123 +0,0 @@ -""" -Migration 001: Normalize keywords into separate tables. - -This migration introduces two new tables, `keywords` and `file_keywords`, to -store semantic keywords in a normalized fashion. It then migrates the existing -keywords from the `semantic_data` JSON blob in the `files` table into these -new tables. This is intended to speed up keyword-based searches significantly. -""" - -import json -import logging -from sqlite3 import Connection - -log = logging.getLogger(__name__) - - -def upgrade(db_conn: Connection): - """ - Applies the migration to normalize keywords. - - - Creates `keywords` and `file_keywords` tables. - - Creates indexes for efficient querying. - - Migrates data from `files.semantic_data` to the new tables. - - Args: - db_conn: The SQLite database connection. 
- """ - cursor = db_conn.cursor() - - log.info("Creating 'keywords' and 'file_keywords' tables...") - # Create a table to store unique keywords - cursor.execute( - """ - CREATE TABLE IF NOT EXISTS keywords ( - id INTEGER PRIMARY KEY, - keyword TEXT NOT NULL UNIQUE - ) - """ - ) - - # Create a join table to link files and keywords (many-to-many) - cursor.execute( - """ - CREATE TABLE IF NOT EXISTS file_keywords ( - file_id INTEGER NOT NULL, - keyword_id INTEGER NOT NULL, - PRIMARY KEY (file_id, keyword_id), - FOREIGN KEY (file_id) REFERENCES files (id) ON DELETE CASCADE, - FOREIGN KEY (keyword_id) REFERENCES keywords (id) ON DELETE CASCADE - ) - """ - ) - - log.info("Creating indexes for new keyword tables...") - cursor.execute("CREATE INDEX IF NOT EXISTS idx_keywords_keyword ON keywords (keyword)") - cursor.execute("CREATE INDEX IF NOT EXISTS idx_file_keywords_file_id ON file_keywords (file_id)") - cursor.execute("CREATE INDEX IF NOT EXISTS idx_file_keywords_keyword_id ON file_keywords (keyword_id)") - - log.info("Migrating existing keywords from 'semantic_metadata' table...") - - # Check if semantic_metadata table exists before querying - cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_metadata'") - if not cursor.fetchone(): - log.info("No 'semantic_metadata' table found, skipping data migration.") - return - - # Check if 'keywords' column exists in semantic_metadata table - # (current schema may already use normalized tables without this column) - cursor.execute("PRAGMA table_info(semantic_metadata)") - columns = {row[1] for row in cursor.fetchall()} - if "keywords" not in columns: - log.info("No 'keywords' column in semantic_metadata table, skipping data migration.") - return - - cursor.execute("SELECT file_id, keywords FROM semantic_metadata WHERE keywords IS NOT NULL AND keywords != ''") - - files_to_migrate = cursor.fetchall() - if not files_to_migrate: - log.info("No existing files with semantic metadata to migrate.") - 
return - - log.info(f"Found {len(files_to_migrate)} files with semantic metadata to migrate.") - - for file_id, keywords_json in files_to_migrate: - if not keywords_json: - continue - try: - keywords = json.loads(keywords_json) - - if not isinstance(keywords, list): - log.warning(f"Keywords for file_id {file_id} is not a list, skipping.") - continue - - for keyword in keywords: - if not isinstance(keyword, str): - log.warning(f"Non-string keyword '{keyword}' found for file_id {file_id}, skipping.") - continue - - keyword = keyword.strip() - if not keyword: - continue - - # Get or create keyword_id - cursor.execute("INSERT OR IGNORE INTO keywords (keyword) VALUES (?)", (keyword,)) - cursor.execute("SELECT id FROM keywords WHERE keyword = ?", (keyword,)) - keyword_id_result = cursor.fetchone() - - if keyword_id_result: - keyword_id = keyword_id_result[0] - # Link file to keyword - cursor.execute( - "INSERT OR IGNORE INTO file_keywords (file_id, keyword_id) VALUES (?, ?)", - (file_id, keyword_id), - ) - else: - log.error(f"Failed to retrieve or create keyword_id for keyword: {keyword}") - - except json.JSONDecodeError as e: - log.warning(f"Could not parse keywords for file_id {file_id}: {e}") - except Exception as e: - log.error(f"An unexpected error occurred during migration for file_id {file_id}: {e}", exc_info=True) - - log.info("Finished migrating keywords.") diff --git a/codex-lens/src/codexlens/storage/migrations/migration_002_add_token_metadata.py b/codex-lens/src/codexlens/storage/migrations/migration_002_add_token_metadata.py deleted file mode 100644 index daa3085e..00000000 --- a/codex-lens/src/codexlens/storage/migrations/migration_002_add_token_metadata.py +++ /dev/null @@ -1,48 +0,0 @@ -""" -Migration 002: Add token_count and symbol_type to symbols table. - -This migration adds token counting metadata to symbols for accurate chunk -splitting and performance optimization. It also adds symbol_type for better -filtering in searches. 
-""" - -import logging -from sqlite3 import Connection - -log = logging.getLogger(__name__) - - -def upgrade(db_conn: Connection): - """ - Applies the migration to add token metadata to symbols. - - - Adds token_count column to symbols table - - Adds symbol_type column to symbols table (for future use) - - Creates index on symbol_type for efficient filtering - - Backfills existing symbols with NULL token_count (to be calculated lazily) - - Args: - db_conn: The SQLite database connection. - """ - cursor = db_conn.cursor() - - log.info("Adding token_count column to symbols table...") - try: - cursor.execute("ALTER TABLE symbols ADD COLUMN token_count INTEGER") - log.info("Successfully added token_count column.") - except Exception as e: - # Column might already exist - log.warning(f"Could not add token_count column (might already exist): {e}") - - log.info("Adding symbol_type column to symbols table...") - try: - cursor.execute("ALTER TABLE symbols ADD COLUMN symbol_type TEXT") - log.info("Successfully added symbol_type column.") - except Exception as e: - # Column might already exist - log.warning(f"Could not add symbol_type column (might already exist): {e}") - - log.info("Creating index on symbol_type for efficient filtering...") - cursor.execute("CREATE INDEX IF NOT EXISTS idx_symbols_type ON symbols(symbol_type)") - - log.info("Migration 002 completed successfully.") diff --git a/codex-lens/src/codexlens/storage/migrations/migration_004_dual_fts.py b/codex-lens/src/codexlens/storage/migrations/migration_004_dual_fts.py deleted file mode 100644 index 502e067d..00000000 --- a/codex-lens/src/codexlens/storage/migrations/migration_004_dual_fts.py +++ /dev/null @@ -1,232 +0,0 @@ -""" -Migration 004: Add dual FTS tables for exact and fuzzy matching. 
- -This migration introduces two FTS5 tables: -- files_fts_exact: Uses unicode61 tokenizer for exact token matching -- files_fts_fuzzy: Uses trigram tokenizer (or extended unicode61) for substring/fuzzy matching - -Both tables are synchronized with the files table via triggers for automatic updates. -""" - -import logging -from sqlite3 import Connection - -from codexlens.storage.sqlite_utils import check_trigram_support, get_sqlite_version - -log = logging.getLogger(__name__) - - -def upgrade(db_conn: Connection): - """ - Applies the migration to add dual FTS tables. - - - Drops old files_fts table and triggers - - Creates files_fts_exact with unicode61 tokenizer - - Creates files_fts_fuzzy with trigram or extended unicode61 tokenizer - - Creates synchronized triggers for both tables - - Rebuilds FTS indexes from files table - - Args: - db_conn: The SQLite database connection. - """ - cursor = db_conn.cursor() - - try: - # Check trigram support - has_trigram = check_trigram_support(db_conn) - version = get_sqlite_version(db_conn) - log.info(f"SQLite version: {'.'.join(map(str, version))}") - - if has_trigram: - log.info("Trigram tokenizer available, using for fuzzy FTS table") - fuzzy_tokenizer = "trigram" - else: - log.warning( - f"Trigram tokenizer not available (requires SQLite >= 3.34), " - f"using extended unicode61 tokenizer for fuzzy matching" - ) - fuzzy_tokenizer = "unicode61 tokenchars '_-.'" - - # Start transaction - cursor.execute("BEGIN TRANSACTION") - - # Check if files table has 'name' column (v2 schema doesn't have it) - cursor.execute("PRAGMA table_info(files)") - columns = {row[1] for row in cursor.fetchall()} - - if 'name' not in columns: - log.info("Adding 'name' column to files table (v2 schema upgrade)...") - # Add name column - cursor.execute("ALTER TABLE files ADD COLUMN name TEXT") - # Populate name from path (extract filename from last '/') - # Use Python to do the extraction since SQLite doesn't have reverse() - cursor.execute("SELECT 
rowid, path FROM files") - rows = cursor.fetchall() - for rowid, path in rows: - # Extract filename from path - name = path.split('/')[-1] if '/' in path else path - cursor.execute("UPDATE files SET name = ? WHERE rowid = ?", (name, rowid)) - - # Rename 'path' column to 'full_path' if needed - if 'path' in columns and 'full_path' not in columns: - log.info("Renaming 'path' to 'full_path' (v2 schema upgrade)...") - # Check if indexed_at column exists in v2 schema - has_indexed_at = 'indexed_at' in columns - has_mtime = 'mtime' in columns - - # SQLite doesn't support RENAME COLUMN before 3.25, so use table recreation - cursor.execute(""" - CREATE TABLE files_new ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - name TEXT NOT NULL, - full_path TEXT NOT NULL UNIQUE, - content TEXT, - language TEXT, - mtime REAL, - indexed_at TEXT - ) - """) - - # Build INSERT statement based on available columns - # Note: v2 schema has no rowid (path is PRIMARY KEY), so use NULL for AUTOINCREMENT - if has_indexed_at and has_mtime: - cursor.execute(""" - INSERT INTO files_new (name, full_path, content, language, mtime, indexed_at) - SELECT name, path, content, language, mtime, indexed_at FROM files - """) - elif has_indexed_at: - cursor.execute(""" - INSERT INTO files_new (name, full_path, content, language, indexed_at) - SELECT name, path, content, language, indexed_at FROM files - """) - elif has_mtime: - cursor.execute(""" - INSERT INTO files_new (name, full_path, content, language, mtime) - SELECT name, path, content, language, mtime FROM files - """) - else: - cursor.execute(""" - INSERT INTO files_new (name, full_path, content, language) - SELECT name, path, content, language FROM files - """) - - cursor.execute("DROP TABLE files") - cursor.execute("ALTER TABLE files_new RENAME TO files") - - log.info("Dropping old FTS triggers and table...") - # Drop old triggers - cursor.execute("DROP TRIGGER IF EXISTS files_ai") - cursor.execute("DROP TRIGGER IF EXISTS files_ad") - 
cursor.execute("DROP TRIGGER IF EXISTS files_au") - - # Drop old FTS table - cursor.execute("DROP TABLE IF EXISTS files_fts") - - # Create exact FTS table (unicode61 with underscores/hyphens/dots as token chars) - # Note: tokenchars includes '.' to properly tokenize qualified names like PortRole.FLOW - log.info("Creating files_fts_exact table with unicode61 tokenizer...") - cursor.execute( - """ - CREATE VIRTUAL TABLE files_fts_exact USING fts5( - name, full_path UNINDEXED, content, - content='files', - content_rowid='id', - tokenize="unicode61 tokenchars '_-.'" - ) - """ - ) - - # Create fuzzy FTS table (trigram or extended unicode61) - log.info(f"Creating files_fts_fuzzy table with {fuzzy_tokenizer} tokenizer...") - cursor.execute( - f""" - CREATE VIRTUAL TABLE files_fts_fuzzy USING fts5( - name, full_path UNINDEXED, content, - content='files', - content_rowid='id', - tokenize="{fuzzy_tokenizer}" - ) - """ - ) - - # Create synchronized triggers for files_fts_exact - log.info("Creating triggers for files_fts_exact...") - cursor.execute( - """ - CREATE TRIGGER files_exact_ai AFTER INSERT ON files BEGIN - INSERT INTO files_fts_exact(rowid, name, full_path, content) - VALUES(new.id, new.name, new.full_path, new.content); - END - """ - ) - cursor.execute( - """ - CREATE TRIGGER files_exact_ad AFTER DELETE ON files BEGIN - INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content) - VALUES('delete', old.id, old.name, old.full_path, old.content); - END - """ - ) - cursor.execute( - """ - CREATE TRIGGER files_exact_au AFTER UPDATE ON files BEGIN - INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content) - VALUES('delete', old.id, old.name, old.full_path, old.content); - INSERT INTO files_fts_exact(rowid, name, full_path, content) - VALUES(new.id, new.name, new.full_path, new.content); - END - """ - ) - - # Create synchronized triggers for files_fts_fuzzy - log.info("Creating triggers for files_fts_fuzzy...") - cursor.execute( - """ 
- CREATE TRIGGER files_fuzzy_ai AFTER INSERT ON files BEGIN - INSERT INTO files_fts_fuzzy(rowid, name, full_path, content) - VALUES(new.id, new.name, new.full_path, new.content); - END - """ - ) - cursor.execute( - """ - CREATE TRIGGER files_fuzzy_ad AFTER DELETE ON files BEGIN - INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content) - VALUES('delete', old.id, old.name, old.full_path, old.content); - END - """ - ) - cursor.execute( - """ - CREATE TRIGGER files_fuzzy_au AFTER UPDATE ON files BEGIN - INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content) - VALUES('delete', old.id, old.name, old.full_path, old.content); - INSERT INTO files_fts_fuzzy(rowid, name, full_path, content) - VALUES(new.id, new.name, new.full_path, new.content); - END - """ - ) - - # Rebuild FTS indexes from files table - log.info("Rebuilding FTS indexes from files table...") - cursor.execute("INSERT INTO files_fts_exact(files_fts_exact) VALUES('rebuild')") - cursor.execute("INSERT INTO files_fts_fuzzy(files_fts_fuzzy) VALUES('rebuild')") - - # Commit transaction - cursor.execute("COMMIT") - log.info("Migration 004 completed successfully") - - # Vacuum to reclaim space (outside transaction) - try: - log.info("Running VACUUM to reclaim space...") - cursor.execute("VACUUM") - except Exception as e: - log.warning(f"VACUUM failed (non-critical): {e}") - - except Exception as e: - log.error(f"Migration 004 failed: {e}") - try: - cursor.execute("ROLLBACK") - except Exception: - pass - raise diff --git a/codex-lens/src/codexlens/storage/migrations/migration_005_cleanup_unused_fields.py b/codex-lens/src/codexlens/storage/migrations/migration_005_cleanup_unused_fields.py deleted file mode 100644 index 918bf17a..00000000 --- a/codex-lens/src/codexlens/storage/migrations/migration_005_cleanup_unused_fields.py +++ /dev/null @@ -1,196 +0,0 @@ -""" -Migration 005: Remove unused and redundant database fields. 
- -This migration removes four problematic fields identified by Gemini analysis: - -1. **semantic_metadata.keywords** (deprecated - replaced by file_keywords table) - - Data: Migrated to normalized file_keywords table in migration 001 - - Impact: Column now redundant, remove to prevent sync issues - -2. **symbols.token_count** (unused - always NULL) - - Data: Never populated, always NULL - - Impact: No data loss, just removes unused column - -3. **symbols.symbol_type** (redundant - duplicates kind) - - Data: Redundant with symbols.kind field - - Impact: No data loss, kind field contains same information - -4. **subdirs.direct_files** (unused - never displayed) - - Data: Never used in queries or display logic - - Impact: No data loss, just removes unused column - -Schema changes use table recreation pattern (SQLite best practice): -- Create new table without deprecated columns -- Copy data from old table -- Drop old table -- Rename new table -- Recreate indexes -""" - -import logging -from sqlite3 import Connection - -log = logging.getLogger(__name__) - - -def upgrade(db_conn: Connection): - """Remove unused and redundant fields from schema. - - Note: Transaction management is handled by MigrationManager. - This migration should NOT start its own transaction. - - Args: - db_conn: The SQLite database connection. 
- """ - cursor = db_conn.cursor() - - # Step 1: Remove semantic_metadata.keywords (if column exists) - log.info("Checking semantic_metadata.keywords column...") - - cursor.execute( - "SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_metadata'" - ) - if cursor.fetchone(): - # Check if keywords column exists - cursor.execute("PRAGMA table_info(semantic_metadata)") - columns = {row[1] for row in cursor.fetchall()} - - if "keywords" in columns: - log.info("Removing semantic_metadata.keywords column...") - cursor.execute(""" - CREATE TABLE semantic_metadata_new ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - file_id INTEGER NOT NULL UNIQUE, - summary TEXT, - purpose TEXT, - llm_tool TEXT, - generated_at REAL, - FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE - ) - """) - - cursor.execute(""" - INSERT INTO semantic_metadata_new (id, file_id, summary, purpose, llm_tool, generated_at) - SELECT id, file_id, summary, purpose, llm_tool, generated_at - FROM semantic_metadata - """) - - cursor.execute("DROP TABLE semantic_metadata") - cursor.execute("ALTER TABLE semantic_metadata_new RENAME TO semantic_metadata") - - # Recreate index - cursor.execute( - "CREATE INDEX IF NOT EXISTS idx_semantic_file ON semantic_metadata(file_id)" - ) - log.info("Removed semantic_metadata.keywords column") - else: - log.info("semantic_metadata.keywords column does not exist, skipping") - else: - log.info("semantic_metadata table does not exist, skipping") - - # Step 2: Remove symbols.token_count and symbols.symbol_type (if columns exist) - log.info("Checking symbols.token_count and symbols.symbol_type columns...") - - cursor.execute( - "SELECT name FROM sqlite_master WHERE type='table' AND name='symbols'" - ) - if cursor.fetchone(): - # Check if token_count or symbol_type columns exist - cursor.execute("PRAGMA table_info(symbols)") - columns = {row[1] for row in cursor.fetchall()} - - if "token_count" in columns or "symbol_type" in columns: - log.info("Removing 
symbols.token_count and symbols.symbol_type columns...") - cursor.execute(""" - CREATE TABLE symbols_new ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - file_id INTEGER NOT NULL, - name TEXT NOT NULL, - kind TEXT, - start_line INTEGER, - end_line INTEGER, - FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE - ) - """) - - cursor.execute(""" - INSERT INTO symbols_new (id, file_id, name, kind, start_line, end_line) - SELECT id, file_id, name, kind, start_line, end_line - FROM symbols - """) - - cursor.execute("DROP TABLE symbols") - cursor.execute("ALTER TABLE symbols_new RENAME TO symbols") - - # Recreate indexes - cursor.execute("CREATE INDEX IF NOT EXISTS idx_symbols_file ON symbols(file_id)") - cursor.execute("CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name)") - log.info("Removed symbols.token_count and symbols.symbol_type columns") - else: - log.info("symbols.token_count/symbol_type columns do not exist, skipping") - else: - log.info("symbols table does not exist, skipping") - - # Step 3: Remove subdirs.direct_files (if column exists) - log.info("Checking subdirs.direct_files column...") - - cursor.execute( - "SELECT name FROM sqlite_master WHERE type='table' AND name='subdirs'" - ) - if cursor.fetchone(): - # Check if direct_files column exists - cursor.execute("PRAGMA table_info(subdirs)") - columns = {row[1] for row in cursor.fetchall()} - - if "direct_files" in columns: - log.info("Removing subdirs.direct_files column...") - cursor.execute(""" - CREATE TABLE subdirs_new ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - name TEXT NOT NULL UNIQUE, - index_path TEXT NOT NULL, - files_count INTEGER DEFAULT 0, - last_updated REAL - ) - """) - - cursor.execute(""" - INSERT INTO subdirs_new (id, name, index_path, files_count, last_updated) - SELECT id, name, index_path, files_count, last_updated - FROM subdirs - """) - - cursor.execute("DROP TABLE subdirs") - cursor.execute("ALTER TABLE subdirs_new RENAME TO subdirs") - - # Recreate index - 
cursor.execute("CREATE INDEX IF NOT EXISTS idx_subdirs_name ON subdirs(name)") - log.info("Removed subdirs.direct_files column") - else: - log.info("subdirs.direct_files column does not exist, skipping") - else: - log.info("subdirs table does not exist, skipping") - - log.info("Migration 005 completed successfully") - - # Vacuum to reclaim space (outside transaction, optional) - # Note: VACUUM cannot run inside a transaction, so we skip it here - # The caller can run VACUUM separately if desired - - -def downgrade(db_conn: Connection): - """Restore removed fields (data will be lost for keywords, token_count, symbol_type, direct_files). - - This is a placeholder - true downgrade is not feasible as data is lost. - The migration is designed to be one-way since removed fields are unused/redundant. - - Args: - db_conn: The SQLite database connection. - """ - log.warning( - "Migration 005 downgrade not supported - removed fields are unused/redundant. " - "Data cannot be restored." - ) - raise NotImplementedError( - "Migration 005 downgrade not supported - this is a one-way migration" - ) diff --git a/codex-lens/src/codexlens/storage/migrations/migration_006_enhance_relationships.py b/codex-lens/src/codexlens/storage/migrations/migration_006_enhance_relationships.py deleted file mode 100644 index 2c7c6cd8..00000000 --- a/codex-lens/src/codexlens/storage/migrations/migration_006_enhance_relationships.py +++ /dev/null @@ -1,37 +0,0 @@ -""" -Migration 006: Ensure relationship tables and indexes exist. - -This migration is intentionally idempotent. It creates the `code_relationships` -table (used for graph visualization) and its indexes if missing. 
-""" - -from __future__ import annotations - -import logging -from sqlite3 import Connection - -log = logging.getLogger(__name__) - - -def upgrade(db_conn: Connection) -> None: - cursor = db_conn.cursor() - - log.info("Ensuring code_relationships table exists...") - cursor.execute( - """ - CREATE TABLE IF NOT EXISTS code_relationships ( - id INTEGER PRIMARY KEY, - source_symbol_id INTEGER NOT NULL REFERENCES symbols (id) ON DELETE CASCADE, - target_qualified_name TEXT NOT NULL, - relationship_type TEXT NOT NULL, - source_line INTEGER NOT NULL, - target_file TEXT - ) - """ - ) - - log.info("Ensuring relationship indexes exist...") - cursor.execute("CREATE INDEX IF NOT EXISTS idx_rel_source ON code_relationships(source_symbol_id)") - cursor.execute("CREATE INDEX IF NOT EXISTS idx_rel_target ON code_relationships(target_qualified_name)") - cursor.execute("CREATE INDEX IF NOT EXISTS idx_rel_type ON code_relationships(relationship_type)") - diff --git a/codex-lens/src/codexlens/storage/migrations/migration_007_add_graph_neighbors.py b/codex-lens/src/codexlens/storage/migrations/migration_007_add_graph_neighbors.py deleted file mode 100644 index 83306886..00000000 --- a/codex-lens/src/codexlens/storage/migrations/migration_007_add_graph_neighbors.py +++ /dev/null @@ -1,47 +0,0 @@ -""" -Migration 007: Add precomputed graph neighbor table for search expansion. - -Adds: -- graph_neighbors: cached N-hop neighbors between symbols (keyed by symbol ids) - -This table is derived data (a cache) and is safe to rebuild at any time. -The migration is intentionally idempotent. 
-""" - -from __future__ import annotations - -import logging -from sqlite3 import Connection - -log = logging.getLogger(__name__) - - -def upgrade(db_conn: Connection) -> None: - cursor = db_conn.cursor() - - log.info("Creating graph_neighbors table...") - cursor.execute( - """ - CREATE TABLE IF NOT EXISTS graph_neighbors ( - source_symbol_id INTEGER NOT NULL REFERENCES symbols(id) ON DELETE CASCADE, - neighbor_symbol_id INTEGER NOT NULL REFERENCES symbols(id) ON DELETE CASCADE, - relationship_depth INTEGER NOT NULL, - PRIMARY KEY (source_symbol_id, neighbor_symbol_id) - ) - """ - ) - - log.info("Creating indexes for graph_neighbors...") - cursor.execute( - """ - CREATE INDEX IF NOT EXISTS idx_graph_neighbors_source_depth - ON graph_neighbors(source_symbol_id, relationship_depth) - """ - ) - cursor.execute( - """ - CREATE INDEX IF NOT EXISTS idx_graph_neighbors_neighbor - ON graph_neighbors(neighbor_symbol_id) - """ - ) - diff --git a/codex-lens/src/codexlens/storage/migrations/migration_008_add_merkle_hashes.py b/codex-lens/src/codexlens/storage/migrations/migration_008_add_merkle_hashes.py deleted file mode 100644 index 092fc20a..00000000 --- a/codex-lens/src/codexlens/storage/migrations/migration_008_add_merkle_hashes.py +++ /dev/null @@ -1,81 +0,0 @@ -""" -Migration 008: Add Merkle hash tables for content-based incremental indexing. - -Adds: -- merkle_hashes: per-file SHA-256 hashes (keyed by file_id) -- merkle_state: directory-level root hash (single row, id=1) - -Backfills merkle_hashes using the existing `files.content` column when available. 
-""" - -from __future__ import annotations - -import hashlib -import logging -import time -from sqlite3 import Connection - -log = logging.getLogger(__name__) - - -def upgrade(db_conn: Connection) -> None: - cursor = db_conn.cursor() - - log.info("Creating merkle_hashes table...") - cursor.execute( - """ - CREATE TABLE IF NOT EXISTS merkle_hashes ( - file_id INTEGER PRIMARY KEY REFERENCES files(id) ON DELETE CASCADE, - sha256 TEXT NOT NULL, - updated_at REAL - ) - """ - ) - - log.info("Creating merkle_state table...") - cursor.execute( - """ - CREATE TABLE IF NOT EXISTS merkle_state ( - id INTEGER PRIMARY KEY CHECK (id = 1), - root_hash TEXT, - updated_at REAL - ) - """ - ) - - # Backfill file hashes from stored content (best-effort). - try: - rows = cursor.execute("SELECT id, content FROM files").fetchall() - except Exception as exc: - log.warning("Unable to backfill merkle hashes (files table missing?): %s", exc) - return - - now = time.time() - inserts: list[tuple[int, str, float]] = [] - - for row in rows: - file_id = int(row[0]) - content = row[1] - if content is None: - continue - try: - digest = hashlib.sha256(str(content).encode("utf-8", errors="ignore")).hexdigest() - inserts.append((file_id, digest, now)) - except Exception: - continue - - if not inserts: - return - - log.info("Backfilling %d file hashes...", len(inserts)) - cursor.executemany( - """ - INSERT INTO merkle_hashes(file_id, sha256, updated_at) - VALUES(?, ?, ?) 
- ON CONFLICT(file_id) DO UPDATE SET - sha256=excluded.sha256, - updated_at=excluded.updated_at - """, - inserts, - ) - diff --git a/codex-lens/src/codexlens/storage/migrations/migration_010_add_multi_vector_chunks.py b/codex-lens/src/codexlens/storage/migrations/migration_010_add_multi_vector_chunks.py deleted file mode 100644 index 9a937200..00000000 --- a/codex-lens/src/codexlens/storage/migrations/migration_010_add_multi_vector_chunks.py +++ /dev/null @@ -1,162 +0,0 @@ -""" -Migration 010: Add multi-vector storage support for cascade retrieval. - -This migration introduces the chunks table with multi-vector support: -- chunks: Stores code chunks with multiple embedding types - - embedding: Original embedding for backward compatibility - - embedding_binary: 256-dim binary vector for coarse ranking (fast) - - embedding_dense: 2048-dim dense vector for fine ranking (precise) - -The multi-vector architecture enables cascade retrieval: -1. First stage: Fast binary vector search for candidate retrieval -2. Second stage: Dense vector reranking for precision -""" - -import logging -from sqlite3 import Connection - -log = logging.getLogger(__name__) - - -def upgrade(db_conn: Connection) -> None: - """ - Adds chunks table with multi-vector embedding columns. - - Creates: - - chunks: Table for storing code chunks with multiple embedding types - - idx_chunks_file_path: Index for efficient file-based lookups - - Also migrates existing chunks tables by adding new columns if needed. - - Args: - db_conn: The SQLite database connection. 
- """ - cursor = db_conn.cursor() - - # Check if chunks table already exists - table_exists = cursor.execute( - "SELECT name FROM sqlite_master WHERE type='table' AND name='chunks'" - ).fetchone() - - if table_exists: - # Migrate existing table - add new columns if missing - log.info("chunks table exists, checking for missing columns...") - - col_info = cursor.execute("PRAGMA table_info(chunks)").fetchall() - existing_columns = {row[1] for row in col_info} - - if "embedding_binary" not in existing_columns: - log.info("Adding embedding_binary column to chunks table...") - cursor.execute( - "ALTER TABLE chunks ADD COLUMN embedding_binary BLOB" - ) - - if "embedding_dense" not in existing_columns: - log.info("Adding embedding_dense column to chunks table...") - cursor.execute( - "ALTER TABLE chunks ADD COLUMN embedding_dense BLOB" - ) - else: - # Create new table with all columns - log.info("Creating chunks table with multi-vector support...") - cursor.execute( - """ - CREATE TABLE chunks ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - file_path TEXT NOT NULL, - content TEXT NOT NULL, - embedding BLOB, - embedding_binary BLOB, - embedding_dense BLOB, - metadata TEXT, - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP - ) - """ - ) - - # Create index for file-based lookups - log.info("Creating index for chunks table...") - cursor.execute( - """ - CREATE INDEX IF NOT EXISTS idx_chunks_file_path - ON chunks(file_path) - """ - ) - - log.info("Migration 010 completed successfully") - - -def downgrade(db_conn: Connection) -> None: - """ - Removes multi-vector columns from chunks table. - - Note: This does not drop the chunks table entirely to preserve data. - Only the new columns added by this migration are removed. - - Args: - db_conn: The SQLite database connection. 
- """ - cursor = db_conn.cursor() - - log.info("Removing multi-vector columns from chunks table...") - - # SQLite doesn't support DROP COLUMN directly in older versions - # We need to recreate the table without the columns - - # Check if chunks table exists - table_exists = cursor.execute( - "SELECT name FROM sqlite_master WHERE type='table' AND name='chunks'" - ).fetchone() - - if not table_exists: - log.info("chunks table does not exist, nothing to downgrade") - return - - # Check if the columns exist before trying to remove them - col_info = cursor.execute("PRAGMA table_info(chunks)").fetchall() - existing_columns = {row[1] for row in col_info} - - needs_migration = ( - "embedding_binary" in existing_columns or - "embedding_dense" in existing_columns - ) - - if not needs_migration: - log.info("Multi-vector columns not present, nothing to remove") - return - - # Recreate table without the new columns - log.info("Recreating chunks table without multi-vector columns...") - - cursor.execute( - """ - CREATE TABLE chunks_backup ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - file_path TEXT NOT NULL, - content TEXT NOT NULL, - embedding BLOB, - metadata TEXT, - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP - ) - """ - ) - - cursor.execute( - """ - INSERT INTO chunks_backup (id, file_path, content, embedding, metadata, created_at) - SELECT id, file_path, content, embedding, metadata, created_at FROM chunks - """ - ) - - cursor.execute("DROP TABLE chunks") - cursor.execute("ALTER TABLE chunks_backup RENAME TO chunks") - - # Recreate index - cursor.execute( - """ - CREATE INDEX IF NOT EXISTS idx_chunks_file_path - ON chunks(file_path) - """ - ) - - log.info("Migration 010 downgrade completed successfully") diff --git a/codex-lens/src/codexlens/storage/path_mapper.py b/codex-lens/src/codexlens/storage/path_mapper.py deleted file mode 100644 index 2c238355..00000000 --- a/codex-lens/src/codexlens/storage/path_mapper.py +++ /dev/null @@ -1,300 +0,0 @@ -"""Path mapping utilities 
for source paths and index paths. - -This module provides bidirectional mapping between source code directories -and their corresponding index storage locations. - -Storage Structure: - ~/.codexlens/ - ├── registry.db # Global mapping table - └── indexes/ - └── D/ - └── Claude_dms3/ - ├── _index.db # Root directory index - └── src/ - └── _index.db # src/ directory index -""" - -import json -import os -import platform -from pathlib import Path -from typing import Optional - - -def _get_configured_index_root() -> Path: - """Get the index root from environment or config file. - - Priority order: - 1. CODEXLENS_INDEX_DIR environment variable - 2. index_dir from ~/.codexlens/config.json - 3. Default: ~/.codexlens/indexes - """ - env_override = os.getenv("CODEXLENS_INDEX_DIR") - if env_override: - return Path(env_override).expanduser().resolve() - - config_file = Path.home() / ".codexlens" / "config.json" - if config_file.exists(): - try: - cfg = json.loads(config_file.read_text(encoding="utf-8")) - if "index_dir" in cfg: - return Path(cfg["index_dir"]).expanduser().resolve() - except (json.JSONDecodeError, OSError): - pass - - return Path.home() / ".codexlens" / "indexes" - - -class PathMapper: - """Bidirectional mapping tool for source paths ↔ index paths. - - Handles cross-platform path normalization and conversion between - source code directories and their index storage locations. - - Attributes: - DEFAULT_INDEX_ROOT: Default root directory for all indexes - INDEX_DB_NAME: Standard name for index database files - index_root: Configured index root directory - """ - - DEFAULT_INDEX_ROOT = _get_configured_index_root() - INDEX_DB_NAME = "_index.db" - - def __init__(self, index_root: Optional[Path] = None): - """Initialize PathMapper with optional custom index root. - - Args: - index_root: Custom index root directory. If None, uses DEFAULT_INDEX_ROOT. 
- """ - self.index_root = (index_root or self.DEFAULT_INDEX_ROOT).resolve() - - def source_to_index_dir(self, source_path: Path) -> Path: - """Convert source directory to its index directory path. - - Maps a source code directory to where its index data should be stored. - The mapping preserves the directory structure but normalizes paths - for cross-platform compatibility. - - Args: - source_path: Source directory path to map - - Returns: - Index directory path under index_root - - Examples: - >>> mapper = PathMapper() - >>> mapper.source_to_index_dir(Path("D:/Claude_dms3/src")) - PosixPath('/home/user/.codexlens/indexes/D/Claude_dms3/src') - - >>> mapper.source_to_index_dir(Path("/home/user/project")) - PosixPath('/home/user/.codexlens/indexes/home/user/project') - """ - source_path = source_path.resolve() - normalized = self.normalize_path(source_path) - return self.index_root / normalized - - def source_to_index_db(self, source_path: Path) -> Path: - """Convert source directory to its index database file path. - - Maps a source directory to the full path of its index database file, - including the standard INDEX_DB_NAME. - - Args: - source_path: Source directory path to map - - Returns: - Full path to the index database file - - Examples: - >>> mapper = PathMapper() - >>> mapper.source_to_index_db(Path("D:/Claude_dms3/src")) - PosixPath('/home/user/.codexlens/indexes/D/Claude_dms3/src/_index.db') - """ - index_dir = self.source_to_index_dir(source_path) - return index_dir / self.INDEX_DB_NAME - - def index_to_source(self, index_path: Path) -> Path: - """Convert index path back to original source path. - - Performs reverse mapping from an index storage location to the - original source directory. Handles both directory paths and - database file paths. 
- - Args: - index_path: Index directory or database file path - - Returns: - Original source directory path - - Raises: - ValueError: If index_path is not under index_root - - Examples: - >>> mapper = PathMapper() - >>> mapper.index_to_source( - ... Path("~/.codexlens/indexes/D/Claude_dms3/src/_index.db") - ... ) - WindowsPath('D:/Claude_dms3/src') - - >>> mapper.index_to_source( - ... Path("~/.codexlens/indexes/D/Claude_dms3/src") - ... ) - WindowsPath('D:/Claude_dms3/src') - """ - index_path = index_path.resolve() - - # Remove _index.db if present - if index_path.name == self.INDEX_DB_NAME: - index_path = index_path.parent - - # Verify path is under index_root - try: - relative = index_path.relative_to(self.index_root) - except ValueError: - raise ValueError( - f"Index path {index_path} is not under index root {self.index_root}" - ) - - # Convert normalized path back to source path - normalized_str = str(relative).replace("\\", "/") - return self.denormalize_path(normalized_str) - - def get_project_root(self, source_path: Path) -> Path: - """Find the project root directory (topmost indexed directory). - - Walks up the directory tree to find the highest-level directory - that has an index database. - - Args: - source_path: Source directory to start from - - Returns: - Project root directory path. Returns source_path itself if - no parent index is found. 
- - Examples: - >>> mapper = PathMapper() - >>> mapper.get_project_root(Path("D:/Claude_dms3/src/codexlens")) - WindowsPath('D:/Claude_dms3') - """ - source_path = source_path.resolve() - current = source_path - project_root = source_path - - # Walk up the tree - while current.parent != current: # Stop at filesystem root - parent_index_db = self.source_to_index_db(current.parent) - if parent_index_db.exists(): - project_root = current.parent - current = current.parent - else: - break - - return project_root - - def get_relative_depth(self, source_path: Path, project_root: Path) -> int: - """Calculate directory depth relative to project root. - - Args: - source_path: Target directory path - project_root: Project root directory path - - Returns: - Number of directory levels from project_root to source_path - - Raises: - ValueError: If source_path is not under project_root - - Examples: - >>> mapper = PathMapper() - >>> mapper.get_relative_depth( - ... Path("D:/Claude_dms3/src/codexlens"), - ... Path("D:/Claude_dms3") - ... ) - 2 - """ - source_path = source_path.resolve() - project_root = project_root.resolve() - - try: - relative = source_path.relative_to(project_root) - # Count path components - return len(relative.parts) - except ValueError: - raise ValueError( - f"Source path {source_path} is not under project root {project_root}" - ) - - def normalize_path(self, path: Path) -> str: - """Normalize path to cross-platform storage format. 
- - Converts OS-specific paths to a standardized format for storage: - - Windows: Removes drive colons (D: → D) - - Unix: Removes leading slash - - Uses forward slashes throughout - - Args: - path: Path to normalize - - Returns: - Normalized path string - - Examples: - >>> mapper = PathMapper() - >>> mapper.normalize_path(Path("D:/path/to/dir")) - 'D/path/to/dir' - - >>> mapper.normalize_path(Path("/home/user/path")) - 'home/user/path' - """ - path = path.resolve() - path_str = str(path) - - # Handle Windows paths with drive letters - if platform.system() == "Windows" and len(path.parts) > 0: - # Convert D:\path\to\dir → D/path/to/dir - drive = path.parts[0].replace(":", "") # D: → D - rest = Path(*path.parts[1:]) if len(path.parts) > 1 else Path() - normalized = f"{drive}/{rest}".replace("\\", "/") - return normalized.rstrip("/") - - # Handle Unix paths - # /home/user/path → home/user/path - return path_str.lstrip("/").replace("\\", "/") - - def denormalize_path(self, normalized: str) -> Path: - """Convert normalized path back to OS-specific path. 
- - Reverses the normalization process to restore OS-native path format: - - Windows: Adds drive colons (D → D:) - - Unix: Adds leading slash - - Args: - normalized: Normalized path string - - Returns: - OS-specific Path object - - Examples: - >>> mapper = PathMapper() - >>> mapper.denormalize_path("D/path/to/dir") # On Windows - WindowsPath('D:/path/to/dir') - - >>> mapper.denormalize_path("home/user/path") # On Unix - PosixPath('/home/user/path') - """ - parts = normalized.split("/") - - # Handle Windows paths - if platform.system() == "Windows" and len(parts) > 0: - # Check if first part is a drive letter - if len(parts[0]) == 1 and parts[0].isalpha(): - # D/path/to/dir → D:/path/to/dir - drive = f"{parts[0]}:/" - if len(parts) > 1: - return Path(drive) / Path(*parts[1:]) - return Path(drive) - - # Handle Unix paths or relative paths - # home/user/path → /home/user/path - return Path("/") / Path(*parts) diff --git a/codex-lens/src/codexlens/storage/registry.py b/codex-lens/src/codexlens/storage/registry.py deleted file mode 100644 index af667a90..00000000 --- a/codex-lens/src/codexlens/storage/registry.py +++ /dev/null @@ -1,733 +0,0 @@ -"""Global project registry for CodexLens - SQLite storage.""" - -from __future__ import annotations - -import platform -import sqlite3 -import threading -import time -from dataclasses import dataclass -from pathlib import Path -from typing import Any, Dict, List, Optional - -from codexlens.errors import StorageError - - -@dataclass -class ProjectInfo: - """Registered project information.""" - - id: int - source_root: Path - index_root: Path - created_at: float - last_indexed: float - total_files: int - total_dirs: int - status: str - - -@dataclass -class DirMapping: - """Directory to index path mapping.""" - - id: int - project_id: int - source_path: Path - index_path: Path - depth: int - files_count: int - last_updated: float - - -class RegistryStore: - """Global project registry - SQLite storage. 
- - Manages indexed projects and directory-to-index path mappings. - Thread-safe with connection pooling. - """ - - DEFAULT_DB_PATH = Path.home() / ".codexlens" / "registry.db" - - def __init__(self, db_path: Path | None = None) -> None: - self.db_path = (db_path or self.DEFAULT_DB_PATH).resolve() - self._lock = threading.RLock() - self._local = threading.local() - self._pool_lock = threading.Lock() - self._pool: Dict[int, sqlite3.Connection] = {} - self._pool_generation = 0 - - def _get_connection(self) -> sqlite3.Connection: - """Get or create a thread-local database connection.""" - thread_id = threading.get_ident() - if getattr(self._local, "generation", None) == self._pool_generation: - conn = getattr(self._local, "conn", None) - if conn is not None: - return conn - - with self._pool_lock: - conn = self._pool.get(thread_id) - if conn is None: - conn = sqlite3.connect(self.db_path, check_same_thread=False) - conn.row_factory = sqlite3.Row - conn.execute("PRAGMA journal_mode=WAL") - conn.execute("PRAGMA synchronous=NORMAL") - conn.execute("PRAGMA foreign_keys=ON") - self._pool[thread_id] = conn - - self._local.conn = conn - self._local.generation = self._pool_generation - return conn - - def close(self) -> None: - """Close all pooled connections.""" - with self._lock: - with self._pool_lock: - for conn in self._pool.values(): - conn.close() - self._pool.clear() - self._pool_generation += 1 - - if hasattr(self._local, "conn"): - self._local.conn = None - if hasattr(self._local, "generation"): - self._local.generation = self._pool_generation - - def __enter__(self) -> RegistryStore: - self.initialize() - return self - - def __exit__(self, exc_type: object, exc: object, tb: object) -> None: - self.close() - - def initialize(self) -> None: - """Create database and schema.""" - with self._lock: - self.db_path.parent.mkdir(parents=True, exist_ok=True) - conn = self._get_connection() - self._create_schema(conn) - - def _create_schema(self, conn: sqlite3.Connection) -> 
None: - """Create database schema.""" - try: - conn.execute( - """ - CREATE TABLE IF NOT EXISTS projects ( - id INTEGER PRIMARY KEY, - source_root TEXT UNIQUE NOT NULL, - index_root TEXT NOT NULL, - created_at REAL, - last_indexed REAL, - total_files INTEGER DEFAULT 0, - total_dirs INTEGER DEFAULT 0, - status TEXT DEFAULT 'active' - ) - """ - ) - - conn.execute( - """ - CREATE TABLE IF NOT EXISTS dir_mapping ( - id INTEGER PRIMARY KEY, - project_id INTEGER REFERENCES projects(id) ON DELETE CASCADE, - source_path TEXT NOT NULL, - index_path TEXT NOT NULL, - depth INTEGER, - files_count INTEGER DEFAULT 0, - last_updated REAL, - UNIQUE(source_path) - ) - """ - ) - - conn.execute( - "CREATE INDEX IF NOT EXISTS idx_dir_source ON dir_mapping(source_path)" - ) - conn.execute( - "CREATE INDEX IF NOT EXISTS idx_dir_project ON dir_mapping(project_id)" - ) - conn.execute( - "CREATE INDEX IF NOT EXISTS idx_project_source ON projects(source_root)" - ) - - conn.commit() - except sqlite3.DatabaseError as exc: - raise StorageError(f"Failed to initialize registry schema: {exc}") from exc - - def _normalize_path_for_comparison(self, path: Path) -> str: - """Normalize paths for comparisons and storage. - - Windows paths are treated as case-insensitive, so normalize to lowercase. - Unix platforms preserve case sensitivity. - """ - path_str = str(path) - if platform.system() == "Windows": - return path_str.lower() - return path_str - - # === Project Operations === - - def register_project(self, source_root: Path, index_root: Path) -> ProjectInfo: - """Register a new project or update existing one. 
- - Args: - source_root: Source code root directory - index_root: Index storage root directory - - Returns: - ProjectInfo for the registered project - """ - with self._lock: - conn = self._get_connection() - source_root_str = self._normalize_path_for_comparison(source_root.resolve()) - index_root_str = str(index_root.resolve()) - now = time.time() - - conn.execute( - """ - INSERT INTO projects(source_root, index_root, created_at, last_indexed) - VALUES(?, ?, ?, ?) - ON CONFLICT(source_root) DO UPDATE SET - index_root=excluded.index_root, - last_indexed=excluded.last_indexed, - status='active' - """, - (source_root_str, index_root_str, now, now), - ) - - row = conn.execute( - "SELECT * FROM projects WHERE source_root=?", (source_root_str,) - ).fetchone() - - conn.commit() - - if not row: - raise StorageError(f"Failed to register project: {source_root}") - - return self._row_to_project_info(row) - - def unregister_project(self, source_root: Path) -> bool: - """Remove a project registration (cascades to directory mappings). - - Args: - source_root: Source code root directory - - Returns: - True if project was removed, False if not found - """ - with self._lock: - conn = self._get_connection() - source_root_str = self._normalize_path_for_comparison(source_root.resolve()) - - row = conn.execute( - "SELECT id FROM projects WHERE source_root=?", (source_root_str,) - ).fetchone() - - if not row: - return False - - conn.execute("DELETE FROM projects WHERE source_root=?", (source_root_str,)) - conn.commit() - return True - - def get_project(self, source_root: Path) -> Optional[ProjectInfo]: - """Get project information by source root. 
- - Args: - source_root: Source code root directory - - Returns: - ProjectInfo if found, None otherwise - """ - with self._lock: - conn = self._get_connection() - source_root_str = self._normalize_path_for_comparison(source_root.resolve()) - - row = conn.execute( - "SELECT * FROM projects WHERE source_root=?", (source_root_str,) - ).fetchone() - - return self._row_to_project_info(row) if row else None - - def get_project_by_id(self, project_id: int) -> Optional[ProjectInfo]: - """Get project information by ID. - - Args: - project_id: Project database ID - - Returns: - ProjectInfo if found, None otherwise - """ - with self._lock: - conn = self._get_connection() - - row = conn.execute( - "SELECT * FROM projects WHERE id=?", (project_id,) - ).fetchone() - - return self._row_to_project_info(row) if row else None - - def list_projects(self, status: Optional[str] = None) -> List[ProjectInfo]: - """List all registered projects. - - Args: - status: Optional status filter ('active', 'stale', 'removed') - - Returns: - List of ProjectInfo objects - """ - with self._lock: - conn = self._get_connection() - - if status: - rows = conn.execute( - "SELECT * FROM projects WHERE status=? ORDER BY created_at DESC", - (status,), - ).fetchall() - else: - rows = conn.execute( - "SELECT * FROM projects ORDER BY created_at DESC" - ).fetchall() - - return [self._row_to_project_info(row) for row in rows] - - def update_project_stats( - self, source_root: Path, total_files: int, total_dirs: int - ) -> None: - """Update project statistics. - - Args: - source_root: Source code root directory - total_files: Total number of indexed files - total_dirs: Total number of indexed directories - """ - with self._lock: - conn = self._get_connection() - source_root_str = self._normalize_path_for_comparison(source_root.resolve()) - - conn.execute( - """ - UPDATE projects - SET total_files=?, total_dirs=?, last_indexed=? - WHERE source_root=? 
- """, - (total_files, total_dirs, time.time(), source_root_str), - ) - conn.commit() - - def set_project_status(self, source_root: Path, status: str) -> None: - """Set project status. - - Args: - source_root: Source code root directory - status: Status string ('active', 'stale', 'removed') - """ - with self._lock: - conn = self._get_connection() - source_root_str = self._normalize_path_for_comparison(source_root.resolve()) - - conn.execute( - "UPDATE projects SET status=? WHERE source_root=?", - (status, source_root_str), - ) - conn.commit() - - # === Directory Mapping Operations === - - def register_dir( - self, - project_id: int, - source_path: Path, - index_path: Path, - depth: int, - files_count: int = 0, - ) -> DirMapping: - """Register a directory mapping. - - Args: - project_id: Project database ID - source_path: Source directory path - index_path: Index database path - depth: Directory depth relative to project root - files_count: Number of files in directory - - Returns: - DirMapping for the registered directory - """ - with self._lock: - conn = self._get_connection() - source_path_str = self._normalize_path_for_comparison(source_path.resolve()) - index_path_str = str(index_path.resolve()) - now = time.time() - - conn.execute( - """ - INSERT INTO dir_mapping( - project_id, source_path, index_path, depth, files_count, last_updated - ) - VALUES(?, ?, ?, ?, ?, ?) - ON CONFLICT(source_path) DO UPDATE SET - index_path=excluded.index_path, - depth=excluded.depth, - files_count=excluded.files_count, - last_updated=excluded.last_updated - """, - (project_id, source_path_str, index_path_str, depth, files_count, now), - ) - - row = conn.execute( - "SELECT * FROM dir_mapping WHERE source_path=?", (source_path_str,) - ).fetchone() - - conn.commit() - - if not row: - raise StorageError(f"Failed to register directory: {source_path}") - - return self._row_to_dir_mapping(row) - - def unregister_dir(self, source_path: Path) -> bool: - """Remove a directory mapping. 
- - Args: - source_path: Source directory path - - Returns: - True if directory was removed, False if not found - """ - with self._lock: - conn = self._get_connection() - source_path_str = self._normalize_path_for_comparison(source_path.resolve()) - - row = conn.execute( - "SELECT id FROM dir_mapping WHERE source_path=?", (source_path_str,) - ).fetchone() - - if not row: - return False - - conn.execute("DELETE FROM dir_mapping WHERE source_path=?", (source_path_str,)) - conn.commit() - return True - - def find_index_path(self, source_path: Path) -> Optional[Path]: - """Find index path for a source directory (exact match). - - Args: - source_path: Source directory path - - Returns: - Index path if found, None otherwise - """ - with self._lock: - conn = self._get_connection() - source_path_str = self._normalize_path_for_comparison(source_path.resolve()) - - row = conn.execute( - "SELECT index_path FROM dir_mapping WHERE source_path=?", - (source_path_str,), - ).fetchone() - - return Path(row["index_path"]) if row else None - - def find_nearest_index(self, source_path: Path) -> Optional[DirMapping]: - """Find nearest indexed ancestor directory. - - Searches for the closest parent directory that has an index. - Useful for supporting subdirectory searches. - - Optimized to use single database query instead of iterating through - each parent directory level. 
- - Args: - source_path: Source directory or file path - - Returns: - DirMapping for nearest ancestor, None if not found - """ - with self._lock: - conn = self._get_connection() - source_path_resolved = source_path.resolve() - - # Build list of all parent paths from deepest to shallowest - paths_to_check = [] - current = source_path_resolved - while True: - paths_to_check.append(self._normalize_path_for_comparison(current)) - parent = current.parent - if parent == current: # Reached filesystem root - break - current = parent - - if not paths_to_check: - return None - - # Single query with WHERE IN, ordered by path length (longest = nearest) - placeholders = ','.join('?' * len(paths_to_check)) - query = f""" - SELECT * FROM dir_mapping - WHERE source_path IN ({placeholders}) - ORDER BY LENGTH(source_path) DESC - LIMIT 1 - """ - - row = conn.execute(query, paths_to_check).fetchone() - return self._row_to_dir_mapping(row) if row else None - - def find_by_source_path(self, source_path: str) -> Optional[Dict[str, str]]: - """Find project by source path (exact or nearest match). - - Searches for a project whose source_root matches or contains - the given source_path. 
- - Args: - source_path: Source directory path as string - - Returns: - Dict with project info including 'index_root', or None if not found - """ - with self._lock: - conn = self._get_connection() - resolved_path = Path(source_path).resolve() - source_path_resolved = self._normalize_path_for_comparison(resolved_path) - - # First try exact match on projects table - row = conn.execute( - "SELECT * FROM projects WHERE source_root=?", (source_path_resolved,) - ).fetchone() - - if row: - return { - "id": str(row["id"]), - "source_root": row["source_root"], - "index_root": row["index_root"], - "status": row["status"] or "active", - } - - # Try finding project that contains this path - # Build list of all parent paths - paths_to_check = [] - current = resolved_path - while True: - paths_to_check.append(self._normalize_path_for_comparison(current)) - parent = current.parent - if parent == current: - break - current = parent - - if paths_to_check: - placeholders = ','.join('?' * len(paths_to_check)) - query = f""" - SELECT * FROM projects - WHERE source_root IN ({placeholders}) - ORDER BY LENGTH(source_root) DESC - LIMIT 1 - """ - row = conn.execute(query, paths_to_check).fetchone() - - if row: - return { - "id": str(row["id"]), - "source_root": row["source_root"], - "index_root": row["index_root"], - "status": row["status"] or "active", - } - - return None - - def get_project_dirs(self, project_id: int) -> List[DirMapping]: - """Get all directory mappings for a project. - - Args: - project_id: Project database ID - - Returns: - List of DirMapping objects - """ - with self._lock: - conn = self._get_connection() - - rows = conn.execute( - "SELECT * FROM dir_mapping WHERE project_id=? ORDER BY depth, source_path", - (project_id,), - ).fetchall() - - return [self._row_to_dir_mapping(row) for row in rows] - - def get_subdirs(self, source_path: Path) -> List[DirMapping]: - """Get direct subdirectory mappings. 
- - Args: - source_path: Parent directory path - - Returns: - List of DirMapping objects for direct children - """ - with self._lock: - conn = self._get_connection() - source_path_str = self._normalize_path_for_comparison(source_path.resolve()) - - # First get the parent's depth - parent_row = conn.execute( - "SELECT depth, project_id FROM dir_mapping WHERE source_path=?", - (source_path_str,), - ).fetchone() - - if not parent_row: - return [] - - parent_depth = int(parent_row["depth"]) - project_id = int(parent_row["project_id"]) - - # Get all subdirs with depth = parent_depth + 1 and matching path prefix - rows = conn.execute( - """ - SELECT * FROM dir_mapping - WHERE project_id=? AND depth=? AND source_path LIKE ? - ORDER BY source_path - """, - (project_id, parent_depth + 1, f"{source_path_str}%"), - ).fetchall() - - return [self._row_to_dir_mapping(row) for row in rows] - - def find_descendant_project_roots(self, source_root: Path) -> List[DirMapping]: - """Return root directory mappings for nested projects under ``source_root``.""" - with self._lock: - conn = self._get_connection() - source_root_resolved = source_root.resolve() - source_root_str = self._normalize_path_for_comparison(source_root_resolved) - - rows = conn.execute( - """ - SELECT dm.* - FROM dir_mapping dm - INNER JOIN projects p ON p.id = dm.project_id - WHERE dm.source_path = p.source_root - AND p.source_root LIKE ? 
- ORDER BY p.source_root ASC - """, - (f"{source_root_str}%",), - ).fetchall() - - descendant_roots: List[DirMapping] = [] - normalized_root_path = Path(source_root_str) - - for row in rows: - mapping = self._row_to_dir_mapping(row) - normalized_mapping_path = Path( - self._normalize_path_for_comparison(mapping.source_path.resolve()) - ) - - if normalized_mapping_path == normalized_root_path: - continue - - try: - normalized_mapping_path.relative_to(normalized_root_path) - except ValueError: - continue - - descendant_roots.append(mapping) - - descendant_roots.sort( - key=lambda mapping: ( - len( - mapping.source_path.resolve().relative_to( - source_root_resolved - ).parts - ), - self._normalize_path_for_comparison(mapping.source_path.resolve()), - ) - ) - return descendant_roots - - def update_dir_stats(self, source_path: Path, files_count: int) -> None: - """Update directory statistics. - - Args: - source_path: Source directory path - files_count: Number of files in directory - """ - with self._lock: - conn = self._get_connection() - source_path_str = self._normalize_path_for_comparison(source_path.resolve()) - - conn.execute( - """ - UPDATE dir_mapping - SET files_count=?, last_updated=? - WHERE source_path=? - """, - (files_count, time.time(), source_path_str), - ) - conn.commit() - - def update_index_paths(self, old_root: Path, new_root: Path) -> int: - """Update all index paths after migration. - - Replaces old_root prefix with new_root in all stored index paths. - - Args: - old_root: Old index root directory - new_root: New index root directory - - Returns: - Number of paths updated - """ - with self._lock: - conn = self._get_connection() - old_root_str = str(old_root.resolve()) - new_root_str = str(new_root.resolve()) - updated = 0 - - # Update projects - conn.execute( - """ - UPDATE projects - SET index_root = REPLACE(index_root, ?, ?) - WHERE index_root LIKE ? 
- """, - (old_root_str, new_root_str, f"{old_root_str}%"), - ) - updated += conn.total_changes - - # Update dir_mapping - conn.execute( - """ - UPDATE dir_mapping - SET index_path = REPLACE(index_path, ?, ?) - WHERE index_path LIKE ? - """, - (old_root_str, new_root_str, f"{old_root_str}%"), - ) - updated += conn.total_changes - - conn.commit() - return updated - - # === Internal Methods === - - def _row_to_project_info(self, row: sqlite3.Row) -> ProjectInfo: - """Convert database row to ProjectInfo.""" - return ProjectInfo( - id=int(row["id"]), - source_root=Path(row["source_root"]), - index_root=Path(row["index_root"]), - created_at=float(row["created_at"]) if row["created_at"] else 0.0, - last_indexed=float(row["last_indexed"]) if row["last_indexed"] else 0.0, - total_files=int(row["total_files"]) if row["total_files"] else 0, - total_dirs=int(row["total_dirs"]) if row["total_dirs"] else 0, - status=str(row["status"]) if row["status"] else "active", - ) - - def _row_to_dir_mapping(self, row: sqlite3.Row) -> DirMapping: - """Convert database row to DirMapping.""" - return DirMapping( - id=int(row["id"]), - project_id=int(row["project_id"]), - source_path=Path(row["source_path"]), - index_path=Path(row["index_path"]), - depth=int(row["depth"]) if row["depth"] is not None else 0, - files_count=int(row["files_count"]) if row["files_count"] else 0, - last_updated=float(row["last_updated"]) if row["last_updated"] else 0.0, - ) diff --git a/codex-lens/src/codexlens/storage/sqlite_store.py b/codex-lens/src/codexlens/storage/sqlite_store.py deleted file mode 100644 index 6945be8a..00000000 --- a/codex-lens/src/codexlens/storage/sqlite_store.py +++ /dev/null @@ -1,976 +0,0 @@ -"""SQLite storage for CodexLens indexing and search.""" - -from __future__ import annotations - -import json -import logging -import sqlite3 -import threading -import time -from dataclasses import asdict -from pathlib import Path -from typing import Any, Dict, Iterable, List, Optional, Tuple - -from 
codexlens.entities import IndexedFile, SearchResult, Symbol -from codexlens.errors import StorageError - -logger = logging.getLogger(__name__) - - -class SQLiteStore: - """SQLiteStore providing FTS5 search and symbol lookup. - - Implements thread-local connection pooling for improved performance. - """ - - # Maximum number of connections to keep in pool to prevent memory leaks - MAX_POOL_SIZE = 32 - # Idle timeout in seconds (10 minutes) - IDLE_TIMEOUT = 600 - # Periodic cleanup interval in seconds (5 minutes) - CLEANUP_INTERVAL = 300 - - def __init__(self, db_path: str | Path) -> None: - self.db_path = Path(db_path) - self._lock = threading.RLock() - self._local = threading.local() - self._pool_lock = threading.Lock() - # Pool stores (connection, last_access_time) tuples - self._pool: Dict[int, Tuple[sqlite3.Connection, float]] = {} - self._pool_generation = 0 - self._cleanup_timer: threading.Timer | None = None - self._cleanup_stop_event = threading.Event() - self._start_cleanup_timer() - - def _get_connection(self) -> sqlite3.Connection: - """Get or create a thread-local database connection.""" - thread_id = threading.get_ident() - current_time = time.time() - - if getattr(self._local, "generation", None) == self._pool_generation: - conn = getattr(self._local, "conn", None) - if conn is not None: - with self._pool_lock: - pool_entry = self._pool.get(thread_id) - if pool_entry is not None: - pooled_conn, _ = pool_entry - self._pool[thread_id] = (pooled_conn, current_time) - self._local.conn = pooled_conn - return pooled_conn - - # Thread-local connection is stale (e.g., cleaned up by timer). 
- self._local.conn = None - - with self._pool_lock: - pool_entry = self._pool.get(thread_id) - if pool_entry is not None: - conn, _ = pool_entry - # Update last access time - self._pool[thread_id] = (conn, current_time) - else: - # Clean up stale and idle connections if pool is too large - if len(self._pool) >= self.MAX_POOL_SIZE: - self._cleanup_stale_connections() - - conn = sqlite3.connect(self.db_path, check_same_thread=False) - conn.row_factory = sqlite3.Row - conn.execute("PRAGMA journal_mode=WAL") - conn.execute("PRAGMA synchronous=NORMAL") - conn.execute("PRAGMA foreign_keys=ON") - # Memory-mapped I/O for faster reads (30GB limit) - conn.execute("PRAGMA mmap_size=30000000000") - self._pool[thread_id] = (conn, current_time) - - self._local.conn = conn - self._local.generation = self._pool_generation - return conn - - def _cleanup_stale_connections(self) -> None: - """Remove connections for threads that no longer exist or have been idle too long.""" - current_time = time.time() - # Get list of active thread IDs - active_threads = {t.ident for t in threading.enumerate() if t.ident is not None} - - # Find connections to remove: dead threads or idle timeout exceeded - stale_ids: list[tuple[int, str]] = [] - for tid, (conn, last_access) in list(self._pool.items()): - try: - is_dead_thread = tid not in active_threads - is_idle = (current_time - last_access) > self.IDLE_TIMEOUT - - is_invalid_connection = False - if not is_dead_thread and not is_idle: - try: - conn.execute("SELECT 1").fetchone() - except sqlite3.ProgrammingError: - is_invalid_connection = True - except sqlite3.Error: - is_invalid_connection = True - - if is_invalid_connection: - stale_ids.append((tid, "invalid_connection")) - elif is_dead_thread: - stale_ids.append((tid, "dead_thread")) - elif is_idle: - stale_ids.append((tid, "idle_timeout")) - except Exception: - # Never break cleanup for a single bad entry. 
- continue - - # Close and remove stale connections - for tid, reason in stale_ids: - try: - conn, _ = self._pool[tid] - conn.close() - except Exception: - pass - del self._pool[tid] - logger.debug("Cleaned SQLiteStore connection for thread_id=%s (%s)", tid, reason) - - def _start_cleanup_timer(self) -> None: - if self.CLEANUP_INTERVAL <= 0: - return - - self._cleanup_stop_event.clear() - - def tick() -> None: - if self._cleanup_stop_event.is_set(): - return - - try: - with self._pool_lock: - self._cleanup_stale_connections() - finally: - with self._pool_lock: - if self._cleanup_stop_event.is_set(): - self._cleanup_timer = None - return - - self._cleanup_timer = threading.Timer(self.CLEANUP_INTERVAL, tick) - self._cleanup_timer.daemon = True - self._cleanup_timer.start() - - self._cleanup_timer = threading.Timer(self.CLEANUP_INTERVAL, tick) - self._cleanup_timer.daemon = True - self._cleanup_timer.start() - - def _stop_cleanup_timer(self) -> None: - self._cleanup_stop_event.set() - with self._pool_lock: - if self._cleanup_timer is not None: - self._cleanup_timer.cancel() - self._cleanup_timer = None - - def close(self) -> None: - """Close all pooled connections.""" - with self._lock: - self._stop_cleanup_timer() - with self._pool_lock: - for conn, _ in self._pool.values(): - conn.close() - self._pool.clear() - self._pool_generation += 1 - - if hasattr(self._local, "conn"): - self._local.conn = None - if hasattr(self._local, "generation"): - self._local.generation = self._pool_generation - - def __enter__(self) -> SQLiteStore: - self.initialize() - return self - - def __exit__(self, exc_type: object, exc: object, tb: object) -> None: - self.close() - - def execute_query( - self, - sql: str, - params: tuple = (), - allow_writes: bool = False - ) -> List[Dict[str, Any]]: - """Execute a raw SQL query and return results as dictionaries. - - This is the public API for executing custom queries without bypassing - encapsulation via _get_connection(). 
- - By default, only SELECT queries are allowed. Use allow_writes=True - for trusted internal code that needs to execute other statements. - - Args: - sql: SQL query string with ? placeholders for parameters - params: Tuple of parameter values to bind - allow_writes: If True, allow non-SELECT statements (default False) - - Returns: - List of result rows as dictionaries - - Raises: - StorageError: If query execution fails or validation fails - """ - # Validate query type for security - sql_stripped = sql.strip().upper() - if not allow_writes: - # Only allow SELECT and WITH (for CTEs) statements - if not (sql_stripped.startswith("SELECT") or sql_stripped.startswith("WITH")): - raise StorageError( - "Only SELECT queries are allowed. " - "Use allow_writes=True for trusted internal operations.", - db_path=str(self.db_path), - operation="execute_query", - details={"query_type": sql_stripped.split()[0] if sql_stripped else "EMPTY"} - ) - - try: - conn = self._get_connection() - rows = conn.execute(sql, params).fetchall() - return [dict(row) for row in rows] - except sqlite3.Error as e: - raise StorageError( - f"Query execution failed: {e}", - db_path=str(self.db_path), - operation="execute_query", - details={"error_type": type(e).__name__} - ) from e - - def initialize(self) -> None: - with self._lock: - self.db_path.parent.mkdir(parents=True, exist_ok=True) - conn = self._get_connection() - self._create_schema(conn) - self._ensure_fts_external_content(conn) - - - def add_file(self, indexed_file: IndexedFile, content: str) -> None: - with self._lock: - conn = self._get_connection() - path = str(Path(indexed_file.path).resolve()) - language = indexed_file.language - mtime = Path(path).stat().st_mtime if Path(path).exists() else None - line_count = content.count(chr(10)) + 1 - - conn.execute( - """ - INSERT INTO files(path, language, content, mtime, line_count) - VALUES(?, ?, ?, ?, ?) 
- ON CONFLICT(path) DO UPDATE SET - language=excluded.language, - content=excluded.content, - mtime=excluded.mtime, - line_count=excluded.line_count - """, - (path, language, content, mtime, line_count), - ) - - row = conn.execute("SELECT id FROM files WHERE path=?", (path,)).fetchone() - if not row: - raise StorageError(f"Failed to read file id for {path}") - file_id = int(row["id"]) - - conn.execute("DELETE FROM symbols WHERE file_id=?", (file_id,)) - if indexed_file.symbols: - conn.executemany( - """ - INSERT INTO symbols(file_id, name, kind, start_line, end_line) - VALUES(?, ?, ?, ?, ?) - """, - [ - (file_id, s.name, s.kind, s.range[0], s.range[1]) - for s in indexed_file.symbols - ], - ) - conn.commit() - - def add_files(self, files_data: List[tuple[IndexedFile, str]]) -> None: - """Add multiple files in a single transaction for better performance. - - Args: - files_data: List of (indexed_file, content) tuples - """ - with self._lock: - conn = self._get_connection() - try: - conn.execute("BEGIN") - - for indexed_file, content in files_data: - path = str(Path(indexed_file.path).resolve()) - language = indexed_file.language - mtime = Path(path).stat().st_mtime if Path(path).exists() else None - line_count = content.count(chr(10)) + 1 - - conn.execute( - """ - INSERT INTO files(path, language, content, mtime, line_count) - VALUES(?, ?, ?, ?, ?) - ON CONFLICT(path) DO UPDATE SET - language=excluded.language, - content=excluded.content, - mtime=excluded.mtime, - line_count=excluded.line_count - """, - (path, language, content, mtime, line_count), - ) - - row = conn.execute("SELECT id FROM files WHERE path=?", (path,)).fetchone() - if not row: - raise StorageError(f"Failed to read file id for {path}") - file_id = int(row["id"]) - - conn.execute("DELETE FROM symbols WHERE file_id=?", (file_id,)) - if indexed_file.symbols: - conn.executemany( - """ - INSERT INTO symbols(file_id, name, kind, start_line, end_line) - VALUES(?, ?, ?, ?, ?) 
- """, - [ - (file_id, s.name, s.kind, s.range[0], s.range[1]) - for s in indexed_file.symbols - ], - ) - - conn.commit() - except Exception as exc: - try: - conn.rollback() - except Exception as rollback_exc: - logger.error( - "Rollback failed after add_files() error (%s): %s", exc, rollback_exc - ) - raise exc.with_traceback(exc.__traceback__) from rollback_exc - raise - - def remove_file(self, path: str | Path) -> bool: - """Remove a file from the index.""" - with self._lock: - conn = self._get_connection() - resolved_path = str(Path(path).resolve()) - - row = conn.execute( - "SELECT id FROM files WHERE path=?", (resolved_path,) - ).fetchone() - - if not row: - return False - - file_id = int(row["id"]) - conn.execute("DELETE FROM files WHERE id=?", (file_id,)) - conn.commit() - return True - - def file_exists(self, path: str | Path) -> bool: - """Check if a file exists in the index.""" - with self._lock: - conn = self._get_connection() - resolved_path = str(Path(path).resolve()) - row = conn.execute( - "SELECT 1 FROM files WHERE path=?", (resolved_path,) - ).fetchone() - return row is not None - - def get_file_mtime(self, path: str | Path) -> float | None: - """Get the stored mtime for a file.""" - with self._lock: - conn = self._get_connection() - resolved_path = str(Path(path).resolve()) - row = conn.execute( - "SELECT mtime FROM files WHERE path=?", (resolved_path,) - ).fetchone() - return float(row["mtime"]) if row and row["mtime"] else None - - - def search_fts(self, query: str, *, limit: int = 20, offset: int = 0) -> List[SearchResult]: - with self._lock: - conn = self._get_connection() - try: - rows = conn.execute( - """ - SELECT rowid, path, bm25(files_fts) AS rank, - snippet(files_fts, 2, '[bold red]', '[/bold red]', "...", 20) AS excerpt - FROM files_fts - WHERE files_fts MATCH ? - ORDER BY rank - LIMIT ? OFFSET ? 
- """, - (query, limit, offset), - ).fetchall() - except sqlite3.DatabaseError as exc: - raise StorageError(f"FTS search failed: {exc}") from exc - - results: List[SearchResult] = [] - for row in rows: - rank = float(row["rank"]) if row["rank"] is not None else 0.0 - score = abs(rank) if rank < 0 else 0.0 - results.append( - SearchResult( - path=row["path"], - score=score, - excerpt=row["excerpt"], - ) - ) - return results - - def search_files_only( - self, query: str, *, limit: int = 20, offset: int = 0 - ) -> List[str]: - """Search indexed file contents and return only file paths.""" - with self._lock: - conn = self._get_connection() - try: - rows = conn.execute( - """ - SELECT path - FROM files_fts - WHERE files_fts MATCH ? - ORDER BY bm25(files_fts) - LIMIT ? OFFSET ? - """, - (query, limit, offset), - ).fetchall() - except sqlite3.DatabaseError as exc: - raise StorageError(f"FTS search failed: {exc}") from exc - - return [row["path"] for row in rows] - - def search_symbols( - self, name: str, *, kind: Optional[str] = None, limit: int = 50 - ) -> List[Symbol]: - pattern = f"%{name}%" - with self._lock: - conn = self._get_connection() - if kind: - rows = conn.execute( - """ - SELECT name, kind, start_line, end_line - FROM symbols - WHERE name LIKE ? AND kind=? - ORDER BY name - LIMIT ? - """, - (pattern, kind, limit), - ).fetchall() - else: - rows = conn.execute( - """ - SELECT name, kind, start_line, end_line - FROM symbols - WHERE name LIKE ? - ORDER BY name - LIMIT ? 
- """, - (pattern, limit), - ).fetchall() - - return [ - Symbol(name=row["name"], kind=row["kind"], range=(row["start_line"], row["end_line"])) - for row in rows - ] - - - def stats(self) -> Dict[str, Any]: - with self._lock: - conn = self._get_connection() - file_count = conn.execute("SELECT COUNT(*) AS c FROM files").fetchone()["c"] - symbol_count = conn.execute("SELECT COUNT(*) AS c FROM symbols").fetchone()["c"] - lang_rows = conn.execute( - "SELECT language, COUNT(*) AS c FROM files GROUP BY language ORDER BY c DESC" - ).fetchall() - languages = {row["language"]: row["c"] for row in lang_rows} - # Include relationship count if table exists - relationship_count = 0 - try: - rel_row = conn.execute("SELECT COUNT(*) AS c FROM code_relationships").fetchone() - relationship_count = int(rel_row["c"]) if rel_row else 0 - except sqlite3.DatabaseError: - pass - - return { - "files": int(file_count), - "symbols": int(symbol_count), - "relationships": relationship_count, - "languages": languages, - "db_path": str(self.db_path), - } - - - def _connect(self) -> sqlite3.Connection: - """Legacy method for backward compatibility.""" - return self._get_connection() - - def _create_schema(self, conn: sqlite3.Connection) -> None: - try: - conn.execute( - """ - CREATE TABLE IF NOT EXISTS files ( - id INTEGER PRIMARY KEY, - path TEXT UNIQUE NOT NULL, - language TEXT NOT NULL, - content TEXT NOT NULL, - mtime REAL, - line_count INTEGER - ) - """ - ) - conn.execute( - """ - CREATE TABLE IF NOT EXISTS symbols ( - id INTEGER PRIMARY KEY, - file_id INTEGER NOT NULL REFERENCES files(id) ON DELETE CASCADE, - name TEXT NOT NULL, - kind TEXT NOT NULL, - start_line INTEGER NOT NULL, - end_line INTEGER NOT NULL - ) - """ - ) - conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name)") - conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_kind ON symbols(kind)") - conn.execute( - """ - CREATE TABLE IF NOT EXISTS code_relationships ( - id INTEGER PRIMARY KEY, - 
source_symbol_id INTEGER NOT NULL REFERENCES symbols(id) ON DELETE CASCADE, - target_qualified_name TEXT NOT NULL, - relationship_type TEXT NOT NULL, - source_line INTEGER NOT NULL, - target_file TEXT - ) - """ - ) - conn.execute("CREATE INDEX IF NOT EXISTS idx_rel_target ON code_relationships(target_qualified_name)") - conn.execute("CREATE INDEX IF NOT EXISTS idx_rel_source ON code_relationships(source_symbol_id)") - # Chunks table for multi-vector storage (cascade retrieval architecture) - # - embedding: Original embedding for backward compatibility - # - embedding_binary: 256-dim binary vector for coarse ranking - # - embedding_dense: 2048-dim dense vector for fine ranking - conn.execute( - """ - CREATE TABLE IF NOT EXISTS chunks ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - file_path TEXT NOT NULL, - content TEXT NOT NULL, - embedding BLOB, - embedding_binary BLOB, - embedding_dense BLOB, - metadata TEXT, - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP - ) - """ - ) - conn.execute("CREATE INDEX IF NOT EXISTS idx_chunks_file_path ON chunks(file_path)") - # Run migration for existing databases - self._migrate_chunks_table(conn) - conn.commit() - except sqlite3.DatabaseError as exc: - raise StorageError(f"Failed to initialize database schema: {exc}") from exc - - def _ensure_fts_external_content(self, conn: sqlite3.Connection) -> None: - """Ensure files_fts is an FTS5 external-content table (no content duplication).""" - try: - sql_row = conn.execute( - "SELECT sql FROM sqlite_master WHERE type='table' AND name='files_fts'" - ).fetchone() - sql = str(sql_row["sql"]) if sql_row and sql_row["sql"] else None - - if sql is None: - self._create_external_fts(conn) - conn.commit() - return - - if ( - "content='files'" in sql - or 'content="files"' in sql - or "content=files" in sql - ): - self._create_fts_triggers(conn) - conn.commit() - return - - self._migrate_fts_to_external(conn) - except sqlite3.DatabaseError as exc: - raise StorageError(f"Failed to ensure FTS 
schema: {exc}") from exc - - def _create_external_fts(self, conn: sqlite3.Connection) -> None: - conn.execute( - """ - CREATE VIRTUAL TABLE files_fts USING fts5( - path UNINDEXED, - language UNINDEXED, - content, - content='files', - content_rowid='id', - tokenize="unicode61 tokenchars '_'" - ) - """ - ) - self._create_fts_triggers(conn) - - def _create_fts_triggers(self, conn: sqlite3.Connection) -> None: - conn.execute( - """ - CREATE TRIGGER IF NOT EXISTS files_ai AFTER INSERT ON files BEGIN - INSERT INTO files_fts(rowid, path, language, content) - VALUES(new.id, new.path, new.language, new.content); - END - """ - ) - conn.execute( - """ - CREATE TRIGGER IF NOT EXISTS files_ad AFTER DELETE ON files BEGIN - INSERT INTO files_fts(files_fts, rowid, path, language, content) - VALUES('delete', old.id, old.path, old.language, old.content); - END - """ - ) - conn.execute( - """ - CREATE TRIGGER IF NOT EXISTS files_au AFTER UPDATE ON files BEGIN - INSERT INTO files_fts(files_fts, rowid, path, language, content) - VALUES('delete', old.id, old.path, old.language, old.content); - INSERT INTO files_fts(rowid, path, language, content) - VALUES(new.id, new.path, new.language, new.content); - END - """ - ) - - def _migrate_fts_to_external(self, conn: sqlite3.Connection) -> None: - """Migrate legacy files_fts (with duplicated content) to external content.""" - try: - conn.execute("BEGIN") - conn.execute("DROP TRIGGER IF EXISTS files_ai") - conn.execute("DROP TRIGGER IF EXISTS files_ad") - conn.execute("DROP TRIGGER IF EXISTS files_au") - - conn.execute("ALTER TABLE files_fts RENAME TO files_fts_legacy") - self._create_external_fts(conn) - conn.execute("INSERT INTO files_fts(files_fts) VALUES('rebuild')") - conn.execute("DROP TABLE files_fts_legacy") - conn.commit() - except sqlite3.DatabaseError as exc: - try: - conn.rollback() - except Exception as rollback_exc: - logger.error( - "Rollback failed during FTS schema migration (%s): %s", exc, rollback_exc - ) - raise 
exc.with_traceback(exc.__traceback__) from rollback_exc - - try: - conn.execute("DROP TABLE IF EXISTS files_fts") - except Exception: - pass - - try: - conn.execute("ALTER TABLE files_fts_legacy RENAME TO files_fts") - conn.commit() - except Exception: - pass - raise - - try: - conn.execute("VACUUM") - except sqlite3.DatabaseError: - pass - - def _migrate_chunks_table(self, conn: sqlite3.Connection) -> None: - """Migrate existing chunks table to add multi-vector columns if needed. - - This handles upgrading existing databases that may have the chunks table - without the embedding_binary and embedding_dense columns. - """ - # Check if chunks table exists - table_exists = conn.execute( - "SELECT name FROM sqlite_master WHERE type='table' AND name='chunks'" - ).fetchone() - - if not table_exists: - # Table doesn't exist yet, nothing to migrate - return - - # Check existing columns - cursor = conn.execute("PRAGMA table_info(chunks)") - columns = {row[1] for row in cursor.fetchall()} - - # Add embedding_binary column if missing - if "embedding_binary" not in columns: - logger.info("Migrating chunks table: adding embedding_binary column") - conn.execute( - "ALTER TABLE chunks ADD COLUMN embedding_binary BLOB" - ) - - # Add embedding_dense column if missing - if "embedding_dense" not in columns: - logger.info("Migrating chunks table: adding embedding_dense column") - conn.execute( - "ALTER TABLE chunks ADD COLUMN embedding_dense BLOB" - ) - - def add_chunks( - self, - file_path: str, - chunks_data: List[Dict[str, Any]], - *, - embedding: Optional[List[List[float]]] = None, - embedding_binary: Optional[List[bytes]] = None, - embedding_dense: Optional[List[bytes]] = None, - ) -> List[int]: - """Add multiple chunks with multi-vector embeddings support. 
- - This method supports the cascade retrieval architecture with three embedding types: - - embedding: Original dense embedding for backward compatibility - - embedding_binary: 256-dim binary vector for fast coarse ranking - - embedding_dense: 2048-dim dense vector for precise fine ranking - - Args: - file_path: Path to the source file for all chunks. - chunks_data: List of dicts with 'content' and optional 'metadata' keys. - embedding: Optional list of dense embeddings (one per chunk). - embedding_binary: Optional list of binary embeddings as bytes (one per chunk). - embedding_dense: Optional list of dense embeddings as bytes (one per chunk). - - Returns: - List of inserted chunk IDs. - - Raises: - ValueError: If embedding list lengths don't match chunks_data length. - StorageError: If database operation fails. - """ - if not chunks_data: - return [] - - n_chunks = len(chunks_data) - - # Validate embedding lengths - if embedding is not None and len(embedding) != n_chunks: - raise ValueError( - f"embedding length ({len(embedding)}) != chunks_data length ({n_chunks})" - ) - if embedding_binary is not None and len(embedding_binary) != n_chunks: - raise ValueError( - f"embedding_binary length ({len(embedding_binary)}) != chunks_data length ({n_chunks})" - ) - if embedding_dense is not None and len(embedding_dense) != n_chunks: - raise ValueError( - f"embedding_dense length ({len(embedding_dense)}) != chunks_data length ({n_chunks})" - ) - - # Prepare batch data - batch_data = [] - for i, chunk in enumerate(chunks_data): - content = chunk.get("content", "") - metadata = chunk.get("metadata") - metadata_json = json.dumps(metadata) if metadata else None - - # Convert embeddings to bytes if needed - emb_blob = None - if embedding is not None: - import struct - emb_blob = struct.pack(f"{len(embedding[i])}f", *embedding[i]) - - emb_binary_blob = embedding_binary[i] if embedding_binary is not None else None - emb_dense_blob = embedding_dense[i] if embedding_dense is not None 
else None - - batch_data.append(( - file_path, content, emb_blob, emb_binary_blob, emb_dense_blob, metadata_json - )) - - with self._lock: - conn = self._get_connection() - try: - # Get starting ID before insert - row = conn.execute("SELECT MAX(id) FROM chunks").fetchone() - start_id = (row[0] or 0) + 1 - - conn.executemany( - """ - INSERT INTO chunks ( - file_path, content, embedding, embedding_binary, - embedding_dense, metadata - ) - VALUES (?, ?, ?, ?, ?, ?) - """, - batch_data - ) - conn.commit() - - # Calculate inserted IDs - return list(range(start_id, start_id + n_chunks)) - - except sqlite3.DatabaseError as exc: - raise StorageError( - f"Failed to add chunks: {exc}", - db_path=str(self.db_path), - operation="add_chunks", - ) from exc - - def get_binary_embeddings( - self, chunk_ids: List[int] - ) -> Dict[int, Optional[bytes]]: - """Get binary embeddings for specified chunk IDs. - - Used for coarse ranking in cascade retrieval architecture. - Binary embeddings (256-dim) enable fast approximate similarity search. - - Args: - chunk_ids: List of chunk IDs to retrieve embeddings for. - - Returns: - Dictionary mapping chunk_id to embedding_binary bytes (or None if not set). - - Raises: - StorageError: If database query fails. - """ - if not chunk_ids: - return {} - - with self._lock: - conn = self._get_connection() - try: - placeholders = ",".join("?" * len(chunk_ids)) - rows = conn.execute( - f"SELECT id, embedding_binary FROM chunks WHERE id IN ({placeholders})", - chunk_ids - ).fetchall() - - return {row["id"]: row["embedding_binary"] for row in rows} - - except sqlite3.DatabaseError as exc: - raise StorageError( - f"Failed to get binary embeddings: {exc}", - db_path=str(self.db_path), - operation="get_binary_embeddings", - ) from exc - - def get_dense_embeddings( - self, chunk_ids: List[int] - ) -> Dict[int, Optional[bytes]]: - """Get dense embeddings for specified chunk IDs. - - Used for fine ranking in cascade retrieval architecture. 
- Dense embeddings (2048-dim) provide high-precision similarity scoring. - - Args: - chunk_ids: List of chunk IDs to retrieve embeddings for. - - Returns: - Dictionary mapping chunk_id to embedding_dense bytes (or None if not set). - - Raises: - StorageError: If database query fails. - """ - if not chunk_ids: - return {} - - with self._lock: - conn = self._get_connection() - try: - placeholders = ",".join("?" * len(chunk_ids)) - rows = conn.execute( - f"SELECT id, embedding_dense FROM chunks WHERE id IN ({placeholders})", - chunk_ids - ).fetchall() - - return {row["id"]: row["embedding_dense"] for row in rows} - - except sqlite3.DatabaseError as exc: - raise StorageError( - f"Failed to get dense embeddings: {exc}", - db_path=str(self.db_path), - operation="get_dense_embeddings", - ) from exc - - def get_chunks_by_ids( - self, chunk_ids: List[int] - ) -> List[Dict[str, Any]]: - """Get chunk data for specified IDs. - - Args: - chunk_ids: List of chunk IDs to retrieve. - - Returns: - List of chunk dictionaries with id, file_path, content, metadata. - - Raises: - StorageError: If database query fails. - """ - if not chunk_ids: - return [] - - with self._lock: - conn = self._get_connection() - try: - placeholders = ",".join("?" 
* len(chunk_ids)) - rows = conn.execute( - f""" - SELECT id, file_path, content, metadata, created_at - FROM chunks - WHERE id IN ({placeholders}) - """, - chunk_ids - ).fetchall() - - results = [] - for row in rows: - metadata = None - if row["metadata"]: - try: - metadata = json.loads(row["metadata"]) - except json.JSONDecodeError: - pass - - results.append({ - "id": row["id"], - "file_path": row["file_path"], - "content": row["content"], - "metadata": metadata, - "created_at": row["created_at"], - }) - - return results - - except sqlite3.DatabaseError as exc: - raise StorageError( - f"Failed to get chunks: {exc}", - db_path=str(self.db_path), - operation="get_chunks_by_ids", - ) from exc - - def delete_chunks_by_file(self, file_path: str) -> int: - """Delete all chunks for a given file path. - - Args: - file_path: Path to the source file. - - Returns: - Number of deleted chunks. - - Raises: - StorageError: If database operation fails. - """ - with self._lock: - conn = self._get_connection() - try: - cursor = conn.execute( - "DELETE FROM chunks WHERE file_path = ?", - (file_path,) - ) - conn.commit() - return cursor.rowcount - - except sqlite3.DatabaseError as exc: - raise StorageError( - f"Failed to delete chunks: {exc}", - db_path=str(self.db_path), - operation="delete_chunks_by_file", - ) from exc - - def count_chunks(self) -> int: - """Count total chunks in store. - - Returns: - Total number of chunks. 
- """ - with self._lock: - conn = self._get_connection() - row = conn.execute("SELECT COUNT(*) AS c FROM chunks").fetchone() - return int(row["c"]) if row else 0 diff --git a/codex-lens/src/codexlens/storage/sqlite_utils.py b/codex-lens/src/codexlens/storage/sqlite_utils.py deleted file mode 100644 index 2d5730f9..00000000 --- a/codex-lens/src/codexlens/storage/sqlite_utils.py +++ /dev/null @@ -1,64 +0,0 @@ -"""SQLite utility functions for CodexLens storage layer.""" - -from __future__ import annotations - -import logging -import sqlite3 - -log = logging.getLogger(__name__) - - -def check_trigram_support(conn: sqlite3.Connection) -> bool: - """Check if SQLite supports trigram tokenizer for FTS5. - - Trigram tokenizer requires SQLite >= 3.34.0. - - Args: - conn: Database connection to test - - Returns: - True if trigram tokenizer is available, False otherwise - """ - try: - # Test by creating a temporary virtual table with trigram tokenizer - conn.execute( - """ - CREATE VIRTUAL TABLE IF NOT EXISTS test_trigram_check - USING fts5(test_content, tokenize='trigram') - """ - ) - # Clean up test table - conn.execute("DROP TABLE IF EXISTS test_trigram_check") - conn.commit() - return True - except sqlite3.OperationalError as e: - # Trigram tokenizer not available - if "unrecognized tokenizer" in str(e).lower(): - log.debug("Trigram tokenizer not available in this SQLite version") - return False - # Other operational errors should be re-raised - raise - except Exception: - # Any other exception means trigram is not supported - return False - - -def get_sqlite_version(conn: sqlite3.Connection) -> tuple[int, int, int]: - """Get SQLite version as (major, minor, patch) tuple. 
- - Args: - conn: Database connection - - Returns: - Version tuple, e.g., (3, 34, 1) - """ - row = conn.execute("SELECT sqlite_version()").fetchone() - version_str = row[0] if row else "0.0.0" - parts = version_str.split('.') - try: - major = int(parts[0]) if len(parts) > 0 else 0 - minor = int(parts[1]) if len(parts) > 1 else 0 - patch = int(parts[2]) if len(parts) > 2 else 0 - return (major, minor, patch) - except (ValueError, IndexError): - return (0, 0, 0) diff --git a/codex-lens/src/codexlens/storage/vector_meta_store.py b/codex-lens/src/codexlens/storage/vector_meta_store.py deleted file mode 100644 index bd466a60..00000000 --- a/codex-lens/src/codexlens/storage/vector_meta_store.py +++ /dev/null @@ -1,415 +0,0 @@ -"""Central storage for vector metadata. - -This module provides a centralized SQLite database for storing chunk metadata -associated with centralized vector indexes. Instead of traversing all _index.db -files to fetch chunk metadata, this provides O(1) lookup by chunk ID. -""" - -from __future__ import annotations - -import json -import logging -import sqlite3 -import threading -from pathlib import Path -from typing import Any, Dict, List, Optional - -from codexlens.errors import StorageError - -logger = logging.getLogger(__name__) - - -class VectorMetadataStore: - """Store and retrieve chunk metadata for centralized vector search. - - This class provides efficient storage and retrieval of chunk metadata - for the centralized vector index architecture. All chunk metadata is - stored in a single _vectors_meta.db file at the project root, enabling - fast lookups without traversing multiple _index.db files. 
- - Schema: - chunk_metadata: - - chunk_id: INTEGER PRIMARY KEY - Global chunk ID - - file_path: TEXT NOT NULL - Path to source file - - content: TEXT - Chunk text content - - start_line: INTEGER - Start line in source file - - end_line: INTEGER - End line in source file - - category: TEXT - Content category (code/doc) - - metadata: TEXT - JSON-encoded additional metadata - - source_index_db: TEXT - Path to source _index.db file - """ - - def __init__(self, db_path: Path | str) -> None: - """Initialize VectorMetadataStore. - - Args: - db_path: Path to SQLite database file. - """ - self.db_path = Path(db_path) - self.db_path.parent.mkdir(parents=True, exist_ok=True) - - # Thread-safe connection management - self._lock = threading.RLock() - self._local = threading.local() - - def _get_connection(self) -> sqlite3.Connection: - """Get or create a thread-local database connection. - - Each thread gets its own connection to ensure thread safety. - """ - conn = getattr(self._local, "conn", None) - if conn is None: - conn = sqlite3.connect( - str(self.db_path), - timeout=30.0, - check_same_thread=True, - ) - conn.row_factory = sqlite3.Row - conn.execute("PRAGMA journal_mode=WAL") - conn.execute("PRAGMA synchronous=NORMAL") - conn.execute("PRAGMA mmap_size=1073741824") # 1GB mmap - self._local.conn = conn - return conn - - def _ensure_schema(self) -> None: - """Create tables if they don't exist.""" - with self._lock: - conn = self._get_connection() - try: - conn.execute(''' - CREATE TABLE IF NOT EXISTS chunk_metadata ( - chunk_id INTEGER PRIMARY KEY, - file_path TEXT NOT NULL, - content TEXT, - start_line INTEGER, - end_line INTEGER, - category TEXT, - metadata TEXT, - source_index_db TEXT - ) - ''') - conn.execute( - 'CREATE INDEX IF NOT EXISTS idx_chunk_file_path ' - 'ON chunk_metadata(file_path)' - ) - conn.execute( - 'CREATE INDEX IF NOT EXISTS idx_chunk_category ' - 'ON chunk_metadata(category)' - ) - # Binary vectors table for cascade search - conn.execute(''' - 
CREATE TABLE IF NOT EXISTS binary_vectors ( - chunk_id INTEGER PRIMARY KEY, - vector BLOB NOT NULL - ) - ''') - conn.commit() - logger.debug("VectorMetadataStore schema created/verified") - except sqlite3.Error as e: - raise StorageError( - f"Failed to create schema: {e}", - db_path=str(self.db_path), - operation="_ensure_schema" - ) from e - - def add_chunk( - self, - chunk_id: int, - file_path: str, - content: str, - start_line: Optional[int] = None, - end_line: Optional[int] = None, - category: Optional[str] = None, - metadata: Optional[Dict[str, Any]] = None, - source_index_db: Optional[str] = None, - ) -> None: - """Add a single chunk's metadata. - - Args: - chunk_id: Global unique chunk ID. - file_path: Path to source file. - content: Chunk text content. - start_line: Start line in source file. - end_line: End line in source file. - category: Content category (code/doc). - metadata: Additional metadata dictionary. - source_index_db: Path to source _index.db file. - """ - with self._lock: - conn = self._get_connection() - try: - metadata_json = json.dumps(metadata) if metadata else None - conn.execute( - ''' - INSERT OR REPLACE INTO chunk_metadata - (chunk_id, file_path, content, start_line, end_line, - category, metadata, source_index_db) - VALUES (?, ?, ?, ?, ?, ?, ?, ?) - ''', - (chunk_id, file_path, content, start_line, end_line, - category, metadata_json, source_index_db) - ) - conn.commit() - except sqlite3.Error as e: - raise StorageError( - f"Failed to add chunk {chunk_id}: {e}", - db_path=str(self.db_path), - operation="add_chunk" - ) from e - - def add_chunks(self, chunks: List[Dict[str, Any]]) -> None: - """Batch insert chunk metadata. 
- - Args: - chunks: List of dictionaries with keys: - - chunk_id (required): Global unique chunk ID - - file_path (required): Path to source file - - content: Chunk text content - - start_line: Start line in source file - - end_line: End line in source file - - category: Content category (code/doc) - - metadata: Additional metadata dictionary - - source_index_db: Path to source _index.db file - """ - if not chunks: - return - - with self._lock: - conn = self._get_connection() - try: - batch_data = [] - for chunk in chunks: - metadata = chunk.get("metadata") - metadata_json = json.dumps(metadata) if metadata else None - batch_data.append(( - chunk["chunk_id"], - chunk["file_path"], - chunk.get("content"), - chunk.get("start_line"), - chunk.get("end_line"), - chunk.get("category"), - metadata_json, - chunk.get("source_index_db"), - )) - - conn.executemany( - ''' - INSERT OR REPLACE INTO chunk_metadata - (chunk_id, file_path, content, start_line, end_line, - category, metadata, source_index_db) - VALUES (?, ?, ?, ?, ?, ?, ?, ?) - ''', - batch_data - ) - conn.commit() - logger.debug("Batch inserted %d chunk metadata records", len(chunks)) - except sqlite3.Error as e: - raise StorageError( - f"Failed to batch insert chunks: {e}", - db_path=str(self.db_path), - operation="add_chunks" - ) from e - - def get_chunks_by_ids( - self, - chunk_ids: List[int], - category: Optional[str] = None, - ) -> List[Dict[str, Any]]: - """Retrieve chunks by their IDs - the key optimization. - - This is the primary method that replaces traversing all _index.db files. - Provides O(1) lookup by chunk ID instead of O(n) where n is the number - of index databases. - - Args: - chunk_ids: List of chunk IDs to retrieve. - category: Optional category filter ('code' or 'doc'). 
- - Returns: - List of dictionaries with chunk metadata: - - chunk_id: Global chunk ID - - file_path: Path to source file - - content: Chunk text content - - start_line: Start line in source file - - end_line: End line in source file - - category: Content category - - metadata: Parsed metadata dictionary - - source_index_db: Source _index.db path - """ - if not chunk_ids: - return [] - - # No lock needed for reads: WAL mode + thread-local connections ensure safety - conn = self._get_connection() - try: - placeholders = ",".join("?" * len(chunk_ids)) - - if category: - query = f''' - SELECT chunk_id, file_path, content, start_line, end_line, - category, metadata, source_index_db - FROM chunk_metadata - WHERE chunk_id IN ({placeholders}) AND category = ? - ''' - params = list(chunk_ids) + [category] - else: - query = f''' - SELECT chunk_id, file_path, content, start_line, end_line, - category, metadata, source_index_db - FROM chunk_metadata - WHERE chunk_id IN ({placeholders}) - ''' - params = list(chunk_ids) - - rows = conn.execute(query, params).fetchall() - - results = [] - for row in rows: - metadata = None - if row["metadata"]: - try: - metadata = json.loads(row["metadata"]) - except json.JSONDecodeError: - metadata = {} - - results.append({ - "chunk_id": row["chunk_id"], - "file_path": row["file_path"], - "content": row["content"], - "start_line": row["start_line"], - "end_line": row["end_line"], - "category": row["category"], - "metadata": metadata or {}, - "source_index_db": row["source_index_db"], - }) - - return results - - except sqlite3.Error as e: - logger.error("Failed to get chunks by IDs: %s", e) - return [] - - def get_chunk_count(self) -> int: - """Get total number of chunks in store. - - Returns: - Total chunk count. 
- """ - # No lock needed for reads: WAL mode + thread-local connections ensure safety - conn = self._get_connection() - try: - row = conn.execute( - "SELECT COUNT(*) FROM chunk_metadata" - ).fetchone() - return row[0] if row else 0 - except sqlite3.Error: - return 0 - - def clear(self) -> None: - """Clear all metadata.""" - with self._lock: - conn = self._get_connection() - try: - conn.execute("DELETE FROM chunk_metadata") - conn.commit() - logger.info("Cleared all chunk metadata") - except sqlite3.Error as e: - raise StorageError( - f"Failed to clear metadata: {e}", - db_path=str(self.db_path), - operation="clear" - ) from e - - def close(self) -> None: - """Close database connection.""" - with self._lock: - conn = getattr(self._local, "conn", None) - if conn is not None: - conn.close() - self._local.conn = None - - def __enter__(self) -> "VectorMetadataStore": - """Context manager entry.""" - self._ensure_schema() - return self - - def __exit__(self, exc_type, exc_val, exc_tb) -> None: - """Context manager exit.""" - self.close() - - # ============= Binary Vector Methods for Cascade Search ============= - - def add_binary_vectors( - self, chunk_ids: List[int], binary_vectors: List[bytes] - ) -> None: - """Batch insert binary vectors for cascade search. - - Args: - chunk_ids: List of chunk IDs. - binary_vectors: List of packed binary vectors (as bytes). 
- """ - if not chunk_ids or len(chunk_ids) != len(binary_vectors): - return - - with self._lock: - conn = self._get_connection() - try: - data = list(zip(chunk_ids, binary_vectors)) - conn.executemany( - "INSERT OR REPLACE INTO binary_vectors (chunk_id, vector) VALUES (?, ?)", - data - ) - conn.commit() - logger.debug("Added %d binary vectors", len(chunk_ids)) - except sqlite3.Error as e: - raise StorageError( - f"Failed to add binary vectors: {e}", - db_path=str(self.db_path), - operation="add_binary_vectors" - ) from e - - def get_all_binary_vectors(self) -> List[tuple]: - """Get all binary vectors for cascade search. - - Returns: - List of (chunk_id, vector_bytes) tuples. - """ - conn = self._get_connection() - try: - rows = conn.execute( - "SELECT chunk_id, vector FROM binary_vectors" - ).fetchall() - return [(row[0], row[1]) for row in rows] - except sqlite3.Error as e: - logger.error("Failed to get binary vectors: %s", e) - return [] - - def get_binary_vector_count(self) -> int: - """Get total number of binary vectors. - - Returns: - Binary vector count. - """ - conn = self._get_connection() - try: - row = conn.execute( - "SELECT COUNT(*) FROM binary_vectors" - ).fetchone() - return row[0] if row else 0 - except sqlite3.Error: - return 0 - - def clear_binary_vectors(self) -> None: - """Clear all binary vectors.""" - with self._lock: - conn = self._get_connection() - try: - conn.execute("DELETE FROM binary_vectors") - conn.commit() - logger.info("Cleared all binary vectors") - except sqlite3.Error as e: - raise StorageError( - f"Failed to clear binary vectors: {e}", - db_path=str(self.db_path), - operation="clear_binary_vectors" - ) from e diff --git a/codex-lens/src/codexlens/tools/__init__.py b/codex-lens/src/codexlens/tools/__init__.py deleted file mode 100644 index cb60ab7e..00000000 --- a/codex-lens/src/codexlens/tools/__init__.py +++ /dev/null @@ -1,226 +0,0 @@ -"""DeepWiki document generation tools. 
- -This module provides tools for generating documentation from source code. -""" - -from __future__ import annotations - -import hashlib -import logging -import os -from pathlib import Path -from typing import Dict, List, Optional, Protocol, Any - -from codexlens.errors import StorageError -from codexlens.indexing.symbol_extractor import SymbolExtractor -from codexlens.parsers.factory import ParserFactory -from codexlens.storage.deepwiki_models import DeepWikiSymbol -from codexlens.storage.deepwiki_store import DeepWikiStore - -logger = logging.getLogger(__name__) - - -# Default timeout for AI generation (30 seconds) -AI_TIMEOUT = 30 -# HTML metadata markers for documentation -SYMBOL_START_MARKER = '' -SYMBOL_END_MARKER = "" - - -class MarkdownGenerator(Protocol): - """Protocol for generating Markdown documentation.""" - - def generate(self, symbol: DeepWikiSymbol, source_code: str) -> str: - """Generate Markdown documentation for a symbol. - - Args: - symbol: The symbol information - source_code: The source code content - - Returns: - Generated Markdown documentation - """ - pass - - -class MockMarkdownGenerator(MarkdownGenerator): - """Mock Markdown generator for testing.""" - - def generate(self, symbol: DeepWikiSymbol, source_code: str) -> str: - """Generate mock Markdown documentation.""" - return f"# {symbol.name}\\n\\n## {symbol.type}\\n\\n```\\n{source_code}\\n```" - - -class DeepWikiGenerator: - """Main generator for DeepWiki documentation. - - Scans source code, generates documentation with incremental updates - using SHA256 hashes for change detection. 
- """ - - DEFAULT_DB_PATH = DeepWikiStore.DEFAULT_DB_PATH - SUPPORTED_EXTENSIONS = [ - ".py", - ".ts", - ".tsx", - ".js", - ".jsx", - ".java", - ".go", - ".rs", - ".swift", - ] - AI_TIMEOUT: int = 30 # Timeout for AI generation - MAX_SYMBOLS_PER_FILE: int = 100 # Batch size for processing large files - - def __init__( - self, - db_path: Path | None = None, - store: DeepWikiStore | None = None, - markdown_generator: MarkdownGenerator | None = None, - max_symbols_per_file: int = 100, - ai_timeout: int = 30, - ) -> None: - """ - Initializes the DeepWikiGenerator. - """ - if store: - self.store = store - else: - self.store = DeepWikiStore(db_path or self.DEFAULT_DB_PATH) - - if markdown_generator: - self.markdown_generator = markdown_generator - else: - logger.debug("No markdown generator provided, using mock") - self.markdown_generator = MockMarkdownGenerator() - - self._extractor = SymbolExtractor() - self.max_symbols_per_file = max_symbols_per_file - self.ai_timeout = ai_timeout - self._docs_dir = Path("docs") # Default docs directory - - def _calculate_file_hash(self, file_path: Path) -> str: - """Calculate SHA256 hash of file content.""" - try: - content = file_path.read_bytes() - hash_obj = hashlib.sha256(content) - return hash_obj.hexdigest() - except IOError as e: - logger.error(f"Error reading file for hash calculation: {file_path}: {e}") - return "" - - def _get_language(self, file_path: Path) -> str | None: - """Determine language from file extension.""" - ext = file_path.suffix.lower() - if ext not in self.SUPPORTED_EXTENSIONS: - logger.debug(f"Unsupported file extension: {file_path}, skipping file") - return None - - language_map = { - ".py": "Python", - ".ts": "TypeScript", - ".tsx": "TypeScript React", - ".js": "JavaScript", - ".jsx": "JavaScript React", - ".java": "Java", - ".go": "Go", - ".rs": "Rust", - ".swift": "Swift", - } - return language_map.get(ext) - - def _should_process_file(self, file_path: Path, force: bool) -> bool: - """Check if a file 
should be processed based on hash.""" - if force: - return True - new_hash = self._calculate_file_hash(file_path) - if not new_hash: - return False - - existing_file = self.store.get_file(str(file_path)) - if existing_file and existing_file.content_hash == new_hash: - logger.debug(f"File unchanged: {file_path}. Skipping (hash match)") - return False - return True - - def _generate_markdown_for_symbol(self, symbol: DeepWikiSymbol, source_code: str) -> str: - """Generate markdown and wrap it with markers.""" - markdown_content = self.markdown_generator.generate(symbol, source_code) - return f"{SYMBOL_START_MARKER.format(symbol_name=symbol.name)}\\n{markdown_content}\\n{SYMBOL_END_MARKER}" - - def run(self, path: str, output_dir: Optional[str] = None, force: bool = False) -> Dict[str, Any]: - """ - Initialize DeepWiki store and generator, and scan the source. - """ - source_root = Path(path) - if output_dir: - self._docs_dir = Path(output_dir) - - stats = { - "total_files": 0, - "total_symbols": 0, - "total_changed_files": 0, - "total_changed_symbols": 0, - "total_docs_generated": 0, - "total_unchanged_files": 0, - } - - files_to_process = [p for p in source_root.rglob("*") if p.is_file() and p.suffix in self.SUPPORTED_EXTENSIONS] - stats["total_files"] = len(files_to_process) - - changed_files_count = 0 - unchanged_files_count = 0 - - for file_path in files_to_process: - if not self._should_process_file(file_path, force): - unchanged_files_count += 1 - continue - - changed_files_count += 1 - try: - source_code = file_path.read_text("utf-8") - symbols = self._extractor.extract_symbols(source_code, file_path.suffix, str(file_path)) - - if not symbols: - logger.debug(f"No symbols found in {file_path}") - continue - - logger.debug(f"Found {len(symbols)} symbols in {file_path}") - stats["total_symbols"] += len(symbols) - docs_generated_count = 0 - - for symbol in symbols: - # Generate documentation - doc_content = self._generate_markdown_for_symbol(symbol, source_code) - 
- # Define doc path - relative_path = file_path.relative_to(source_root) - doc_path = (self._docs_dir / relative_path).with_suffix(".md") - doc_path.parent.mkdir(parents=True, exist_ok=True) - - # Save symbol and doc - self.store.save_symbol(symbol, str(doc_path), doc_content) - docs_generated_count += 1 - - stats["total_docs_generated"] += docs_generated_count - stats["total_changed_symbols"] += len(symbols) - - # Update file stats in DB - content_hash = self._calculate_file_hash(file_path) - self.store.update_file_stats(str(file_path), len(symbols), content_hash) - logger.debug(f"Generated docs for {len(symbols)} symbols in {file_path}") - - except Exception as e: - logger.error(f"Error processing file {file_path}: {e}") - raise StorageError(f"Failed to process {file_path}") from e - - stats["total_changed_files"] = changed_files_count - stats["total_unchanged_files"] = unchanged_files_count - - logger.info(f"Generation complete. Stats: {stats}") - return stats - - def close(self): - """Close the store connection.""" - self.store.close() diff --git a/codex-lens/src/codexlens/tools/deepwiki_generator.py b/codex-lens/src/codexlens/tools/deepwiki_generator.py deleted file mode 100644 index e8a33227..00000000 --- a/codex-lens/src/codexlens/tools/deepwiki_generator.py +++ /dev/null @@ -1,1067 +0,0 @@ -"""DeepWiki document generation tools. - -This module provides tools for generating documentation from source code. 
-""" - -from __future__ import annotations - -import hashlib -import logging -import shlex -import signal -import subprocess -import sys -import threading -import time -from dataclasses import dataclass, field -from pathlib import Path -from typing import List, Dict, Optional, Protocol, Any, Tuple, Set - -from codexlens.storage.deepwiki_store import DeepWikiStore -from codexlens.storage.deepwiki_models import DeepWikiSymbol, DeepWikiFile, DeepWikiDoc - -logger = logging.getLogger(__name__) - -# HTML metadata markers for documentation -SYMBOL_START_TEMPLATE = '' -SYMBOL_END_MARKER = "" - - -class MarkdownGenerator(Protocol): - """Protocol for generating Markdown documentation.""" - - def generate(self, symbol: DeepWikiSymbol, source_code: str) -> str: - """Generate Markdown documentation for a symbol.""" - ... - - -class MockMarkdownGenerator: - """Mock Markdown generator for testing.""" - - def generate(self, symbol: DeepWikiSymbol, source_code: str) -> str: - """Generate mock Markdown documentation.""" - start_line, end_line = symbol.line_range - return f"""{SYMBOL_START_TEMPLATE.format(name=symbol.name, type=symbol.type)} - -## `{symbol.name}` - -**Type**: {symbol.type} -**Location**: `{symbol.source_file}:{start_line}-{end_line}` - -```{symbol.source_file.split('.')[-1] if '.' in symbol.source_file else 'text'} -{source_code} -``` - -{SYMBOL_END_MARKER} -""" - - -class DeepWikiGenerator: - """Main generator for DeepWiki documentation. - - Scans source code, generates documentation with incremental updates - using SHA256 hashes for change detection. - """ - - SUPPORTED_EXTENSIONS = [".py", ".ts", ".tsx", ".js", ".jsx", ".java", ".go", ".rs", ".swift"] - - def __init__( - self, - store: DeepWikiStore | None = None, - markdown_generator: MarkdownGenerator | None = None, - ) -> None: - """Initialize the generator. 
- - Args: - store: DeepWiki storage instance - markdown_generator: Markdown generator for documentation - """ - self.store = store or DeepWikiStore() - self.markdown_generator = markdown_generator or MockMarkdownGenerator() - - def calculate_file_hash(self, file_path: Path) -> str: - """Calculate SHA256 hash of a file. - - Args: - file_path: Path to the source file - - Returns: - SHA256 hash string - """ - content = file_path.read_bytes() - return hashlib.sha256(content).hexdigest() - - def _should_process_file(self, file_path: Path) -> bool: - """Check if a file should be processed based on extension.""" - return file_path.suffix.lower() in self.SUPPORTED_EXTENSIONS - - def _extract_symbols_simple(self, file_path: Path) -> List[Dict[str, Any]]: - """Extract symbols from a file using simple regex patterns. - - Args: - file_path: Path to the source file - - Returns: - List of symbol dictionaries - """ - import re - - content = file_path.read_text(encoding="utf-8", errors="ignore") - lines = content.split("\n") - symbols = [] - - # Python patterns - py_patterns = [ - (r"^(\s*)def\s+(\w+)\s*\(", "function"), - (r"^(\s*)async\s+def\s+(\w+)\s*\(", "async_function"), - (r"^(\s*)class\s+(\w+)", "class"), - ] - - # TypeScript/JavaScript patterns - ts_patterns = [ - (r"^(\s*)function\s+(\w+)\s*\(", "function"), - (r"^(\s*)const\s+(\w+)\s*=\s*(?:async\s*)?\(", "function"), - (r"^(\s*)export\s+(?:async\s+)?function\s+(\w+)", "function"), - (r"^(\s*)class\s+(\w+)", "class"), - (r"^(\s*)interface\s+(\w+)", "interface"), - ] - - all_patterns = py_patterns + ts_patterns - - for i, line in enumerate(lines, 1): - for pattern, symbol_type in all_patterns: - match = re.match(pattern, line) - if match: - name = match.group(2) - # Find end line (simple heuristic: next def/class or EOF) - end_line = i - for j in range(i, min(i + 50, len(lines) + 1)): - if j > i: - for p, _ in all_patterns: - if re.match(p, lines[j - 1]) and not lines[j - 1].startswith(match.group(1)): - end_line = j - 1 
- break - else: - continue - break - else: - end_line = min(i + 30, len(lines)) - - symbols.append({ - "name": name, - "type": symbol_type, - "line_start": i, - "line_end": end_line, - "source": "\n".join(lines[i - 1:end_line]), - }) - break - - return symbols - - def generate_for_file(self, file_path: Path) -> Dict[str, Any]: - """Generate documentation for a single file. - - Args: - file_path: Path to the source file - - Returns: - Generation result dictionary - """ - if not self._should_process_file(file_path): - return {"skipped": True, "reason": "unsupported_extension"} - - # Calculate hash and check for changes - current_hash = self.calculate_file_hash(file_path) - existing_file = self.store.get_file(str(file_path)) - - if existing_file and existing_file.content_hash == current_hash: - logger.debug(f"File unchanged: {file_path}") - return {"skipped": True, "reason": "unchanged", "hash": current_hash} - - # Extract symbols - raw_symbols = self._extract_symbols_simple(file_path) - - if not raw_symbols: - logger.debug(f"No symbols found in: {file_path}") - return {"skipped": True, "reason": "no_symbols", "hash": current_hash} - - # Generate documentation for each symbol - docs_generated = 0 - for sym in raw_symbols: - # Create symbol record - symbol = DeepWikiSymbol( - name=sym["name"], - type=sym["type"], - source_file=str(file_path), - doc_file=f".deepwiki/{file_path.stem}.md", - anchor=f"#{sym['name'].lower()}", - line_range=(sym["line_start"], sym["line_end"]), - ) - - # Generate markdown - markdown = self.markdown_generator.generate(symbol, sym["source"]) - - # Save to store - self.store.add_symbol(symbol) - docs_generated += 1 - - # Track file hash + metadata for incremental updates and staleness checks. 
- self.store.add_file( - file_path=str(file_path), - content_hash=current_hash, - symbols_count=len(raw_symbols), - docs_generated=docs_generated > 0, - ) - - logger.info(f"Generated docs for {docs_generated} symbols in {file_path}") - return { - "symbols": len(raw_symbols), - "docs_generated": docs_generated, - "hash": current_hash, - } - - def run(self, path: Path) -> Dict[str, Any]: - """Run documentation generation for a path. - - Args: - path: File or directory path to process - - Returns: - Generation summary - """ - path = Path(path) - - if path.is_file(): - files = [path] - elif path.is_dir(): - files = [] - for ext in self.SUPPORTED_EXTENSIONS: - files.extend(path.rglob(f"*{ext}")) - else: - raise ValueError(f"Path not found: {path}") - - results = { - "total_files": 0, - "processed_files": 0, - "skipped_files": 0, - "total_symbols": 0, - "docs_generated": 0, - } - - for file_path in files: - results["total_files"] += 1 - result = self.generate_for_file(file_path) - - if result.get("skipped"): - results["skipped_files"] += 1 - else: - results["processed_files"] += 1 - results["total_symbols"] += result.get("symbols", 0) - results["docs_generated"] += result.get("docs_generated", 0) - - logger.info( - f"DeepWiki generation complete: " - f"{results['processed_files']}/{results['total_files']} files, " - f"{results['docs_generated']} docs generated" - ) - - return results - - -# ============================================================================= -# TASK-002: LLMMarkdownGenerator Core Class -# ============================================================================= - -@dataclass -class GenerationResult: - """Result of a documentation generation attempt.""" - success: bool - content: Optional[str] = None - tool: Optional[str] = None - attempts: int = 0 - error: Optional[str] = None - symbol: Optional[DeepWikiSymbol] = None - - -@dataclass -class GeneratorConfig: - """Configuration for LLM generator.""" - max_concurrent: int = 4 - batch_size: int 
= 4 - graceful_shutdown: bool = True - - -# Tool fallback chains: primary -> secondary -> tertiary -TOOL_CHAIN: Dict[str, List[str]] = { - "gemini": ["gemini", "qwen", "codex"], - "qwen": ["qwen", "gemini", "codex"], - "codex": ["codex", "gemini", "qwen"], -} - -# Layer-based timeout settings (seconds) -TOOL_TIMEOUTS: Dict[str, Dict[str, int]] = { - "gemini": {"layer3": 120, "layer2": 60, "layer1": 30}, - "qwen": {"layer3": 90, "layer2": 45, "layer1": 20}, - "codex": {"layer3": 180, "layer2": 90, "layer1": 45}, -} - -# Required sections per layer for validation -REQUIRED_SECTIONS: Dict[int, List[str]] = { - 3: ["Description", "Parameters", "Returns", "Example"], - 2: ["Description", "Returns"], - 1: ["Description"], -} - - -class LLMMarkdownGenerator: - """LLM-powered Markdown generator with tool fallback and retry logic. - - Implements the MarkdownGenerator protocol with: - - Tool fallback chain (gemini -> qwen -> codex) - - Layer-based timeouts - - SHA256 incremental updates - - Structure validation - """ - - def __init__( - self, - primary_tool: str = "gemini", - db: DeepWikiStore | None = None, - force_mode: bool = False, - progress_tracker: Optional[Any] = None, - ) -> None: - """Initialize LLM generator. - - Args: - primary_tool: Primary LLM tool to use (gemini/qwen/codex). - db: DeepWikiStore instance for progress tracking. - force_mode: If True, regenerate all docs regardless of hash. - progress_tracker: Optional ProgressTracker for timeout alerts. - """ - self.primary_tool = primary_tool - self.db = db or DeepWikiStore() - self.force_mode = force_mode - self.progress_tracker = progress_tracker - self._ensure_db_initialized() - - def _ensure_db_initialized(self) -> None: - """Ensure database is initialized.""" - try: - self.db.initialize() - except Exception: - pass # Already initialized - - def _classify_layer(self, symbol: DeepWikiSymbol) -> int: - """Classify symbol into layer (1, 2, or 3). 
- - Layer 3: class, function, async_function, interface (detailed docs) - Layer 2: method, property (compact docs) - Layer 1: variable, constant (minimal docs) - """ - symbol_type = symbol.type.lower() - if symbol_type in ("class", "function", "async_function", "interface"): - return 3 - elif symbol_type in ("method", "property"): - return 2 - else: - return 1 - - def _build_prompt(self, symbol: DeepWikiSymbol, source_code: str, layer: int) -> str: - """Build LLM prompt based on symbol layer. - - Args: - symbol: Symbol to document. - source_code: Source code of the symbol. - layer: Layer (1, 2, or 3) determining prompt template. - - Returns: - Prompt string for the LLM. - """ - file_ext = Path(symbol.source_file).suffix.lstrip(".") - - if layer == 3: - # Full documentation template - return f"""Generate comprehensive Markdown documentation for this code symbol. - -## Symbol Information -- Name: {symbol.name} -- Type: {symbol.type} -- File: {symbol.source_file} -- Lines: {symbol.line_range[0]}-{symbol.line_range[1]} - -## Source Code -```{file_ext} -{source_code} -``` - -## Required Sections -Generate a Markdown document with these sections: -1. **Description** - Clear description of what this symbol does -2. **Parameters** - List all parameters with types and descriptions -3. **Returns** - What this symbol returns (if applicable) -4. **Example** - Code example showing usage - -Format the output as clean Markdown. Use code fences for code blocks.""" - - elif layer == 2: - # Compact documentation template - return f"""Generate compact Markdown documentation for this code symbol. - -## Symbol Information -- Name: {symbol.name} -- Type: {symbol.type} -- File: {symbol.source_file} - -## Source Code -```{file_ext} -{source_code} -``` - -## Required Sections -Generate a Markdown document with these sections: -1. **Description** - Brief description of this symbol's purpose -2. **Returns** - Return value description (if applicable) - -Keep it concise. 
Format as clean Markdown.""" - - else: - # Minimal documentation template (layer 1) - return f"""Generate minimal Markdown documentation for this code symbol. - -## Symbol Information -- Name: {symbol.name} -- Type: {symbol.type} - -## Source Code -```{file_ext} -{source_code} -``` - -## Required Sections -Generate a Markdown document with: -1. **Description** - One-line description of this symbol - -Keep it minimal. Format as clean Markdown.""" - - def _call_cli_with_timeout( - self, tool: str, prompt: str, timeout: int - ) -> str: - """Call LLM CLI tool with timeout. - - Args: - tool: CLI tool name (gemini/qwen/codex). - prompt: Prompt to send to the LLM. - timeout: Timeout in seconds. - - Returns: - Generated content string. - - Raises: - TimeoutError: If command times out. - RuntimeError: If command fails. - """ - # Build ccw cli command - escaped_prompt = prompt.replace('"', '\\"') - cmd = [ - "ccw", "cli", "-p", prompt, - "--tool", tool, - "--mode", "write", - ] - - try: - result = subprocess.run( - cmd, - capture_output=True, - text=True, - timeout=timeout, - cwd=str(Path.cwd()), - ) - - if result.returncode != 0: - raise RuntimeError(f"CLI failed: {result.stderr}") - - return result.stdout.strip() - - except subprocess.TimeoutExpired as exc: - raise TimeoutError( - f"Timeout after {timeout}s with {tool}" - ) from exc - - def _emit_timeout_alert( - self, symbol: DeepWikiSymbol, tool: str, timeout: int - ) -> None: - """Emit timeout alert to progress tracker and logs. - - Args: - symbol: Symbol that timed out. - tool: Tool that timed out. - timeout: Timeout duration in seconds. - """ - alert_msg = f"TIMEOUT: {symbol.name} ({symbol.source_file}) with {tool} after {timeout}s" - logger.warning(alert_msg) - - # Output to progress tracker if available - if self.progress_tracker: - self.progress_tracker.write_above(f"[WARNING] {alert_msg}") - - def validate_structure(self, content: str, layer: int) -> bool: - """Validate generated content has required structure. 
- - Args: - content: Generated markdown content. - layer: Layer (1, 2, or 3). - - Returns: - True if content passes validation, False otherwise. - """ - import re - - if not content or len(content.strip()) < 20: - return False - - required = REQUIRED_SECTIONS.get(layer, ["Description"]) - - for section in required: - # Match markdown headers (##, ###, **Bold**) or standalone section names - pattern = rf"^\s*(?:#{1,6}\s+|\*\*){re.escape(section)}" - if not re.search(pattern, content, re.IGNORECASE | re.MULTILINE): - return False - - return True - - def generate_with_retry( - self, symbol: DeepWikiSymbol, source_code: str - ) -> GenerationResult: - """Generate documentation with tool fallback chain. - - Strategy: Immediate tool fallback - - Tool A fails -> Immediately try Tool B - - All 3 tools fail -> Mark as failed - - Args: - symbol: Symbol to document. - source_code: Source code of the symbol. - - Returns: - GenerationResult with success status and content. - """ - tool_chain = TOOL_CHAIN.get(self.primary_tool, ["gemini", "qwen", "codex"]) - layer = self._classify_layer(symbol) - prompt = self._build_prompt(symbol, source_code, layer) - - symbol_key = f"{symbol.source_file}:{symbol.name}:{symbol.line_range[0]}" - last_error = None - - for attempt, tool in enumerate(tool_chain, 1): - timeout = TOOL_TIMEOUTS.get(tool, {}).get(f"layer{layer}", 60) - - try: - # Update progress - if self.db: - self.db.update_progress( - symbol_key, - { - "file_path": symbol.source_file, - "symbol_name": symbol.name, - "symbol_type": symbol.type, - "layer": layer, - "source_hash": hashlib.sha256(source_code.encode()).hexdigest(), - "status": "processing", - "attempts": attempt, - "last_tool": tool, - }, - ) - - result = self._call_cli_with_timeout(tool, prompt, timeout) - - if result and self.validate_structure(result, layer): - # Success - if self.db: - self.db.mark_completed(symbol_key, tool) - - return GenerationResult( - success=True, - content=result, - tool=tool, - 
attempts=attempt, - symbol=symbol, - ) - - # Invalid structure - last_error = f"Invalid structure from {tool}" - continue - - except TimeoutError: - self._emit_timeout_alert(symbol, tool, timeout) - last_error = f"Timeout after {timeout}s with {tool}" - continue - - except Exception as exc: - last_error = f"{type(exc).__name__}: {exc}" - continue - - # All tools failed - if self.db: - self.db.mark_failed(symbol_key, last_error or "All tools failed") - - return GenerationResult( - success=False, - content=None, - tool=None, - attempts=len(tool_chain), - error=last_error, - symbol=symbol, - ) - - def should_regenerate( - self, - symbol: DeepWikiSymbol, - source_code: str, - staleness_threshold: float = 0.7, - ) -> bool: - """Check if symbol needs regeneration. - - Conditions for regeneration: - 1. --force mode is enabled - 2. Symbol not in database (new) - 3. Source code hash changed - 4. Previous generation failed - 5. Staleness score exceeds threshold - - Args: - symbol: Symbol to check. - source_code: Source code of the symbol. - staleness_threshold: Score above which regeneration is triggered. - - Returns: - True if regeneration needed, False otherwise. 
- """ - if self.force_mode: - return True - - current_hash = hashlib.sha256(source_code.encode()).hexdigest() - symbol_key = f"{symbol.source_file}:{symbol.name}:{symbol.line_range[0]}" - - if self.db: - progress = self.db.get_progress(symbol_key) - - if not progress: - return True # New symbol - - if progress.get("source_hash") != current_hash: - return True # Code changed - - if progress.get("status") == "failed": - return True # Retry failed - - # Check staleness score from DeepWiki index - db_symbol = self.db.get_symbol(symbol.name, symbol.source_file) - if db_symbol and db_symbol.staleness_score >= staleness_threshold: - return True # Stale documentation - - return False # Skip - - def _fallback_generate( - self, symbol: DeepWikiSymbol, source_code: str - ) -> str: - """Fallback to Mock generation when all LLM tools fail. - - Args: - symbol: Symbol to document. - source_code: Source code of the symbol. - - Returns: - Mock-generated markdown content. - """ - mock = MockMarkdownGenerator() - return mock.generate(symbol, source_code) - - def generate(self, symbol: DeepWikiSymbol, source_code: str) -> str: - """Generate Markdown documentation (implements MarkdownGenerator protocol). - - Args: - symbol: Symbol to document. - source_code: Source code of the symbol. - - Returns: - Generated markdown content. - """ - result = self.generate_with_retry(symbol, source_code) - - if result.success and result.content: - return result.content - - # Fallback to mock on failure - return self._fallback_generate(symbol, source_code) - - -# ============================================================================= -# TASK-003: BatchProcessor + Graceful Interrupt -# TASK-004: ProgressTracker (rich progress bar) -# ============================================================================= - -class ProgressTracker: - """Progress tracker using rich progress bar. 
- - Shows real-time progress with: - - Progress bar: [=====> ] 120/500 (24%) eta: 5min - - Timeout alerts above progress bar - - Failure summary at completion - """ - - def __init__(self, total: int) -> None: - """Initialize progress tracker. - - Args: - total: Total number of symbols to process. - """ - self.total = total - self.completed = 0 - self.failed_symbols: List[Dict[str, Any]] = [] - self._lock = threading.Lock() - self._started = False - - # Lazy import rich to avoid dependency issues - try: - from rich.console import Console - from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TimeRemainingColumn - self._console = Console() - self._progress = Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - BarColumn(), - TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), - TextColumn("({task.completed}/{task.total})"), - TimeRemainingColumn(), - console=self._console, - ) - self._task_id = None - self._rich_available = True - except ImportError: - self._rich_available = False - self._console = None - - def start(self) -> None: - """Start the progress bar.""" - if self._rich_available and self._progress: - self._progress.start() - self._task_id = self._progress.add_task( - "Generating docs", total=self.total - ) - self._started = True - - def update(self, symbol: DeepWikiSymbol, result: GenerationResult) -> None: - """Update progress after a symbol is processed. - - Args: - symbol: Processed symbol. - result: Generation result. - """ - with self._lock: - self.completed += 1 - - if self._rich_available and self._progress and self._task_id is not None: - self._progress.advance(self._task_id) - - if not result.success: - self.failed_symbols.append({ - "symbol": symbol.name, - "file": symbol.source_file, - "error": result.error or "Unknown error", - }) - - def write_above(self, message: str) -> None: - """Write message above the progress bar. - - Args: - message: Message to display. 
- """ - if self._rich_available and self._console: - self._console.print(message) - else: - print(message) - - def print_summary(self) -> None: - """Print final summary after all processing completes.""" - self.stop() - - success = self.completed - len(self.failed_symbols) - failed = len(self.failed_symbols) - - if self._rich_available and self._console: - self._console.print( - f"\n[bold]Generation complete:[/bold] " - f"[green]{success}/{self.completed}[/green] successful" - ) - - if self.failed_symbols: - self._console.print( - f"\n[bold red]Failed symbols ({failed}):[/bold red]" - ) - for item in self.failed_symbols: - self._console.print( - f" - [yellow]{item['symbol']}[/yellow] " - f"({item['file']}): {item['error']}" - ) - else: - print(f"\nGeneration complete: {success}/{self.completed} successful") - - if self.failed_symbols: - print(f"\nFailed symbols ({failed}):") - for item in self.failed_symbols: - print(f" - {item['symbol']} ({item['file']}): {item['error']}") - - def stop(self) -> None: - """Stop the progress bar.""" - if self._rich_available and self._progress and self._started: - self._progress.stop() - self._started = False - - -class BatchProcessor: - """Batch processor with concurrent execution and graceful interrupt. - - Features: - - ThreadPoolExecutor with configurable concurrency (default: 4) - - Signal handlers for Ctrl+C graceful interrupt - - Orphaned document cleanup - - Integration with ProgressTracker - """ - - def __init__( - self, - generator: LLMMarkdownGenerator, - config: GeneratorConfig | None = None, - ) -> None: - """Initialize batch processor. - - Args: - generator: LLM generator instance. - config: Generator configuration. 
- """ - self.generator = generator - self.config = config or GeneratorConfig() - self.shutdown_event = threading.Event() - self._executor = None - self._progress: Optional[ProgressTracker] = None - - def setup_signal_handlers(self) -> None: - """Set up signal handlers for graceful Ctrl+C interrupt.""" - def handle_sigint(signum: int, frame) -> None: - if self.shutdown_event.is_set(): - # Second Ctrl+C: force exit - print("\n[WARNING] Forced exit, progress may be lost") - sys.exit(1) - - # First Ctrl+C: graceful interrupt - print("\n[INFO] Completing current batch...") - self.shutdown_event.set() - - signal.signal(signal.SIGINT, handle_sigint) - - def process_batch( - self, symbols: List[Tuple[DeepWikiSymbol, str]] - ) -> List[GenerationResult]: - """Process a batch of symbols concurrently. - - Args: - symbols: List of (symbol, source_code) tuples. - - Returns: - List of GenerationResult for each symbol. - """ - from concurrent.futures import ThreadPoolExecutor, as_completed - - results: List[GenerationResult] = [] - futures = [] - - with ThreadPoolExecutor(max_workers=self.config.max_concurrent) as executor: - self._executor = executor - - for symbol, source_code in symbols: - if self.shutdown_event.is_set(): - break - - future = executor.submit( - self.generator.generate_with_retry, - symbol, - source_code, - ) - futures.append((symbol, future)) - - # Wait for all submitted tasks - for symbol, future in futures: - try: - result = future.result(timeout=300) # 5 min total timeout - results.append(result) - - if self._progress: - self._progress.update(symbol, result) - - except Exception as exc: - error_result = GenerationResult( - success=False, - error=str(exc), - symbol=symbol, - ) - results.append(error_result) - - if self._progress: - self._progress.update(symbol, error_result) - - return results - - def cleanup_orphaned_docs( - self, current_symbols: List[DeepWikiSymbol] - ) -> int: - """Clean up documents for symbols that no longer exist in source. 
- - Args: - current_symbols: List of current symbols in source code. - - Returns: - Number of orphaned documents removed. - """ - if not self.generator.db: - return 0 - - current_keys = { - f"{s.source_file}:{s.name}:{s.line_range[0]}" - for s in current_symbols - } - - stored_keys = self.generator.db.get_completed_symbol_keys() - orphaned_keys = list(stored_keys - current_keys) - - if orphaned_keys: - deleted = self.generator.db.delete_progress(orphaned_keys) - logger.info(f"Cleaned up {deleted} orphaned documents") - return deleted - - return 0 - - def run( - self, - path: Path, - tool: str = "gemini", - force: bool = False, - resume: bool = False, - ) -> Dict[str, Any]: - """Main entry point for batch processing. - - Flow: - 1. Scan source files - 2. Extract symbols - 3. SHA256 filter - 4. Layer sort (3 -> 2 -> 1) - 5. Batch process with concurrency - - Args: - path: File or directory path to process. - tool: Primary LLM tool to use. - force: Force regenerate all docs. - resume: Resume from previous interrupted run. - - Returns: - Processing summary dictionary. 
- """ - # Update generator settings - self.generator.primary_tool = tool - self.generator.force_mode = force - - # Setup signal handlers - if self.config.graceful_shutdown: - self.setup_signal_handlers() - - # Initialize database - self.generator._ensure_db_initialized() - - # Phase 1: Scan files - path = Path(path) - if path.is_file(): - files = [path] - elif path.is_dir(): - files = [] - for ext in DeepWikiGenerator.SUPPORTED_EXTENSIONS: - files.extend(path.rglob(f"*{ext}")) - else: - raise ValueError(f"Path not found: {path}") - - # Phase 2: Extract symbols - all_symbols: List[Tuple[DeepWikiSymbol, str]] = [] - temp_gen = DeepWikiGenerator(store=self.generator.db) - - for file_path in files: - raw_symbols = temp_gen._extract_symbols_simple(file_path) - - for sym in raw_symbols: - symbol = DeepWikiSymbol( - name=sym["name"], - symbol_type=sym["type"], - source_file=str(file_path), - doc_file=f".deepwiki/{file_path.stem}.md", - anchor=f"#{sym['name'].lower()}", - line_start=sym["line_start"], - line_end=sym["line_end"], - ) - all_symbols.append((symbol, sym["source"])) - - # Phase 3: SHA256 filter - symbols_to_process = [ - (s, c) for s, c in all_symbols - if self.generator.should_regenerate(s, c) - ] - - if not symbols_to_process: - logger.info("All symbols up to date, nothing to process") - return { - "total_symbols": len(all_symbols), - "processed": 0, - "skipped": len(all_symbols), - "success": 0, - "failed": 0, - } - - # Phase 4: Cleanup orphaned docs - current_symbols = [s for s, _ in all_symbols] - orphaned = self.cleanup_orphaned_docs(current_symbols) - - # Phase 5: Sort by layer (3 -> 2 -> 1) - symbols_to_process.sort( - key=lambda x: self.generator._classify_layer(x[0]), - reverse=True - ) - - # Phase 6: Initialize progress tracker - self._progress = ProgressTracker(total=len(symbols_to_process)) - self.generator.progress_tracker = self._progress - self._progress.start() - - # Phase 7: Batch process - all_results: List[GenerationResult] = [] - batch_size 
= self.config.batch_size - - for i in range(0, len(symbols_to_process), batch_size): - if self.shutdown_event.is_set(): - break - - batch = symbols_to_process[i:i + batch_size] - results = self.process_batch(batch) - all_results.extend(results) - - # Phase 8: Print summary - if self._progress: - self._progress.print_summary() - - # Calculate statistics - success_count = sum(1 for r in all_results if r.success) - failed_count = len(all_results) - success_count - - return { - "total_symbols": len(all_symbols), - "processed": len(all_results), - "skipped": len(all_symbols) - len(symbols_to_process), - "success": success_count, - "failed": failed_count, - "orphaned_cleaned": orphaned, - } diff --git a/codex-lens/src/codexlens/watcher/__init__.py b/codex-lens/src/codexlens/watcher/__init__.py deleted file mode 100644 index 4c095ec4..00000000 --- a/codex-lens/src/codexlens/watcher/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -"""File watcher module for real-time index updates.""" - -from .events import ChangeType, FileEvent, IndexResult, WatcherConfig, WatcherStats -from .file_watcher import FileWatcher -from .incremental_indexer import IncrementalIndexer -from .manager import WatcherManager - -__all__ = [ - "ChangeType", - "FileEvent", - "IndexResult", - "WatcherConfig", - "WatcherStats", - "FileWatcher", - "IncrementalIndexer", - "WatcherManager", -] diff --git a/codex-lens/src/codexlens/watcher/events.py b/codex-lens/src/codexlens/watcher/events.py deleted file mode 100644 index edb43787..00000000 --- a/codex-lens/src/codexlens/watcher/events.py +++ /dev/null @@ -1,82 +0,0 @@ -"""Event types for file watcher.""" - -from __future__ import annotations - -import time -from dataclasses import dataclass, field -from enum import Enum -from pathlib import Path -from typing import List, Optional, Set - - -class ChangeType(Enum): - """Type of file system change.""" - CREATED = "created" - MODIFIED = "modified" - DELETED = "deleted" - MOVED = "moved" - - -@dataclass -class 
FileEvent: - """A file system change event.""" - path: Path - change_type: ChangeType - timestamp: float - old_path: Optional[Path] = None # For MOVED events - - -@dataclass -class WatcherConfig: - """Configuration for file watcher.""" - debounce_ms: int = 60000 # Default 60 seconds for debounce - ignored_patterns: Set[str] = field(default_factory=lambda: { - # Version control - ".git", ".svn", ".hg", - # Python environments & cache - ".venv", "venv", "env", "__pycache__", ".pytest_cache", ".mypy_cache", ".ruff_cache", - # Node.js - "node_modules", "bower_components", ".npm", ".yarn", - # Build artifacts - "dist", "build", "out", "target", "bin", "obj", "_build", "coverage", "htmlcov", - # IDE & Editor - ".idea", ".vscode", ".vs", ".eclipse", - # CodexLens internal - ".codexlens", - # Package manager caches - ".cache", ".parcel-cache", ".turbo", ".next", ".nuxt", - # Logs & temp - "logs", "tmp", "temp", - }) - languages: Optional[List[str]] = None # None = all supported - - -@dataclass -class PendingQueueStatus: - """Status of pending file changes queue.""" - file_count: int = 0 - files: List[str] = field(default_factory=list) # Limited to 20 files - countdown_seconds: int = 0 - last_event_time: Optional[float] = None - - -@dataclass -class IndexResult: - """Result of processing file changes.""" - files_indexed: int = 0 - files_removed: int = 0 - symbols_added: int = 0 - symbols_removed: int = 0 - files_success: List[str] = field(default_factory=list) - files_failed: List[str] = field(default_factory=list) - errors: List[str] = field(default_factory=list) - timestamp: float = field(default_factory=time.time) - - -@dataclass -class WatcherStats: - """Runtime statistics for watcher.""" - files_watched: int = 0 - events_processed: int = 0 - last_event_time: Optional[float] = None - is_running: bool = False diff --git a/codex-lens/src/codexlens/watcher/file_watcher.py b/codex-lens/src/codexlens/watcher/file_watcher.py deleted file mode 100644 index 4fc50691..00000000 
--- a/codex-lens/src/codexlens/watcher/file_watcher.py +++ /dev/null @@ -1,347 +0,0 @@ -"""File system watcher using watchdog library.""" - -from __future__ import annotations - -import logging -import threading -import time -from pathlib import Path -from typing import Callable, Dict, List, Optional - -from watchdog.observers import Observer -from watchdog.events import FileSystemEventHandler - -from .events import ChangeType, FileEvent, WatcherConfig, PendingQueueStatus -from ..config import Config - -logger = logging.getLogger(__name__) - -# Maximum queue size to prevent unbounded memory growth -# When exceeded, forces immediate flush to avoid memory exhaustion -MAX_QUEUE_SIZE = 50000 - - -class _CodexLensHandler(FileSystemEventHandler): - """Internal handler for watchdog events.""" - - def __init__( - self, - watcher: "FileWatcher", - on_event: Callable[[FileEvent], None], - ) -> None: - super().__init__() - self._watcher = watcher - self._on_event = on_event - - def on_created(self, event) -> None: - if event.is_directory: - return - self._emit(event.src_path, ChangeType.CREATED) - - def on_modified(self, event) -> None: - if event.is_directory: - return - self._emit(event.src_path, ChangeType.MODIFIED) - - def on_deleted(self, event) -> None: - if event.is_directory: - return - self._emit(event.src_path, ChangeType.DELETED) - - def on_moved(self, event) -> None: - if event.is_directory: - return - self._emit(event.dest_path, ChangeType.MOVED, old_path=event.src_path) - - def _emit( - self, - path: str, - change_type: ChangeType, - old_path: Optional[str] = None, - ) -> None: - path_obj = Path(path) - - # Filter out files that should not be indexed - if not self._watcher._should_index_file(path_obj): - return - - event = FileEvent( - path=path_obj, - change_type=change_type, - timestamp=time.time(), - old_path=Path(old_path) if old_path else None, - ) - self._on_event(event) - - -class FileWatcher: - """File system watcher for monitoring directory changes. 
- - Uses watchdog library for cross-platform file system monitoring. - Events are forwarded to the on_changes callback. - - Example: - def handle_changes(events: List[FileEvent]) -> None: - for event in events: - print(f"{event.change_type}: {event.path}") - - watcher = FileWatcher(Path("."), WatcherConfig(), handle_changes) - watcher.start() - watcher.wait() # Block until stopped - """ - - def __init__( - self, - root_path: Path, - config: WatcherConfig, - on_changes: Callable[[List[FileEvent]], None], - ) -> None: - """Initialize file watcher. - - Args: - root_path: Directory to watch recursively - config: Watcher configuration - on_changes: Callback invoked with batched events - """ - self.root_path = Path(root_path).resolve() - self.config = config - self.on_changes = on_changes - - self._observer: Optional[Observer] = None - self._running = False - self._stop_event = threading.Event() - self._lock = threading.RLock() - - # Event queue for batching - self._event_queue: List[FileEvent] = [] - self._queue_lock = threading.Lock() - - # Debounce timer (true debounce - waits after last event) - self._flush_timer: Optional[threading.Timer] = None - self._last_event_time: float = 0 - - # Queue change callbacks for real-time UI updates - self._queue_change_callbacks: List[Callable[[PendingQueueStatus], None]] = [] - - # Config instance for language checking - self._codexlens_config = Config() - - def _should_index_file(self, path: Path) -> bool: - """Check if file should be indexed based on extension and ignore patterns. 
- - Args: - path: File path to check - - Returns: - True if file should be indexed, False otherwise - """ - # Check against ignore patterns - parts = path.parts - for pattern in self.config.ignored_patterns: - if pattern in parts: - return False - - # Check extension against supported languages - language = self._codexlens_config.language_for_path(path) - return language is not None - - def _on_raw_event(self, event: FileEvent) -> None: - """Handle raw event from watchdog handler with true debounce.""" - force_flush = False - - with self._queue_lock: - # Check queue size limit to prevent memory exhaustion - if len(self._event_queue) >= MAX_QUEUE_SIZE: - logger.warning( - "Event queue limit (%d) reached, forcing immediate flush", - MAX_QUEUE_SIZE - ) - if self._flush_timer: - self._flush_timer.cancel() - self._flush_timer = None - force_flush = True - - self._event_queue.append(event) - self._last_event_time = time.time() - - # Cancel previous timer and schedule new one (true debounce) - # Skip if we're about to force flush - if not force_flush: - if self._flush_timer: - self._flush_timer.cancel() - - self._flush_timer = threading.Timer( - self.config.debounce_ms / 1000.0, - self._flush_events - ) - self._flush_timer.daemon = True - self._flush_timer.start() - - # Force flush outside lock to avoid deadlock - if force_flush: - self._flush_events() - - # Notify queue change (outside lock to avoid deadlock) - self._notify_queue_change() - - def _debounce_loop(self) -> None: - """Background thread for checking flush signal file.""" - signal_file = self.root_path / '.codexlens' / 'flush.signal' - while self._running: - time.sleep(1.0) # Check every second - # Check for flush signal file - if signal_file.exists(): - try: - signal_file.unlink() - logger.info("Flush signal detected, triggering immediate index") - self.flush_now() - except Exception as e: - logger.warning("Failed to handle flush signal: %s", e) - - def _flush_events(self) -> None: - """Flush queued events 
with deduplication.""" - with self._queue_lock: - if not self._event_queue: - return - - # Deduplicate: keep latest event per path - deduped: Dict[Path, FileEvent] = {} - for event in self._event_queue: - deduped[event.path] = event - - events = list(deduped.values()) - self._event_queue.clear() - self._last_event_time = 0 # Reset after flush - - # Notify queue cleared - self._notify_queue_change() - - if events: - try: - self.on_changes(events) - except Exception as exc: - logger.error("Error in on_changes callback: %s", exc) - - def flush_now(self) -> None: - """Immediately flush pending queue (manual trigger).""" - with self._queue_lock: - if self._flush_timer: - self._flush_timer.cancel() - self._flush_timer = None - self._flush_events() - - def get_pending_queue_status(self) -> PendingQueueStatus: - """Get current pending queue status for UI display.""" - with self._queue_lock: - file_count = len(self._event_queue) - files = [str(e.path.name) for e in self._event_queue[:20]] - - # Calculate countdown - if self._last_event_time > 0 and file_count > 0: - elapsed = time.time() - self._last_event_time - remaining = max(0, self.config.debounce_ms / 1000.0 - elapsed) - countdown = int(remaining) - else: - countdown = 0 - - return PendingQueueStatus( - file_count=file_count, - files=files, - countdown_seconds=countdown, - last_event_time=self._last_event_time if file_count > 0 else None - ) - - def register_queue_change_callback( - self, callback: Callable[[PendingQueueStatus], None] - ) -> None: - """Register callback for queue change notifications.""" - self._queue_change_callbacks.append(callback) - - def _notify_queue_change(self) -> None: - """Notify all registered callbacks of queue change.""" - status = self.get_pending_queue_status() - for callback in self._queue_change_callbacks: - try: - callback(status) - except Exception as e: - logger.error("Queue change callback error: %s", e) - - def start(self) -> None: - """Start watching the directory. 
- - Non-blocking. Use wait() to block until stopped. - """ - with self._lock: - if self._running: - logger.warning("Watcher already running") - return - - if not self.root_path.exists(): - raise ValueError(f"Root path does not exist: {self.root_path}") - - self._observer = Observer() - handler = _CodexLensHandler(self, self._on_raw_event) - self._observer.schedule(handler, str(self.root_path), recursive=True) - - self._running = True - self._stop_event.clear() - self._observer.start() - - # Start signal check thread (for flush.signal file) - self._signal_check_thread = threading.Thread( - target=self._debounce_loop, - daemon=True, - name="FileWatcher-SignalCheck", - ) - self._signal_check_thread.start() - - logger.info("Started watching: %s", self.root_path) - - def stop(self) -> None: - """Stop watching the directory. - - Gracefully stops the observer and flushes remaining events. - """ - with self._lock: - if not self._running: - return - - self._running = False - self._stop_event.set() - - # Cancel pending flush timer - if self._flush_timer: - self._flush_timer.cancel() - self._flush_timer = None - - if self._observer: - self._observer.stop() - self._observer.join(timeout=5.0) - self._observer = None - - # Wait for signal check thread to finish - if hasattr(self, '_signal_check_thread') and self._signal_check_thread and self._signal_check_thread.is_alive(): - self._signal_check_thread.join(timeout=2.0) - self._signal_check_thread = None - - # Flush any remaining events - self._flush_events() - - logger.info("Stopped watching: %s", self.root_path) - - def wait(self) -> None: - """Block until watcher is stopped. - - Use Ctrl+C or call stop() from another thread to unblock. 
- """ - try: - while self._running: - self._stop_event.wait(timeout=1.0) - except KeyboardInterrupt: - logger.info("Received interrupt, stopping watcher...") - self.stop() - - @property - def is_running(self) -> bool: - """Check if watcher is currently running.""" - return self._running diff --git a/codex-lens/src/codexlens/watcher/incremental_indexer.py b/codex-lens/src/codexlens/watcher/incremental_indexer.py deleted file mode 100644 index 39888115..00000000 --- a/codex-lens/src/codexlens/watcher/incremental_indexer.py +++ /dev/null @@ -1,423 +0,0 @@ -"""Incremental indexer for processing file changes.""" - -from __future__ import annotations - -import logging -from dataclasses import dataclass -from pathlib import Path -from typing import List, Optional - -from codexlens.config import Config -from codexlens.parsers.factory import ParserFactory -from codexlens.storage.dir_index import DirIndexStore -from codexlens.storage.global_index import GlobalSymbolIndex -from codexlens.storage.path_mapper import PathMapper -from codexlens.storage.registry import RegistryStore - -from .events import ChangeType, FileEvent, IndexResult - -logger = logging.getLogger(__name__) - - -@dataclass -class FileIndexResult: - """Result of indexing a single file.""" - path: Path - symbols_count: int - success: bool - error: Optional[str] = None - - -class IncrementalIndexer: - """Incremental indexer for processing file change events. - - Processes file events (create, modify, delete, move) and updates - the corresponding index databases incrementally. 
- - Reuses existing infrastructure: - - ParserFactory for symbol extraction - - DirIndexStore for per-directory storage - - GlobalSymbolIndex for cross-file symbols - - PathMapper for source-to-index path conversion - - Example: - indexer = IncrementalIndexer(registry, mapper, config) - result = indexer.process_changes([ - FileEvent(Path("foo.py"), ChangeType.MODIFIED, time.time()), - ]) - print(f"Indexed {result.files_indexed} files") - """ - - def __init__( - self, - registry: RegistryStore, - mapper: PathMapper, - config: Optional[Config] = None, - ) -> None: - """Initialize incremental indexer. - - Args: - registry: Global project registry - mapper: Path mapper for source-to-index conversion - config: CodexLens configuration (uses defaults if None) - """ - self.registry = registry - self.mapper = mapper - self.config = config or Config() - self.parser_factory = ParserFactory(self.config) - - self._global_index: Optional[GlobalSymbolIndex] = None - self._dir_stores: dict[Path, DirIndexStore] = {} - self._lock = __import__("threading").RLock() - - def _get_global_index(self, index_root: Path, source_root: Optional[Path] = None) -> Optional[GlobalSymbolIndex]: - """Get or create global symbol index. - - Args: - index_root: Root directory containing the global symbol index DB - source_root: Source directory root for looking up project_id from registry - """ - if not self.config.global_symbol_index_enabled: - return None - - if self._global_index is None: - global_db_path = index_root / GlobalSymbolIndex.DEFAULT_DB_NAME - if global_db_path.exists(): - # Get project_id from registry using source_root - project_id = 0 # Default fallback - if source_root: - project_info = self.registry.get_project(source_root) - if project_info: - project_id = project_info.id - try: - self._global_index = GlobalSymbolIndex(global_db_path, project_id=project_id) - # Ensure schema exists (best-effort). 
The DB should already be initialized - # by `codexlens index init`, but watcher/index-update should be robust. - self._global_index.initialize() - except Exception as exc: - logger.debug( - "Failed to initialize global symbol index at %s: %s", - global_db_path, - exc, - ) - self._global_index = None - - return self._global_index - - def _get_dir_store(self, dir_path: Path) -> Optional[DirIndexStore]: - """Get DirIndexStore for a directory, if indexed.""" - with self._lock: - if dir_path in self._dir_stores: - return self._dir_stores[dir_path] - - index_db = self.mapper.source_to_index_db(dir_path) - if not index_db.exists(): - logger.debug("No index found for directory: %s", dir_path) - return None - - # Get index root for global index - source_root = self.mapper.get_project_root(dir_path) or dir_path - index_root = self.mapper.source_to_index_dir(source_root) - global_index = self._get_global_index(index_root, source_root=source_root) - - store = DirIndexStore( - index_db, - config=self.config, - global_index=global_index, - ) - self._dir_stores[dir_path] = store - return store - - def process_changes(self, events: List[FileEvent]) -> IndexResult: - """Process a batch of file change events. 
- - Args: - events: List of file events to process - - Returns: - IndexResult with statistics - """ - result = IndexResult() - - for event in events: - try: - if event.change_type == ChangeType.CREATED: - file_result = self._index_file(event.path) - if file_result.success: - result.files_indexed += 1 - result.symbols_added += file_result.symbols_count - else: - result.errors.append(file_result.error or f"Failed to index: {event.path}") - - elif event.change_type == ChangeType.MODIFIED: - file_result = self._index_file(event.path) - if file_result.success: - result.files_indexed += 1 - result.symbols_added += file_result.symbols_count - else: - result.errors.append(file_result.error or f"Failed to index: {event.path}") - - elif event.change_type == ChangeType.DELETED: - self._remove_file(event.path) - result.files_removed += 1 - - elif event.change_type == ChangeType.MOVED: - # Remove from old location, add at new location - if event.old_path: - self._remove_file(event.old_path) - result.files_removed += 1 - file_result = self._index_file(event.path) - if file_result.success: - result.files_indexed += 1 - result.symbols_added += file_result.symbols_count - else: - result.errors.append(file_result.error or f"Failed to index: {event.path}") - - except Exception as exc: - error_msg = f"Error processing {event.path}: {type(exc).__name__}: {exc}" - logger.error(error_msg) - result.errors.append(error_msg) - - return result - - def _index_file(self, path: Path) -> FileIndexResult: - """Index a single file. 
- - Args: - path: Path to the file to index - - Returns: - FileIndexResult with status - """ - path = Path(path).resolve() - - # Check if file exists - if not path.exists(): - return FileIndexResult( - path=path, - symbols_count=0, - success=False, - error=f"File not found: {path}", - ) - - # Check if language is supported - language = self.config.language_for_path(path) - if not language: - return FileIndexResult( - path=path, - symbols_count=0, - success=False, - error=f"Unsupported language for: {path}", - ) - - # Get directory store - dir_path = path.parent - store = self._get_dir_store(dir_path) - if store is None: - return FileIndexResult( - path=path, - symbols_count=0, - success=False, - error=f"Directory not indexed: {dir_path}", - ) - - # Read file content with fallback encodings - try: - content = path.read_text(encoding="utf-8") - except UnicodeDecodeError: - logger.debug("UTF-8 decode failed for %s, using fallback with errors='ignore'", path) - try: - content = path.read_text(encoding="utf-8", errors="ignore") - except Exception as exc: - return FileIndexResult( - path=path, - symbols_count=0, - success=False, - error=f"Failed to read file: {exc}", - ) - except Exception as exc: - return FileIndexResult( - path=path, - symbols_count=0, - success=False, - error=f"Failed to read file: {exc}", - ) - - # Parse symbols - try: - parser = self.parser_factory.get_parser(language) - indexed_file = parser.parse(content, path) - except Exception as exc: - error_msg = f"Failed to parse {path}: {type(exc).__name__}: {exc}" - logger.error(error_msg) - return FileIndexResult( - path=path, - symbols_count=0, - success=False, - error=error_msg, - ) - - # Update store with retry logic for transient database errors - max_retries = 3 - for attempt in range(max_retries): - try: - store.add_file( - name=path.name, - full_path=str(path), - content=content, - language=language, - symbols=indexed_file.symbols, - relationships=indexed_file.relationships, - ) - - # Update merkle 
root - store.update_merkle_root() - - # Update global relationships for static graph expansion (best-effort). - if getattr(self.config, "static_graph_enabled", False): - try: - source_root = self.mapper.get_project_root(path) or dir_path - index_root = self.mapper.source_to_index_dir(source_root) - global_index = self._get_global_index(index_root, source_root=source_root) - if global_index is not None: - allowed_types = set( - getattr( - self.config, - "static_graph_relationship_types", - ["imports", "inherits"], - ) - or [] - ) - filtered_rels = [ - r - for r in (indexed_file.relationships or []) - if r.relationship_type.value in allowed_types - ] - global_index.update_file_relationships(path, filtered_rels) - except Exception as exc: - logger.debug( - "Failed to update global relationships for %s: %s", - path, - exc, - ) - - logger.debug("Indexed file: %s (%d symbols)", path, len(indexed_file.symbols)) - - return FileIndexResult( - path=path, - symbols_count=len(indexed_file.symbols), - success=True, - ) - - except __import__("sqlite3").OperationalError as exc: - # Transient database errors (e.g., database locked) - if attempt < max_retries - 1: - import time - wait_time = 0.1 * (2 ** attempt) # Exponential backoff - logger.debug("Database operation failed (attempt %d/%d), retrying in %.2fs: %s", - attempt + 1, max_retries, wait_time, exc) - time.sleep(wait_time) - continue - else: - error_msg = f"Failed to store {path} after {max_retries} attempts: {exc}" - logger.error(error_msg) - return FileIndexResult( - path=path, - symbols_count=0, - success=False, - error=error_msg, - ) - except Exception as exc: - error_msg = f"Failed to store {path}: {type(exc).__name__}: {exc}" - logger.error(error_msg) - return FileIndexResult( - path=path, - symbols_count=0, - success=False, - error=error_msg, - ) - - # Should never reach here - return FileIndexResult( - path=path, - symbols_count=0, - success=False, - error="Unexpected error in indexing loop", - ) - - def 
_remove_file(self, path: Path) -> bool: - """Remove a file from the index. - - Args: - path: Path to the file to remove - - Returns: - True if removed successfully - """ - path = Path(path).resolve() - dir_path = path.parent - - store = self._get_dir_store(dir_path) - if store is None: - logger.debug("Cannot remove file, directory not indexed: %s", dir_path) - return False - - # Retry logic for transient database errors - max_retries = 3 - for attempt in range(max_retries): - try: - store.remove_file(str(path)) - store.update_merkle_root() - - # Best-effort cleanup of static graph relationships (keeps global DB consistent). - if getattr(self.config, "static_graph_enabled", False): - try: - source_root = self.mapper.get_project_root(path) or dir_path - index_root = self.mapper.source_to_index_dir(source_root) - global_index = self._get_global_index(index_root, source_root=source_root) - if global_index is not None: - global_index.delete_file_relationships(path) - except Exception as exc: - logger.debug( - "Failed to delete global relationships for %s: %s", - path, - exc, - ) - logger.debug("Removed file from index: %s", path) - return True - - except __import__("sqlite3").OperationalError as exc: - # Transient database errors (e.g., database locked) - if attempt < max_retries - 1: - import time - wait_time = 0.1 * (2 ** attempt) # Exponential backoff - logger.debug("Database operation failed (attempt %d/%d), retrying in %.2fs: %s", - attempt + 1, max_retries, wait_time, exc) - time.sleep(wait_time) - continue - else: - logger.error("Failed to remove %s after %d attempts: %s", path, max_retries, exc) - return False - except Exception as exc: - logger.error("Failed to remove %s: %s", path, exc) - return False - - # Should never reach here - return False - - def close(self) -> None: - """Close all open stores.""" - with self._lock: - for store in self._dir_stores.values(): - try: - store.close() - except Exception: - pass - self._dir_stores.clear() - - if 
self._global_index: - try: - self._global_index.close() - except Exception: - pass - self._global_index = None diff --git a/codex-lens/src/codexlens/watcher/manager.py b/codex-lens/src/codexlens/watcher/manager.py deleted file mode 100644 index 5a5653d4..00000000 --- a/codex-lens/src/codexlens/watcher/manager.py +++ /dev/null @@ -1,255 +0,0 @@ -"""Watcher manager for coordinating file watching and incremental indexing.""" - -from __future__ import annotations - -import json -import logging -import signal -import threading -import time -from pathlib import Path -from typing import Callable, List, Optional - -from codexlens.config import Config -from codexlens.storage.path_mapper import PathMapper -from codexlens.storage.registry import RegistryStore - -from .events import FileEvent, IndexResult, PendingQueueStatus, WatcherConfig, WatcherStats -from .file_watcher import FileWatcher -from .incremental_indexer import IncrementalIndexer - -logger = logging.getLogger(__name__) - - -class WatcherManager: - """High-level manager for file watching and incremental indexing. 
- - Coordinates FileWatcher and IncrementalIndexer with: - - Lifecycle management (start/stop) - - Signal handling (SIGINT/SIGTERM) - - Statistics tracking - - Graceful shutdown - """ - - def __init__( - self, - root_path: Path, - config: Optional[Config] = None, - watcher_config: Optional[WatcherConfig] = None, - on_indexed: Optional[Callable[[IndexResult], None]] = None, - on_queue_change: Optional[Callable[[PendingQueueStatus], None]] = None, - ) -> None: - self.root_path = Path(root_path).resolve() - self.config = config or Config() - self.watcher_config = watcher_config or WatcherConfig() - self.on_indexed = on_indexed - self.on_queue_change = on_queue_change - - self._registry: Optional[RegistryStore] = None - self._mapper: Optional[PathMapper] = None - self._watcher: Optional[FileWatcher] = None - self._indexer: Optional[IncrementalIndexer] = None - - self._running = False - self._stop_event = threading.Event() - self._lock = threading.RLock() - - # Statistics - self._stats = WatcherStats() - self._original_sigint = None - self._original_sigterm = None - - # Index history for tracking recent results - self._index_history: List[IndexResult] = [] - self._max_history_size = 10 - - def _handle_changes(self, events: List[FileEvent]) -> None: - """Handle file change events from watcher.""" - if not self._indexer or not events: - return - - logger.info("Processing %d file changes", len(events)) - result = self._indexer.process_changes(events) - - # Update stats - self._stats.events_processed += len(events) - self._stats.last_event_time = time.time() - - # Save to history - self._index_history.append(result) - if len(self._index_history) > self._max_history_size: - self._index_history.pop(0) - - if result.files_indexed > 0 or result.files_removed > 0: - logger.info( - "Indexed %d files, removed %d files, %d errors", - result.files_indexed, result.files_removed, len(result.errors) - ) - - # Output JSON for TypeScript backend parsing - result_data = { - 
"files_indexed": result.files_indexed, - "files_removed": result.files_removed, - "symbols_added": result.symbols_added, - "symbols_removed": result.symbols_removed, - "files_success": result.files_success[:20], # Limit output - "files_failed": result.files_failed[:20], - "errors": result.errors[:10], - "timestamp": result.timestamp - } - print(f"[INDEX_RESULT] {json.dumps(result_data)}", flush=True) - - if self.on_indexed: - try: - self.on_indexed(result) - except Exception as exc: - logger.error("Error in on_indexed callback: %s", exc) - - def _signal_handler(self, signum, frame) -> None: - """Handle shutdown signals.""" - logger.info("Received signal %d, stopping...", signum) - self.stop() - - def _install_signal_handlers(self) -> None: - """Install signal handlers for graceful shutdown.""" - try: - self._original_sigint = signal.signal(signal.SIGINT, self._signal_handler) - if hasattr(signal, 'SIGTERM'): - self._original_sigterm = signal.signal(signal.SIGTERM, self._signal_handler) - except (ValueError, OSError): - # Signal handling not available (e.g., not main thread) - pass - - def _restore_signal_handlers(self) -> None: - """Restore original signal handlers.""" - try: - if self._original_sigint is not None: - signal.signal(signal.SIGINT, self._original_sigint) - if self._original_sigterm is not None and hasattr(signal, 'SIGTERM'): - signal.signal(signal.SIGTERM, self._original_sigterm) - except (ValueError, OSError): - pass - - def start(self) -> None: - """Start watching and indexing.""" - with self._lock: - if self._running: - logger.warning("WatcherManager already running") - return - - # Validate path - if not self.root_path.exists(): - raise ValueError(f"Root path does not exist: {self.root_path}") - - # Initialize components - self._registry = RegistryStore() - self._registry.initialize() - self._mapper = PathMapper() - - self._indexer = IncrementalIndexer( - self._registry, self._mapper, self.config - ) - - self._watcher = FileWatcher( - 
self.root_path, self.watcher_config, self._handle_changes - ) - - # Always register queue change callback for stdout output (TypeScript backend) - # The wrapper prints [QUEUE_STATUS] JSON and optionally calls on_queue_change - self._watcher.register_queue_change_callback(self._on_queue_change_wrapper) - - # Install signal handlers - self._install_signal_handlers() - - # Start watcher - self._running = True - self._stats.is_running = True - self._stop_event.clear() - self._watcher.start() - - logger.info("WatcherManager started for: %s", self.root_path) - - def stop(self) -> None: - """Stop watching and clean up.""" - with self._lock: - if not self._running: - return - - self._running = False - self._stats.is_running = False - self._stop_event.set() - - # Stop watcher - if self._watcher: - self._watcher.stop() - self._watcher = None - - # Close indexer - if self._indexer: - self._indexer.close() - self._indexer = None - - # Close registry - if self._registry: - self._registry.close() - self._registry = None - - # Restore signal handlers - self._restore_signal_handlers() - - logger.info("WatcherManager stopped") - - def wait(self) -> None: - """Block until stopped.""" - try: - while self._running: - self._stop_event.wait(timeout=1.0) - except KeyboardInterrupt: - logger.info("Interrupted, stopping...") - self.stop() - - @property - def is_running(self) -> bool: - """Check if manager is running.""" - return self._running - - def get_stats(self) -> WatcherStats: - """Get runtime statistics.""" - return WatcherStats( - files_watched=self._stats.files_watched, - events_processed=self._stats.events_processed, - last_event_time=self._stats.last_event_time, - is_running=self._running, - ) - - def _on_queue_change_wrapper(self, status: PendingQueueStatus) -> None: - """Wrapper for queue change callback with JSON output.""" - # Output JSON for TypeScript backend parsing - status_data = { - "file_count": status.file_count, - "files": status.files, - "countdown_seconds": 
status.countdown_seconds, - "last_event_time": status.last_event_time - } - print(f"[QUEUE_STATUS] {json.dumps(status_data)}", flush=True) - - if self.on_queue_change: - try: - self.on_queue_change(status) - except Exception as exc: - logger.error("Error in on_queue_change callback: %s", exc) - - def flush_now(self) -> None: - """Immediately flush pending queue (manual trigger).""" - if self._watcher: - self._watcher.flush_now() - - def get_pending_queue_status(self) -> Optional[PendingQueueStatus]: - """Get current pending queue status.""" - if self._watcher: - return self._watcher.get_pending_queue_status() - return None - - def get_index_history(self, limit: int = 5) -> List[IndexResult]: - """Get recent index history.""" - return self._index_history[-limit:] diff --git a/codex-lens/test_chain_search.py b/codex-lens/test_chain_search.py deleted file mode 100644 index d2ed55c0..00000000 --- a/codex-lens/test_chain_search.py +++ /dev/null @@ -1,146 +0,0 @@ -"""Test script for chain search engine functionality.""" - -from pathlib import Path -from codexlens.search import ChainSearchEngine, SearchOptions, quick_search -from codexlens.storage.registry import RegistryStore -from codexlens.storage.path_mapper import PathMapper - - -def test_basic_search(): - """Test basic chain search functionality.""" - print("=== Testing Chain Search Engine ===\n") - - # Initialize components - registry = RegistryStore() - registry.initialize() - mapper = PathMapper() - - # Create engine - engine = ChainSearchEngine(registry, mapper) - print(f"[OK] ChainSearchEngine initialized") - - # Test search options - options = SearchOptions( - depth=-1, - max_workers=4, - limit_per_dir=10, - total_limit=50, - include_symbols=False, - files_only=False - ) - print(f"[OK] SearchOptions configured: depth={options.depth}, workers={options.max_workers}") - - # Test path that exists in the current project - test_path = Path("D:/Claude_dms3/codex-lens/src/codexlens") - - if test_path.exists(): - 
print(f"\n[OK] Test path exists: {test_path}") - - # Perform search - result = engine.search("search", test_path, options) - - print(f"\n=== Search Results ===") - print(f"Query: '{result.query}'") - print(f"Directories searched: {result.stats.dirs_searched}") - print(f"Files matched: {result.stats.files_matched}") - print(f"Time: {result.stats.time_ms:.2f}ms") - - if result.stats.errors: - print(f"Errors: {len(result.stats.errors)}") - for err in result.stats.errors[:3]: - print(f" - {err}") - - print(f"\nTop Results (showing first 5):") - for i, res in enumerate(result.results[:5], 1): - print(f"{i}. {res.path}") - print(f" Score: {res.score:.2f}") - if res.excerpt: - excerpt = res.excerpt.replace('\n', ' ')[:100] - print(f" Excerpt: {excerpt}...") - else: - print(f"\n[SKIP] Test path does not exist: {test_path}") - print(" (Index may not be built yet)") - - registry.close() - print("\n[OK] Test completed") - - -def test_quick_search(): - """Test quick_search convenience function.""" - print("\n\n=== Testing Quick Search ===\n") - - test_path = Path("D:/Claude_dms3/codex-lens/src") - - if test_path.exists(): - results = quick_search("index", test_path, depth=2) - print(f"[OK] Quick search completed") - print(f" Found {len(results)} results") - if results: - print(f" Top result: {results[0].path}") - else: - print(f"[SKIP] Test path does not exist: {test_path}") - - print("\n[OK] Quick search test completed") - - -def test_symbol_search(): - """Test symbol search functionality.""" - print("\n\n=== Testing Symbol Search ===\n") - - registry = RegistryStore() - registry.initialize() - mapper = PathMapper() - engine = ChainSearchEngine(registry, mapper) - - test_path = Path("D:/Claude_dms3/codex-lens/src/codexlens") - - if test_path.exists(): - symbols = engine.search_symbols("search", test_path, kind=None) - print(f"[OK] Symbol search completed") - print(f" Found {len(symbols)} symbols") - for i, sym in enumerate(symbols[:5], 1): - print(f" {i}. 
{sym.name} ({sym.kind}) - lines {sym.range[0]}-{sym.range[1]}") - else: - print(f"[SKIP] Test path does not exist: {test_path}") - - registry.close() - print("\n[OK] Symbol search test completed") - - -def test_files_only_search(): - """Test files-only search mode.""" - print("\n\n=== Testing Files-Only Search ===\n") - - registry = RegistryStore() - registry.initialize() - mapper = PathMapper() - engine = ChainSearchEngine(registry, mapper) - - test_path = Path("D:/Claude_dms3/codex-lens/src") - - if test_path.exists(): - file_paths = engine.search_files_only("class", test_path) - print(f"[OK] Files-only search completed") - print(f" Found {len(file_paths)} files") - for i, path in enumerate(file_paths[:5], 1): - print(f" {i}. {path}") - else: - print(f"[SKIP] Test path does not exist: {test_path}") - - registry.close() - print("\n[OK] Files-only search test completed") - - -if __name__ == "__main__": - try: - test_basic_search() - test_quick_search() - test_symbol_search() - test_files_only_search() - print("\n" + "=" * 50) - print("All tests completed successfully!") - print("=" * 50) - except Exception as e: - print(f"\n[ERROR] Test failed with error: {e}") - import traceback - traceback.print_exc() diff --git a/codex-lens/test_simple_function.py b/codex-lens/test_simple_function.py deleted file mode 100644 index 19fbf4ab..00000000 --- a/codex-lens/test_simple_function.py +++ /dev/null @@ -1,19 +0,0 @@ -"""Simple test file with clear function definitions.""" - -def hello_world(): - """A simple function.""" - return "Hello, World!" - -def greet(name: str) -> str: - """Greet someone by name.""" - return f"Hello, {name}!" 
- -def main(): - """Main function that calls other functions.""" - msg = hello_world() - greeting = greet("Alice") - print(msg) - print(greeting) - -if __name__ == "__main__": - main() diff --git a/codex-lens/tests/TEST_SUITE_SUMMARY.md b/codex-lens/tests/TEST_SUITE_SUMMARY.md deleted file mode 100644 index 889372b2..00000000 --- a/codex-lens/tests/TEST_SUITE_SUMMARY.md +++ /dev/null @@ -1,347 +0,0 @@ -# Hybrid Search Test Suite Summary - -## Overview - -Comprehensive test suite for hybrid search components covering Dual-FTS schema, encoding detection, incremental indexing, RRF fusion, query parsing, and end-to-end workflows. - -## Test Coverage - -### ✅ test_rrf_fusion.py (29 tests - 100% passing) -**Module Tested**: `codexlens.search.ranking` - -**Coverage**: -- ✅ Reciprocal Rank Fusion algorithm (9 tests) - - Single/multiple source ranking - - RRF score calculation with custom k values - - Weight handling and normalization - - Fusion score metadata storage -- ✅ Synthetic ranking scenarios (4 tests) - - Perfect agreement between sources - - Complete disagreement handling - - Partial overlap fusion - - Three-source fusion (exact, fuzzy, vector) -- ✅ BM25 score normalization (4 tests) - - Negative score handling - - 0-1 range normalization - - Better match = higher score validation -- ✅ Search source tagging (4 tests) - - Metadata preservation - - Source tracking for RRF -- ✅ Parameterized k-value tests (3 tests) -- ✅ Edge cases (5 tests) - - Duplicate paths - - Large result lists (1000 items) - - Missing weights handling - -**Key Test Examples**: -```python -def test_two_sources_fusion(): - """Test RRF combines rankings from two sources.""" - exact_results = [SearchResult(path="a.py", score=10.0, ...)] - fuzzy_results = [SearchResult(path="b.py", score=9.0, ...)] - fused = reciprocal_rank_fusion({"exact": exact, "fuzzy": fuzzy}) - # Items in both sources rank highest -``` - ---- - -### ✅ test_query_parser.py (47 tests - 100% passing) -**Module Tested**: 
`codexlens.search.query_parser` - -**Coverage**: -- ✅ CamelCase splitting (4 tests) - - `UserAuth` → `UserAuth OR User OR Auth` - - lowerCamelCase handling - - ALL_CAPS acronym preservation -- ✅ snake_case splitting (3 tests) - - `get_user_data` → `get_user_data OR get OR user OR data` -- ✅ kebab-case splitting (2 tests) -- ✅ Query expansion logic (5 tests) - - OR operator insertion - - Original query preservation - - Token deduplication - - min_token_length filtering -- ✅ FTS5 operator preservation (7 tests) - - Quoted phrases not expanded - - OR/AND/NOT/NEAR operators preserved - - Wildcard queries (`auth*`) preserved -- ✅ Multi-word queries (2 tests) -- ✅ Parameterized splitting (5 tests covering all formats) -- ✅ Edge cases (6 tests) - - Unicode identifiers - - Very long identifiers - - Mixed case styles -- ✅ Token extraction internals (4 tests) -- ✅ Integration tests (2 tests) - - Real-world query examples - - Performance (1000 queries) -- ✅ Min token length configuration (3 tests) - -**Key Test Examples**: -```python -@pytest.mark.parametrize("query,expected_tokens", [ - ("UserAuth", ["UserAuth", "User", "Auth"]), - ("get_user_data", ["get_user_data", "get", "user", "data"]), -]) -def test_identifier_splitting(query, expected_tokens): - parser = QueryParser() - result = parser.preprocess_query(query) - for token in expected_tokens: - assert token in result -``` - ---- - -### ⚠️ test_encoding.py (34 tests - 24 passing, 7 failing, 3 skipped) -**Module Tested**: `codexlens.parsers.encoding` - -**Passing Coverage**: -- ✅ Encoding availability detection (2 tests) -- ✅ Basic encoding detection (3 tests) -- ✅ read_file_safe functionality (9 tests) - - UTF-8, GBK, Latin-1 file reading - - Error replacement with `errors='replace'` - - Empty files, nonexistent files, directories -- ✅ Binary file detection (7 tests) - - Null byte detection - - Non-text character ratio - - Sample size parameter -- ✅ Parameterized encoding tests (4 tests) - - UTF-8, GBK, ISO-8859-1, 
Windows-1252 - -**Known Issues** (7 failing tests): -- Chardet-specific tests failing due to mock/patch issues -- Tests expect exact encoding detection behavior -- **Resolution**: Tests work correctly when chardet is available, mock issues are minor - ---- - -### ⚠️ test_dual_fts.py (17 tests - needs API fixes) -**Module Tested**: `codexlens.storage.dir_index` (Dual-FTS schema) - -**Test Structure**: -- 🔧 Dual FTS schema creation (4 tests) - - `files_fts_exact` and `files_fts_fuzzy` table existence - - Tokenizer validation (unicode61 for exact, trigram for fuzzy) -- 🔧 Trigger synchronization (3 tests) - - INSERT/UPDATE/DELETE triggers - - Content sync between tables -- 🔧 Migration tests (4 tests) - - v2 → v4 migration - - Data preservation - - Schema version updates - - Idempotency -- 🔧 Trigram availability (1 test) - - Fallback to unicode61 when trigram unavailable -- 🔧 Performance benchmarks (2 tests) - - INSERT overhead measurement - - Search performance on exact/fuzzy FTS - -**Required Fix**: Replace `_connect()` with `_get_connection()` to match DirIndexStore API - ---- - -### ⚠️ test_incremental_indexing.py (14 tests - needs API fixes) -**Module Tested**: `codexlens.storage.dir_index` (mtime tracking) - -**Test Structure**: -- 🔧 Mtime tracking (4 tests) - - needs_reindex() logic for new/unchanged/modified files - - mtime column validation -- 🔧 Incremental update workflows (3 tests) - - ≥90% skip rate verification - - Modified file detection - - New file detection -- 🔧 Deleted file cleanup (2 tests) - - Nonexistent file removal - - Existing file preservation -- 🔧 Mtime edge cases (3 tests) - - Floating-point precision - - NULL mtime handling - - Future mtime (clock skew) -- 🔧 Performance benchmarks (2 tests) - - Skip rate on 1000 files - - Cleanup performance - -**Required Fix**: Same as dual_fts.py - API method name correction - ---- - -### ⚠️ test_hybrid_search_e2e.py (30 tests - needs API fixes) -**Module Tested**: `codexlens.search.hybrid_search` + full 
pipeline - -**Test Structure**: -- 🔧 Basic engine tests (3 tests) - - Initialization with default/custom weights - - Empty index handling -- 🔧 Sample project tests (7 tests) - - Exact/fuzzy/hybrid search modes - - Python + TypeScript project structure - - CamelCase/snake_case query expansion - - Partial identifier matching -- 🔧 Relevance ranking (3 tests) - - Exact match ranking - - Hybrid RRF fusion improvement -- 🔧 Performance tests (2 tests) - - Search latency benchmarks - - Hybrid overhead (<2x exact search) -- 🔧 Edge cases (5 tests) - - Empty index - - No matches - - Special characters - - Unicode queries - - Very long queries -- 🔧 Integration workflows (2 tests) - - Index → search → refine - - Result consistency - -**Required Fix**: API method corrections - ---- - -## Test Statistics - -| Test File | Total | Passing | Failing | Skipped | -|-----------|-------|---------|---------|---------| -| test_rrf_fusion.py | 29 | 29 | 0 | 0 | -| test_query_parser.py | 47 | 47 | 0 | 0 | -| test_encoding.py | 34 | 24 | 7 | 3 | -| test_dual_fts.py | 17 | 0* | 17* | 0 | -| test_incremental_indexing.py | 14 | 0* | 14* | 0 | -| test_hybrid_search_e2e.py | 30 | 0* | 30* | 0 | -| **TOTAL** | **171** | **100** | **68** | **3** | - -*Requires minor API fixes (method name corrections) - ---- - -## Accomplishments - -### ✅ Fully Implemented -1. **RRF Fusion Testing** (29 tests) - - Complete coverage of reciprocal rank fusion algorithm - - Synthetic ranking scenarios validation - - BM25 normalization testing - - Weight handling and edge cases - -2. **Query Parser Testing** (47 tests) - - Comprehensive identifier splitting coverage - - CamelCase, snake_case, kebab-case expansion - - FTS5 operator preservation - - Parameterized tests for all formats - - Performance and integration tests - -3. 
**Encoding Detection Testing** (34 tests - 24 passing) - - UTF-8, GBK, Latin-1, Windows-1252 support - - Binary file detection heuristics - - Safe file reading with error replacement - - Chardet integration tests - -### 🔧 Implemented (Needs Minor Fixes) -4. **Dual-FTS Schema Testing** (17 tests) - - Schema creation and migration - - Trigger synchronization - - Trigram tokenizer availability - - Performance benchmarks - -5. **Incremental Indexing Testing** (14 tests) - - Mtime-based change detection - - ≥90% skip rate validation - - Deleted file cleanup - - Edge case handling - -6. **Hybrid Search E2E Testing** (30 tests) - - Complete workflow testing - - Sample project structure - - Relevance ranking validation - - Performance benchmarks - ---- - -## Test Execution Examples - -### Run All Working Tests -```bash -cd codex-lens -python -m pytest tests/test_rrf_fusion.py tests/test_query_parser.py -v -``` - -### Run Encoding Tests (with optional dependencies) -```bash -pip install chardet # Optional for encoding detection -python -m pytest tests/test_encoding.py -v -``` - -### Run All Tests (including failing ones for debugging) -```bash -python -m pytest tests/test_*.py -v --tb=short -``` - -### Run with Coverage -```bash -python -m pytest tests/test_rrf_fusion.py tests/test_query_parser.py --cov=codexlens.search --cov-report=term -``` - ---- - -## Quick Fixes Required - -### Fix DirIndexStore API References -All database-related tests need one change: -- Replace: `with store._connect() as conn:` -- With: `conn = store._get_connection()` - -**Files to Fix**: -1. `test_dual_fts.py` - 17 tests -2. `test_incremental_indexing.py` - 14 tests -3. 
`test_hybrid_search_e2e.py` - 30 tests - -**Example Fix**: -```python -# Before (incorrect) -with index_store._connect() as conn: - conn.execute("SELECT * FROM files") - -# After (correct) -conn = index_store._get_connection() -conn.execute("SELECT * FROM files") -``` - ---- - -## Coverage Goals Achieved - -✅ **50+ test cases** across all components (171 total) -✅ **90%+ code coverage** on new modules (RRF, query parser) -✅ **Integration tests** verify end-to-end workflows -✅ **Performance benchmarks** measure latency and overhead -✅ **Parameterized tests** cover multiple input variations -✅ **Edge case handling** for Unicode, special chars, empty inputs - ---- - -## Next Steps - -1. **Apply API fixes** to database tests (est. 15 min) -2. **Run full test suite** with `pytest --cov` -3. **Verify ≥90% coverage** on hybrid search modules -4. **Document any optional dependencies** (chardet for encoding) -5. **Add pytest markers** for benchmark tests - ---- - -## Test Quality Features - -- ✅ **Fixture-based setup** for database isolation -- ✅ **Temporary files** prevent test pollution -- ✅ **Parameterized tests** reduce duplication -- ✅ **Benchmark markers** for performance tests -- ✅ **Skip markers** for optional dependencies -- ✅ **Clear assertions** with descriptive messages -- ✅ **Mocking** for external dependencies (chardet) - ---- - -**Generated**: 2025-12-16 -**Test Framework**: pytest 8.4.2 -**Python Version**: 3.13.5 diff --git a/codex-lens/tests/__init__.py b/codex-lens/tests/__init__.py deleted file mode 100644 index 263cbbec..00000000 --- a/codex-lens/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""CodexLens test suite.""" diff --git a/codex-lens/tests/api/test_references.py b/codex-lens/tests/api/test_references.py deleted file mode 100644 index b50bed9b..00000000 --- a/codex-lens/tests/api/test_references.py +++ /dev/null @@ -1,282 +0,0 @@ -"""Tests for codexlens.api.references module.""" - -import os -import tempfile -from pathlib import Path -from 
unittest.mock import MagicMock, patch - -import pytest - -from codexlens.api.references import ( - find_references, - _read_line_from_file, - _proximity_score, - _group_references_by_definition, - _transform_to_reference_result, -) -from codexlens.api.models import ( - DefinitionResult, - ReferenceResult, - GroupedReferences, -) - - -class TestReadLineFromFile: - """Tests for _read_line_from_file helper.""" - - def test_read_existing_line(self, tmp_path): - """Test reading an existing line from a file.""" - test_file = tmp_path / "test.py" - test_file.write_text("line 1\nline 2\nline 3\n") - - assert _read_line_from_file(str(test_file), 1) == "line 1" - assert _read_line_from_file(str(test_file), 2) == "line 2" - assert _read_line_from_file(str(test_file), 3) == "line 3" - - def test_read_nonexistent_line(self, tmp_path): - """Test reading a line that doesn't exist.""" - test_file = tmp_path / "test.py" - test_file.write_text("line 1\nline 2\n") - - assert _read_line_from_file(str(test_file), 10) == "" - - def test_read_nonexistent_file(self): - """Test reading from a file that doesn't exist.""" - assert _read_line_from_file("/nonexistent/path/file.py", 1) == "" - - def test_strips_trailing_whitespace(self, tmp_path): - """Test that trailing whitespace is stripped.""" - test_file = tmp_path / "test.py" - test_file.write_text("line with spaces \n") - - assert _read_line_from_file(str(test_file), 1) == "line with spaces" - - -class TestProximityScore: - """Tests for _proximity_score helper.""" - - def test_same_file(self): - """Same file should return highest score.""" - score = _proximity_score("/a/b/c.py", "/a/b/c.py") - assert score == 1000 - - def test_same_directory(self): - """Same directory should return 100.""" - score = _proximity_score("/a/b/x.py", "/a/b/y.py") - assert score == 100 - - def test_different_directories(self): - """Different directories should return common prefix length.""" - score = _proximity_score("/a/b/c/x.py", "/a/b/d/y.py") - # Common 
path is /a/b - assert score > 0 - - def test_empty_paths(self): - """Empty paths should return 0.""" - assert _proximity_score("", "/a/b/c.py") == 0 - assert _proximity_score("/a/b/c.py", "") == 0 - assert _proximity_score("", "") == 0 - - -class TestGroupReferencesByDefinition: - """Tests for _group_references_by_definition helper.""" - - def test_single_definition(self): - """Single definition should have all references.""" - definition = DefinitionResult( - name="foo", - kind="function", - file_path="/a/b/c.py", - line=10, - end_line=20, - ) - references = [ - ReferenceResult( - file_path="/a/b/d.py", - line=5, - column=0, - context_line="foo()", - relationship="call", - ), - ReferenceResult( - file_path="/a/x/y.py", - line=10, - column=0, - context_line="foo()", - relationship="call", - ), - ] - - result = _group_references_by_definition([definition], references) - - assert len(result) == 1 - assert result[0].definition == definition - assert len(result[0].references) == 2 - - def test_multiple_definitions(self): - """Multiple definitions should group by proximity.""" - def1 = DefinitionResult( - name="foo", - kind="function", - file_path="/a/b/c.py", - line=10, - end_line=20, - ) - def2 = DefinitionResult( - name="foo", - kind="function", - file_path="/x/y/z.py", - line=10, - end_line=20, - ) - - # Reference closer to def1 - ref1 = ReferenceResult( - file_path="/a/b/d.py", - line=5, - column=0, - context_line="foo()", - relationship="call", - ) - # Reference closer to def2 - ref2 = ReferenceResult( - file_path="/x/y/w.py", - line=10, - column=0, - context_line="foo()", - relationship="call", - ) - - result = _group_references_by_definition( - [def1, def2], [ref1, ref2], include_definition=True - ) - - assert len(result) == 2 - # Each definition should have the closer reference - def1_refs = [g for g in result if g.definition == def1][0].references - def2_refs = [g for g in result if g.definition == def2][0].references - - assert any(r.file_path == "/a/b/d.py" 
for r in def1_refs) - assert any(r.file_path == "/x/y/w.py" for r in def2_refs) - - def test_empty_definitions(self): - """Empty definitions should return empty result.""" - result = _group_references_by_definition([], []) - assert result == [] - - -class TestTransformToReferenceResult: - """Tests for _transform_to_reference_result helper.""" - - def test_normalizes_relationship_type(self, tmp_path): - """Test that relationship type is normalized.""" - test_file = tmp_path / "test.py" - test_file.write_text("def foo(): pass\n") - - # Create a mock raw reference - raw_ref = MagicMock() - raw_ref.file_path = str(test_file) - raw_ref.line = 1 - raw_ref.column = 0 - raw_ref.relationship_type = "calls" # Plural form - - result = _transform_to_reference_result(raw_ref) - - assert result.relationship == "call" # Normalized form - assert result.context_line == "def foo(): pass" - - -class TestFindReferences: - """Tests for find_references API function.""" - - def test_raises_for_invalid_project_root(self): - """Test that ValueError is raised for invalid project root.""" - with pytest.raises(ValueError, match="does not exist"): - find_references("/nonexistent/path", "some_symbol") - - @patch("codexlens.search.chain_search.ChainSearchEngine") - @patch("codexlens.storage.registry.RegistryStore") - @patch("codexlens.storage.path_mapper.PathMapper") - @patch("codexlens.config.Config") - def test_returns_grouped_references( - self, mock_config, mock_mapper, mock_registry, mock_engine_class, tmp_path - ): - """Test that find_references returns GroupedReferences.""" - # Setup mocks - mock_engine = MagicMock() - mock_engine_class.return_value = mock_engine - - # Mock symbol search (for definitions) - mock_symbol = MagicMock() - mock_symbol.name = "test_func" - mock_symbol.kind = "function" - mock_symbol.file = str(tmp_path / "test.py") - mock_symbol.range = (10, 20) - mock_engine.search_symbols.return_value = [mock_symbol] - - # Mock reference search - mock_ref = MagicMock() - 
mock_ref.file_path = str(tmp_path / "caller.py") - mock_ref.line = 5 - mock_ref.column = 0 - mock_ref.relationship_type = "call" - mock_engine.search_references.return_value = [mock_ref] - - # Create test files - test_file = tmp_path / "test.py" - test_file.write_text("def test_func():\n pass\n") - caller_file = tmp_path / "caller.py" - caller_file.write_text("test_func()\n") - - # Call find_references - result = find_references(str(tmp_path), "test_func") - - # Verify result structure - assert isinstance(result, list) - assert len(result) == 1 - assert isinstance(result[0], GroupedReferences) - assert result[0].definition.name == "test_func" - assert len(result[0].references) == 1 - - @patch("codexlens.search.chain_search.ChainSearchEngine") - @patch("codexlens.storage.registry.RegistryStore") - @patch("codexlens.storage.path_mapper.PathMapper") - @patch("codexlens.config.Config") - def test_respects_include_definition_false( - self, mock_config, mock_mapper, mock_registry, mock_engine_class, tmp_path - ): - """Test include_definition=False behavior.""" - mock_engine = MagicMock() - mock_engine_class.return_value = mock_engine - mock_engine.search_symbols.return_value = [] - mock_engine.search_references.return_value = [] - - result = find_references( - str(tmp_path), "test_func", include_definition=False - ) - - # Should still return a result with placeholder definition - assert len(result) == 1 - assert result[0].definition.name == "test_func" - - -class TestImports: - """Tests for module imports and exports.""" - - def test_find_references_exported_from_api(self): - """Test that find_references is exported from codexlens.api.""" - from codexlens.api import find_references as api_find_references - - assert callable(api_find_references) - - def test_models_exported_from_api(self): - """Test that result models are exported from codexlens.api.""" - from codexlens.api import ( - GroupedReferences, - ReferenceResult, - DefinitionResult, - ) - - assert 
GroupedReferences is not None - assert ReferenceResult is not None - assert DefinitionResult is not None diff --git a/codex-lens/tests/api/test_semantic_integration.py b/codex-lens/tests/api/test_semantic_integration.py deleted file mode 100644 index 3c54f82b..00000000 --- a/codex-lens/tests/api/test_semantic_integration.py +++ /dev/null @@ -1,264 +0,0 @@ -"""Integration tests for semantic.py API - fusion strategy routing and result transform. - -Tests cover: -- _execute_search: Strategy routing for rrf, binary, staged, hybrid (compat), dense_rerank -- _transform_results: Score extraction and kind filtering -""" - -from __future__ import annotations - -from pathlib import Path -from typing import List, Optional -from unittest.mock import MagicMock, Mock, patch - -import pytest - -from codexlens.api.models import SemanticResult -from codexlens.api.semantic import _execute_search, _transform_results -from codexlens.entities import SearchResult -from codexlens.search.chain_search import ( - ChainSearchEngine, - ChainSearchResult, - SearchOptions, - SearchStats, -) - - -# ============================================================================= -# Test Fixtures -# ============================================================================= - - -@pytest.fixture -def mock_engine(): - """Create mock ChainSearchEngine.""" - engine = MagicMock(spec=ChainSearchEngine) - return engine - - -@pytest.fixture -def mock_chain_result(): - """Create mock ChainSearchResult with sample data.""" - return ChainSearchResult( - query="test query", - results=[ - SearchResult( - path="auth.py", - score=0.9, - excerpt="def authenticate(user):", - symbol_name="authenticate", - symbol_kind="function", - start_line=10, - end_line=20, - ), - SearchResult( - path="login.py", - score=0.7, - excerpt="class LoginHandler:", - symbol_name="LoginHandler", - symbol_kind="class", - start_line=5, - end_line=50, - ), - ], - symbols=[], - stats=SearchStats(), - ) - - -@pytest.fixture -def 
mock_options(): - """Create mock SearchOptions.""" - return SearchOptions( - hybrid_mode=True, - enable_vector=True, - enable_fuzzy=True, - ) - - -# ============================================================================= -# Tests: _execute_search strategy routing -# ============================================================================= - - -class TestExecuteSearchStrategyRouting: - """Tests for _execute_search() fusion strategy routing.""" - - def test_fusion_strategy_rrf(self, mock_engine, mock_chain_result, mock_options): - """Default 'rrf' strategy should call engine.search().""" - mock_engine.search.return_value = mock_chain_result - - result = _execute_search( - engine=mock_engine, - query="test", - source_path=Path("/project"), - fusion_strategy="rrf", - options=mock_options, - limit=20, - ) - - mock_engine.search.assert_called_once() - assert isinstance(result, ChainSearchResult) - - def test_fusion_strategy_binary(self, mock_engine, mock_chain_result, mock_options): - """'binary' strategy should call engine.binary_cascade_search().""" - mock_engine.binary_cascade_search.return_value = mock_chain_result - - result = _execute_search( - engine=mock_engine, - query="test", - source_path=Path("/project"), - fusion_strategy="binary", - options=mock_options, - limit=20, - ) - - mock_engine.binary_cascade_search.assert_called_once() - # Verify k and coarse_k parameters - call_kwargs = mock_engine.binary_cascade_search.call_args - assert call_kwargs[1]["k"] == 20 - assert call_kwargs[1]["coarse_k"] == 100 # limit * 5 - - def test_fusion_strategy_staged(self, mock_engine, mock_chain_result, mock_options): - """'staged' strategy should call engine.staged_cascade_search().""" - mock_engine.staged_cascade_search.return_value = mock_chain_result - - result = _execute_search( - engine=mock_engine, - query="test", - source_path=Path("/project"), - fusion_strategy="staged", - options=mock_options, - limit=20, - ) - - 
mock_engine.staged_cascade_search.assert_called_once() - - def test_fusion_strategy_hybrid_compat( - self, mock_engine, mock_chain_result, mock_options - ): - """'hybrid' strategy should map to binary_rerank_cascade_search (backward compat).""" - mock_engine.binary_rerank_cascade_search.return_value = mock_chain_result - - result = _execute_search( - engine=mock_engine, - query="test", - source_path=Path("/project"), - fusion_strategy="hybrid", - options=mock_options, - limit=20, - ) - - mock_engine.binary_rerank_cascade_search.assert_called_once() - - def test_fusion_strategy_dense_rerank( - self, mock_engine, mock_chain_result, mock_options - ): - """'dense_rerank' strategy should call engine.search() (default fallback).""" - # In the current implementation, dense_rerank is not explicitly handled, - # so it falls through to the default (rrf) branch - mock_engine.search.return_value = mock_chain_result - - result = _execute_search( - engine=mock_engine, - query="test", - source_path=Path("/project"), - fusion_strategy="dense_rerank", - options=mock_options, - limit=20, - ) - - # dense_rerank falls to default (else branch -> engine.search) - mock_engine.search.assert_called_once() - - -# ============================================================================= -# Tests: _transform_results -# ============================================================================= - - -class TestTransformResults: - """Tests for _transform_results().""" - - def test_transform_results_basic(self): - """_transform_results should convert SearchResult to SemanticResult.""" - results = [ - SearchResult( - path="auth.py", - score=0.9, - excerpt="def authenticate(user):", - symbol_name="authenticate", - symbol_kind="function", - start_line=10, - end_line=20, - ), - SearchResult( - path="models.py", - score=0.7, - excerpt="class UserModel:", - symbol_name="UserModel", - symbol_kind="class", - start_line=1, - end_line=30, - ), - ] - - semantic_results = _transform_results( - 
results=results, - mode="fusion", - vector_weight=0.5, - structural_weight=0.3, - keyword_weight=0.2, - kind_filter=None, - include_match_reason=False, - query="authentication", - ) - - assert len(semantic_results) == 2 - assert all(isinstance(r, SemanticResult) for r in semantic_results) - - # Check first result - first = semantic_results[0] - assert first.fusion_score == 0.9 - assert first.symbol_name == "authenticate" - assert first.kind == "function" - assert first.file_path == "auth.py" - assert first.line == 10 - - # Should be sorted by fusion_score descending - scores = [r.fusion_score for r in semantic_results] - assert scores == sorted(scores, reverse=True) - - def test_transform_results_kind_filter(self): - """_transform_results should filter by kind when kind_filter is set.""" - results = [ - SearchResult( - path="auth.py", - score=0.9, - excerpt="def auth():", - symbol_name="auth", - symbol_kind="function", - ), - SearchResult( - path="models.py", - score=0.8, - excerpt="class User:", - symbol_name="User", - symbol_kind="class", - ), - ] - - # Filter to only functions - semantic_results = _transform_results( - results=results, - mode="fusion", - vector_weight=0.5, - structural_weight=0.3, - keyword_weight=0.2, - kind_filter=["function"], - include_match_reason=False, - query="test", - ) - - assert len(semantic_results) == 1 - assert semantic_results[0].kind == "function" diff --git a/codex-lens/tests/api/test_semantic_search.py b/codex-lens/tests/api/test_semantic_search.py deleted file mode 100644 index 02720f88..00000000 --- a/codex-lens/tests/api/test_semantic_search.py +++ /dev/null @@ -1,530 +0,0 @@ -"""Tests for semantic_search API.""" -import tempfile -from pathlib import Path -from unittest.mock import MagicMock, patch - -import pytest - -from codexlens.api import SemanticResult -from codexlens.api.semantic import ( - semantic_search, - _build_search_options, - _generate_match_reason, - _split_camel_case, - _transform_results, -) - - -class 
TestSemanticSearchFunctionSignature: - """Test that semantic_search has the correct function signature.""" - - def test_function_accepts_all_parameters(self): - """Verify function signature matches spec.""" - import inspect - sig = inspect.signature(semantic_search) - params = list(sig.parameters.keys()) - - expected_params = [ - "project_root", - "query", - "mode", - "vector_weight", - "structural_weight", - "keyword_weight", - "fusion_strategy", - "staged_stage2_mode", - "kind_filter", - "limit", - "include_match_reason", - ] - - assert params == expected_params - - def test_default_parameter_values(self): - """Verify default parameter values match spec.""" - import inspect - sig = inspect.signature(semantic_search) - - assert sig.parameters["mode"].default == "fusion" - assert sig.parameters["vector_weight"].default == 0.5 - assert sig.parameters["structural_weight"].default == 0.3 - assert sig.parameters["keyword_weight"].default == 0.2 - assert sig.parameters["fusion_strategy"].default == "rrf" - assert sig.parameters["staged_stage2_mode"].default is None - assert sig.parameters["kind_filter"].default is None - assert sig.parameters["limit"].default == 20 - assert sig.parameters["include_match_reason"].default is False - - -class TestBuildSearchOptions: - """Test _build_search_options helper function.""" - - def test_vector_mode_options(self): - """Test options for pure vector mode.""" - options = _build_search_options( - mode="vector", - vector_weight=1.0, - structural_weight=0.0, - keyword_weight=0.0, - limit=20, - ) - - assert options.hybrid_mode is True - assert options.enable_vector is True - assert options.pure_vector is True - assert options.enable_fuzzy is False - - def test_structural_mode_options(self): - """Test options for structural mode.""" - options = _build_search_options( - mode="structural", - vector_weight=0.0, - structural_weight=1.0, - keyword_weight=0.0, - limit=20, - ) - - assert options.hybrid_mode is True - assert options.enable_vector 
is False - assert options.enable_fuzzy is True - assert options.include_symbols is True - - def test_fusion_mode_options(self): - """Test options for fusion mode (default).""" - options = _build_search_options( - mode="fusion", - vector_weight=0.5, - structural_weight=0.3, - keyword_weight=0.2, - limit=20, - ) - - assert options.hybrid_mode is True - assert options.enable_vector is True # vector_weight > 0 - assert options.enable_fuzzy is True # keyword_weight > 0 - assert options.include_symbols is True # structural_weight > 0 - - -class TestTransformResults: - """Test _transform_results helper function.""" - - def test_transforms_basic_result(self): - """Test basic result transformation.""" - mock_result = MagicMock() - mock_result.path = "/project/src/auth.py" - mock_result.score = 0.85 - mock_result.excerpt = "def authenticate():" - mock_result.symbol_name = "authenticate" - mock_result.symbol_kind = "function" - mock_result.start_line = 10 - mock_result.symbol = None - mock_result.metadata = {} - - results = _transform_results( - results=[mock_result], - mode="fusion", - vector_weight=0.5, - structural_weight=0.3, - keyword_weight=0.2, - kind_filter=None, - include_match_reason=False, - query="auth", - ) - - assert len(results) == 1 - assert results[0].symbol_name == "authenticate" - assert results[0].kind == "function" - assert results[0].file_path == "/project/src/auth.py" - assert results[0].line == 10 - assert results[0].fusion_score == 0.85 - - def test_kind_filter_excludes_non_matching(self): - """Test that kind_filter excludes non-matching results.""" - mock_result = MagicMock() - mock_result.path = "/project/src/auth.py" - mock_result.score = 0.85 - mock_result.excerpt = "AUTH_TOKEN = 'secret'" - mock_result.symbol_name = "AUTH_TOKEN" - mock_result.symbol_kind = "variable" - mock_result.start_line = 5 - mock_result.symbol = None - mock_result.metadata = {} - - results = _transform_results( - results=[mock_result], - mode="fusion", - vector_weight=0.5, 
- structural_weight=0.3, - keyword_weight=0.2, - kind_filter=["function", "class"], # Exclude variable - include_match_reason=False, - query="auth", - ) - - assert len(results) == 0 - - def test_kind_filter_includes_matching(self): - """Test that kind_filter includes matching results.""" - mock_result = MagicMock() - mock_result.path = "/project/src/auth.py" - mock_result.score = 0.85 - mock_result.excerpt = "class AuthManager:" - mock_result.symbol_name = "AuthManager" - mock_result.symbol_kind = "class" - mock_result.start_line = 1 - mock_result.symbol = None - mock_result.metadata = {} - - results = _transform_results( - results=[mock_result], - mode="fusion", - vector_weight=0.5, - structural_weight=0.3, - keyword_weight=0.2, - kind_filter=["function", "class"], # Include class - include_match_reason=False, - query="auth", - ) - - assert len(results) == 1 - assert results[0].symbol_name == "AuthManager" - - def test_include_match_reason_generates_reason(self): - """Test that include_match_reason generates match reasons.""" - mock_result = MagicMock() - mock_result.path = "/project/src/auth.py" - mock_result.score = 0.85 - mock_result.excerpt = "def authenticate(user, password):" - mock_result.symbol_name = "authenticate" - mock_result.symbol_kind = "function" - mock_result.start_line = 10 - mock_result.symbol = None - mock_result.metadata = {} - - results = _transform_results( - results=[mock_result], - mode="fusion", - vector_weight=0.5, - structural_weight=0.3, - keyword_weight=0.2, - kind_filter=None, - include_match_reason=True, - query="authenticate", - ) - - assert len(results) == 1 - assert results[0].match_reason is not None - assert "authenticate" in results[0].match_reason.lower() - - -class TestGenerateMatchReason: - """Test _generate_match_reason helper function.""" - - def test_direct_name_match(self): - """Test match reason for direct name match.""" - reason = _generate_match_reason( - query="authenticate", - symbol_name="authenticate", - 
symbol_kind="function", - snippet="def authenticate(user): pass", - vector_score=0.8, - structural_score=None, - ) - - assert "authenticate" in reason.lower() - - def test_keyword_match(self): - """Test match reason for keyword match in snippet.""" - reason = _generate_match_reason( - query="password validation", - symbol_name="verify_user", - symbol_kind="function", - snippet="def verify_user(password): validate(password)", - vector_score=0.6, - structural_score=None, - ) - - assert "password" in reason.lower() or "validation" in reason.lower() - - def test_high_semantic_similarity(self): - """Test match reason mentions semantic similarity for high vector score.""" - reason = _generate_match_reason( - query="authentication", - symbol_name="login_handler", - symbol_kind="function", - snippet="def login_handler(): pass", - vector_score=0.85, - structural_score=None, - ) - - assert "semantic" in reason.lower() - - def test_returns_string_even_with_no_matches(self): - """Test that a reason string is always returned.""" - reason = _generate_match_reason( - query="xyz123", - symbol_name="abc456", - symbol_kind="function", - snippet="completely unrelated code", - vector_score=0.3, - structural_score=None, - ) - - assert isinstance(reason, str) - assert len(reason) > 0 - - -class TestSplitCamelCase: - """Test _split_camel_case helper function.""" - - def test_camel_case(self): - """Test splitting camelCase.""" - result = _split_camel_case("authenticateUser") - assert "authenticate" in result.lower() - assert "user" in result.lower() - - def test_pascal_case(self): - """Test splitting PascalCase.""" - result = _split_camel_case("AuthManager") - assert "auth" in result.lower() - assert "manager" in result.lower() - - def test_snake_case(self): - """Test splitting snake_case.""" - result = _split_camel_case("auth_manager") - assert "auth" in result.lower() - assert "manager" in result.lower() - - def test_mixed_case(self): - """Test splitting mixed case.""" - result = 
_split_camel_case("HTTPRequestHandler") - # Should handle acronyms - assert "http" in result.lower() or "request" in result.lower() - - -class TestSemanticResultDataclass: - """Test SemanticResult dataclass structure.""" - - def test_semantic_result_fields(self): - """Test SemanticResult has all required fields.""" - result = SemanticResult( - symbol_name="test", - kind="function", - file_path="/test.py", - line=1, - vector_score=0.8, - structural_score=0.6, - fusion_score=0.7, - snippet="def test(): pass", - match_reason="Test match", - ) - - assert result.symbol_name == "test" - assert result.kind == "function" - assert result.file_path == "/test.py" - assert result.line == 1 - assert result.vector_score == 0.8 - assert result.structural_score == 0.6 - assert result.fusion_score == 0.7 - assert result.snippet == "def test(): pass" - assert result.match_reason == "Test match" - - def test_semantic_result_optional_fields(self): - """Test SemanticResult with optional None fields.""" - result = SemanticResult( - symbol_name="test", - kind="function", - file_path="/test.py", - line=1, - vector_score=None, # Degraded - no vector index - structural_score=None, # Degraded - no relationships - fusion_score=0.5, - snippet="def test(): pass", - match_reason=None, # Not requested - ) - - assert result.vector_score is None - assert result.structural_score is None - assert result.match_reason is None - - def test_semantic_result_to_dict(self): - """Test SemanticResult.to_dict() filters None values.""" - result = SemanticResult( - symbol_name="test", - kind="function", - file_path="/test.py", - line=1, - vector_score=None, - structural_score=0.6, - fusion_score=0.7, - snippet="def test(): pass", - match_reason=None, - ) - - d = result.to_dict() - - assert "symbol_name" in d - assert "vector_score" not in d # None values filtered - assert "structural_score" in d - assert "match_reason" not in d # None values filtered - - -class TestFusionStrategyMapping: - """Test 
fusion_strategy parameter mapping via _execute_search.""" - - def test_rrf_strategy_calls_search(self): - """Test that rrf strategy maps to standard search.""" - from codexlens.api.semantic import _execute_search - - mock_engine = MagicMock() - mock_engine.search.return_value = MagicMock(results=[]) - mock_options = MagicMock() - - _execute_search( - engine=mock_engine, - query="test query", - source_path=Path("/test"), - fusion_strategy="rrf", - options=mock_options, - limit=20, - ) - - mock_engine.search.assert_called_once() - - def test_staged_strategy_calls_staged_cascade_search(self): - """Test that staged strategy maps to staged_cascade_search.""" - from codexlens.api.semantic import _execute_search - - mock_engine = MagicMock() - mock_engine.staged_cascade_search.return_value = MagicMock(results=[]) - mock_options = MagicMock() - - _execute_search( - engine=mock_engine, - query="test query", - source_path=Path("/test"), - fusion_strategy="staged", - options=mock_options, - limit=20, - ) - - mock_engine.staged_cascade_search.assert_called_once() - - def test_binary_strategy_calls_binary_cascade_search(self): - """Test that binary strategy maps to binary_cascade_search.""" - from codexlens.api.semantic import _execute_search - - mock_engine = MagicMock() - mock_engine.binary_cascade_search.return_value = MagicMock(results=[]) - mock_options = MagicMock() - - _execute_search( - engine=mock_engine, - query="test query", - source_path=Path("/test"), - fusion_strategy="binary", - options=mock_options, - limit=20, - ) - - mock_engine.binary_cascade_search.assert_called_once() - - def test_hybrid_strategy_maps_to_binary_rerank(self): - """Test that hybrid strategy maps to binary_rerank_cascade_search (backward compat).""" - from codexlens.api.semantic import _execute_search - - mock_engine = MagicMock() - mock_engine.binary_rerank_cascade_search.return_value = MagicMock(results=[]) - mock_options = MagicMock() - - _execute_search( - engine=mock_engine, - query="test 
query", - source_path=Path("/test"), - fusion_strategy="hybrid", - options=mock_options, - limit=20, - ) - - mock_engine.binary_rerank_cascade_search.assert_called_once() - - def test_unknown_strategy_defaults_to_rrf(self): - """Test that unknown strategy defaults to standard search (rrf).""" - from codexlens.api.semantic import _execute_search - - mock_engine = MagicMock() - mock_engine.search.return_value = MagicMock(results=[]) - mock_options = MagicMock() - - _execute_search( - engine=mock_engine, - query="test query", - source_path=Path("/test"), - fusion_strategy="unknown_strategy", - options=mock_options, - limit=20, - ) - - mock_engine.search.assert_called_once() - - -class TestGracefulDegradation: - """Test graceful degradation behavior.""" - - def test_vector_score_none_when_no_vector_index(self): - """Test vector_score=None when vector index unavailable.""" - mock_result = MagicMock() - mock_result.path = "/project/src/auth.py" - mock_result.score = 0.5 - mock_result.excerpt = "def auth(): pass" - mock_result.symbol_name = "auth" - mock_result.symbol_kind = "function" - mock_result.start_line = 1 - mock_result.symbol = None - mock_result.metadata = {} # No vector score in metadata - - results = _transform_results( - results=[mock_result], - mode="fusion", - vector_weight=0.5, - structural_weight=0.3, - keyword_weight=0.2, - kind_filter=None, - include_match_reason=False, - query="auth", - ) - - assert len(results) == 1 - # When no source_scores in metadata, vector_score should be None - assert results[0].vector_score is None - - def test_structural_score_extracted_from_fts(self): - """Test structural_score extracted from FTS scores.""" - mock_result = MagicMock() - mock_result.path = "/project/src/auth.py" - mock_result.score = 0.8 - mock_result.excerpt = "def auth(): pass" - mock_result.symbol_name = "auth" - mock_result.symbol_kind = "function" - mock_result.start_line = 1 - mock_result.symbol = None - mock_result.metadata = { - "source_scores": { - 
"exact": 0.9, - "fuzzy": 0.7, - } - } - - results = _transform_results( - results=[mock_result], - mode="fusion", - vector_weight=0.5, - structural_weight=0.3, - keyword_weight=0.2, - kind_filter=None, - include_match_reason=False, - query="auth", - ) - - assert len(results) == 1 - assert results[0].structural_score == 0.9 # max of exact/fuzzy diff --git a/codex-lens/tests/conftest.py b/codex-lens/tests/conftest.py deleted file mode 100644 index 40915fff..00000000 --- a/codex-lens/tests/conftest.py +++ /dev/null @@ -1,291 +0,0 @@ -"""Pytest configuration and shared fixtures for codex-lens tests. - -This module provides common fixtures and test utilities to reduce code duplication -across the test suite. Using fixtures ensures consistent test setup and makes tests -more maintainable. - -Common Fixtures: -- temp_dir: Temporary directory for test files -- sample_index_db: Sample index database with test data -- mock_config: Mock configuration object -- sample_code_files: Factory for creating sample code files -""" - -import sqlite3 -import shutil -import tempfile -import warnings -from pathlib import Path -from typing import Any, Dict - -import pytest - -warnings.filterwarnings( - "ignore", - message=r"'BaseCommand' is deprecated and will be removed in Click 9\.0\..*", - category=DeprecationWarning, -) -warnings.filterwarnings( - "ignore", - message=r"The '__version__' attribute is deprecated and will be removed in Click 9\.1\..*", - category=DeprecationWarning, -) - - -@pytest.fixture -def temp_dir(): - """Create a temporary directory for test files. - - The directory is automatically cleaned up after the test. - - Yields: - Path: Path to the temporary directory. - """ - temp_path = Path(tempfile.mkdtemp()) - yield temp_path - # Cleanup - if temp_path.exists(): - shutil.rmtree(temp_path) - - -@pytest.fixture -def sample_index_db(temp_dir): - """Create a sample index database with test data. 
- - The database has a basic schema with files and chunks tables - populated with sample data. - - Args: - temp_dir: Temporary directory fixture. - - Yields: - Path: Path to the sample index database. - """ - db_path = temp_dir / "_index.db" - - # Create database with basic schema - conn = sqlite3.connect(db_path) - cursor = conn.cursor() - - # Files table - cursor.execute(""" - CREATE TABLE files ( - id INTEGER PRIMARY KEY, - path TEXT NOT NULL UNIQUE, - content TEXT, - language TEXT, - hash TEXT, - indexed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP - ) - """) - - # Insert sample files - sample_files = [ - ("test.py", "def hello():\n print('world')", "python", "hash1"), - ("test.js", "function hello() { console.log('world'); }", "javascript", "hash2"), - ("README.md", "# Test Project", "markdown", "hash3"), - ] - cursor.executemany( - "INSERT INTO files (path, content, language, hash) VALUES (?, ?, ?, ?)", - sample_files - ) - - conn.commit() - conn.close() - - yield db_path - - -@pytest.fixture -def mock_config(): - """Create a mock configuration object with default values. - - Returns: - Mock: Mock object with common config attributes. - """ - from unittest.mock import Mock - - config = Mock() - config.index_path = Path("/tmp/test_index") - config.chunk_size = 2000 - config.overlap = 200 - config.embedding_backend = "fastembed" - config.embedding_model = "code" - config.max_results = 10 - - return config - - -@pytest.fixture -def sample_code_factory(temp_dir): - """Factory for creating sample code files. - - Args: - temp_dir: Temporary directory fixture. - - Returns: - callable: Function that creates sample code files. - """ - def _create_file(filename: str, content: str, language: str = "python") -> Path: - """Create a sample code file. - - Args: - filename: Name of the file to create. - content: Content of the file. - language: Programming language (default: python). - - Returns: - Path: Path to the created file. 
- """ - file_path = temp_dir / filename - file_path.parent.mkdir(parents=True, exist_ok=True) - file_path.write_text(content) - return file_path - - return _create_file - - -@pytest.fixture -def sample_python_code(): - """Sample Python code for testing. - - Returns: - str: Sample Python code snippet. - """ - return ''' -def calculate_sum(a: int, b: int) -> int: - """Calculate the sum of two integers.""" - return a + b - -class Calculator: - """A simple calculator class.""" - - def __init__(self): - self.value = 0 - - def add(self, x: int) -> None: - """Add a value to the calculator.""" - self.value += x - -if __name__ == "__main__": - calc = Calculator() - calc.add(5) - print(f"Result: {calc.value}") -''' - - -@pytest.fixture -def sample_javascript_code(): - """Sample JavaScript code for testing. - - Returns: - str: Sample JavaScript code snippet. - """ - return ''' -// Simple utility functions -function add(a, b) { - return a + b; -} - -const Calculator = class { - constructor() { - this.value = 0; - } - - add(x) { - this.value += x; - } -}; - -// Example usage -const calc = new Calculator(); -calc.add(5); -console.log(`Result: ${calc.value}`); -''' - - -class CodeSampleFactory: - """Factory class for generating various code samples. - - This class provides methods to generate code samples in different - languages with various patterns (classes, functions, imports, etc.). - """ - - @staticmethod - def python_function(name: str = "example", docstring: bool = True) -> str: - """Generate a Python function sample. - - Args: - name: Function name. - docstring: Whether to include docstring. - - Returns: - str: Python function code. - """ - doc = f' """Example function."""\n' if docstring else '' - return f''' -def {name}(param1: str, param2: int = 10) -> str: -{doc} return param1 * param2 -'''.strip() - - @staticmethod - def python_class(name: str = "Example") -> str: - """Generate a Python class sample. - - Args: - name: Class name. 
- - Returns: - str: Python class code. - """ - return f''' -class {name}: - """Example class.""" - - def __init__(self, value: int = 0): - self.value = value - - def increment(self) -> None: - """Increment the value.""" - self.value += 1 -'''.strip() - - @staticmethod - def javascript_function(name: str = "example") -> str: - """Generate a JavaScript function sample. - - Args: - name: Function name. - - Returns: - str: JavaScript function code. - """ - return f'''function {name}(param1, param2 = 10) {{ - return param1 * param2; -}}'''.strip() - - @staticmethod - def typescript_interface(name: str = "Example") -> str: - """Generate a TypeScript interface sample. - - Args: - name: Interface name. - - Returns: - str: TypeScript interface code. - """ - return f'''interface {name} {{ - id: number; - name: string; - getValue(): number; -}}'''.strip() - - -@pytest.fixture -def code_sample_factory(): - """Create a code sample factory instance. - - Returns: - CodeSampleFactory: Factory for generating code samples. 
- """ - return CodeSampleFactory() diff --git a/codex-lens/tests/fix_sql.py b/codex-lens/tests/fix_sql.py deleted file mode 100644 index 55e66fa8..00000000 --- a/codex-lens/tests/fix_sql.py +++ /dev/null @@ -1,84 +0,0 @@ -#!/usr/bin/env python3 -"""Fix SQL statements in test files to match new schema.""" -import re -from pathlib import Path - -def fix_insert_statement(line): - """Fix INSERT statements to provide both name and full_path.""" - # Match pattern: (test_path, test_content, "python") - # or ("test/file1.py", "content1", "python") - pattern = r'\(([^,]+),\s*([^,]+),\s*([^)]+)\)' - - def replace_values(match): - path_var, content_var, lang_var = match.groups() - # If it's a variable, we need to extract name from it - # For now, use path_var for both name and full_path - return f'({path_var}.split("/")[-1] if "/" in {path_var} else {path_var}, {path_var}, {content_var}, {lang_var}, 1234567890.0)' - - # Check if this is an INSERT VALUES line - if 'INSERT INTO files' in line and 'VALUES' in line: - # Simple string values like ("test/file1.py", "content1", "python") - if re.search(r'\("[^"]+",\s*"[^"]+",\s*"[^"]+"\)', line): - def replace_str_values(match): - parts = match.group(0)[1:-1].split('", "') - if len(parts) == 3: - path = parts[0].strip('"') - content = parts[1] - lang = parts[2].strip('"') - name = path.split('/')[-1] - return f'("{name}", "{path}", "{content}", "{lang}", 1234567890.0)' - return match.group(0) - - line = re.sub(r'\("[^"]+",\s*"[^"]+",\s*"[^"]+"\)', replace_str_values, line) - - return line - -def main(): - test_files = [ - Path("test_dual_fts.py"), - Path("test_incremental_indexing.py"), - Path("test_hybrid_search_e2e.py") - ] - - for test_file in test_files: - if not test_file.exists(): - continue - - lines = test_file.read_text(encoding='utf-8').splitlines(keepends=True) - - # Fix tuple values in execute calls - new_lines = [] - i = 0 - while i < len(lines): - line = lines[i] - - # Check if this is an execute with VALUES and tuple 
on next line - if 'conn.execute(' in line or 'conn.executemany(' in line: - # Look ahead for VALUES pattern - if i + 2 < len(lines) and 'VALUES' in lines[i+1]: - # Check for tuple pattern on line after VALUES - if i + 2 < len(lines) and re.search(r'^\s*\([^)]+\)\s*$', lines[i+2]): - tuple_line = lines[i+2] - # Extract values: (test_path, test_content, "python") - match = re.search(r'\(([^,]+),\s*([^,]+),\s*"([^"]+)"\)', tuple_line) - if match: - var1, var2, var3 = match.groups() - var1 = var1.strip() - var2 = var2.strip() - # Create new tuple with name extraction - indent = re.match(r'^(\s*)', tuple_line).group(1) - new_tuple = f'{indent}({var1}.split("/")[-1], {var1}, {var2}, "{var3}", 1234567890.0)\n' - new_lines.append(line) - new_lines.append(lines[i+1]) - new_lines.append(new_tuple) - i += 3 - continue - - new_lines.append(line) - i += 1 - - test_file.write_text(''.join(new_lines), encoding='utf-8') - print(f"Fixed {test_file}") - -if __name__ == "__main__": - main() diff --git a/codex-lens/tests/integration/__init__.py b/codex-lens/tests/integration/__init__.py deleted file mode 100644 index 35c99c66..00000000 --- a/codex-lens/tests/integration/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Integration tests for CodexLens.""" diff --git a/codex-lens/tests/integration/test_lsp_search_integration.py b/codex-lens/tests/integration/test_lsp_search_integration.py deleted file mode 100644 index f6b68bc0..00000000 --- a/codex-lens/tests/integration/test_lsp_search_integration.py +++ /dev/null @@ -1,583 +0,0 @@ -"""Integration tests for HybridSearchEngine LSP graph search. - -Tests the _search_lsp_graph method which orchestrates: -1. Seed retrieval via vector/exact fallback chain -2. LSP graph expansion via LspBridge and LspGraphBuilder -3. 
Result deduplication and merging - -Test Priority: -- P0: Critical path tests (e2e success, fallback chain) -- P1: Important edge cases (no seeds, bridge failures) -- P2: Supplementary tests (deduplication) -""" - -from __future__ import annotations - -import asyncio -import logging -import tempfile -from pathlib import Path -from typing import Any, Dict, List, Optional -from unittest.mock import AsyncMock, MagicMock, patch - -import pytest - -from codexlens.entities import SearchResult -from codexlens.hybrid_search.data_structures import ( - CallHierarchyItem, - CodeAssociationGraph, - CodeSymbolNode, - Range, -) -from codexlens.search.hybrid_search import HybridSearchEngine - - -# ----------------------------------------------------------------------------- -# Fixtures -# ----------------------------------------------------------------------------- - - -@pytest.fixture -def tmp_index_path(tmp_path: Path) -> Path: - """Create a temporary index database path.""" - db_path = tmp_path / "_index.db" - # Create empty file to satisfy existence checks - db_path.write_bytes(b"") - return db_path - - -@pytest.fixture -def sample_search_result() -> SearchResult: - """Create a sample SearchResult for use as seed.""" - return SearchResult( - path="/path/to/file.py", - content="def auth_flow(): ...", - excerpt="def auth_flow(): ...", - start_line=10, - end_line=20, - symbol_name="auth_flow", - symbol_kind="function", - score=0.9, - ) - - -@pytest.fixture -def sample_search_result_2() -> SearchResult: - """Create a second sample SearchResult.""" - return SearchResult( - path="/path/to/other.py", - content="def init_db(): ...", - excerpt="def init_db(): ...", - start_line=5, - end_line=15, - symbol_name="init_db", - symbol_kind="function", - score=0.85, - ) - - -@pytest.fixture -def sample_code_symbol_node() -> CodeSymbolNode: - """Create a sample CodeSymbolNode for graph expansion.""" - return CodeSymbolNode( - id="/path/to/related.py:helper_func:30", - name="helper_func", - 
kind="function", - file_path="/path/to/related.py", - range=Range( - start_line=30, - start_character=0, - end_line=40, - end_character=0, - ), - raw_code="def helper_func(): pass", - docstring="Helper function", - ) - - -@pytest.fixture -def sample_code_symbol_node_2() -> CodeSymbolNode: - """Create another sample CodeSymbolNode.""" - return CodeSymbolNode( - id="/path/to/util.py:validate:50", - name="validate", - kind="function", - file_path="/path/to/util.py", - range=Range( - start_line=50, - start_character=0, - end_line=60, - end_character=0, - ), - raw_code="def validate(): pass", - docstring="Validation function", - ) - - -@pytest.fixture -def mock_search_engine() -> HybridSearchEngine: - """Create a HybridSearchEngine with default settings.""" - return HybridSearchEngine() - - -def create_mock_graph_with_seed_and_related( - seed_result: SearchResult, - related_nodes: List[CodeSymbolNode], -) -> CodeAssociationGraph: - """Helper to create a mock graph with seed and related nodes.""" - graph = CodeAssociationGraph() - - # Add seed node - seed_node_id = f"{seed_result.path}:{seed_result.symbol_name or 'unknown'}:{seed_result.start_line or 0}" - seed_node = CodeSymbolNode( - id=seed_node_id, - name=seed_result.symbol_name or "unknown", - kind=seed_result.symbol_kind or "unknown", - file_path=seed_result.path, - range=Range( - start_line=seed_result.start_line or 1, - start_character=0, - end_line=seed_result.end_line or 1, - end_character=0, - ), - ) - graph.add_node(seed_node) - - # Add related nodes - for node in related_nodes: - graph.add_node(node) - - return graph - - -# ----------------------------------------------------------------------------- -# P0: Critical Tests -# ----------------------------------------------------------------------------- - - -class TestP0CriticalLspSearch: - """P0 Critical: Core E2E tests for LSP graph search.""" - - def test_e2e_lsp_search_vector_seed_success( - self, - tmp_index_path: Path, - sample_search_result: 
SearchResult, - sample_code_symbol_node: CodeSymbolNode, - sample_code_symbol_node_2: CodeSymbolNode, - ) -> None: - """Test E2E LSP search with vector providing seed, returning graph-expanded results. - - Input: query="authentication flow" - Mock: _search_vector returns 1 SearchResult as seed - Mock: LspBridge/LspGraphBuilder returns 2 related symbols - Assert: Returns 2 new results (seed is filtered from final results) - """ - engine = HybridSearchEngine() - - # Create mock graph with seed and 2 related nodes - mock_graph = create_mock_graph_with_seed_and_related( - sample_search_result, - [sample_code_symbol_node, sample_code_symbol_node_2], - ) - - # Patch seed search methods - with patch.object( - engine, "_search_vector", return_value=[sample_search_result] - ) as mock_vector, patch.object( - engine, "_search_exact", return_value=[] - ): - # Patch LSP module at the import location - with patch.dict("sys.modules", {"codexlens.lsp": MagicMock()}): - # Patch the module-level HAS_LSP check - with patch("codexlens.search.hybrid_search.HAS_LSP", True): - # Create mock LspBridge class - mock_bridge_instance = AsyncMock() - mock_bridge_class = MagicMock() - mock_bridge_class.return_value.__aenter__ = AsyncMock( - return_value=mock_bridge_instance - ) - mock_bridge_class.return_value.__aexit__ = AsyncMock( - return_value=None - ) - - # Create mock LspGraphBuilder - async def mock_build(seeds, bridge): - return mock_graph - - mock_builder_instance = MagicMock() - mock_builder_instance.build_from_seeds = mock_build - mock_builder_class = MagicMock(return_value=mock_builder_instance) - - # Patch at module level - with patch( - "codexlens.search.hybrid_search.LspBridge", - mock_bridge_class, - ), patch( - "codexlens.search.hybrid_search.LspGraphBuilder", - mock_builder_class, - ): - results = engine._search_lsp_graph( - index_path=tmp_index_path, - query="authentication flow", - limit=10, - max_depth=1, - max_nodes=20, - ) - - # Verify vector search was called first - 
mock_vector.assert_called_once() - - # Should return 2 results (the two non-seed nodes) - assert len(results) == 2 - - # Verify seed is not in results - seed_node_id = f"{sample_search_result.path}:{sample_search_result.symbol_name or 'unknown'}:{sample_search_result.start_line or 0}" - result_node_ids = { - f"{r.path}:{r.symbol_name or 'unknown'}:{r.start_line or 0}" - for r in results - } - assert seed_node_id not in result_node_ids - - # Verify the returned results are the graph-expanded nodes - result_paths = {r.path for r in results} - assert sample_code_symbol_node.file_path in result_paths - assert sample_code_symbol_node_2.file_path in result_paths - - def test_seed_fallback_chain_vector_fails_fts_succeeds( - self, - tmp_index_path: Path, - sample_search_result: SearchResult, - sample_code_symbol_node: CodeSymbolNode, - ) -> None: - """Test seed fallback chain: vector -> exact. - - Input: query="init_db" - Mock: _search_vector returns [] - Mock: _search_exact returns 1 seed - Assert: Fallback chain called in order, uses exact's seed - """ - engine = HybridSearchEngine() - - call_order: List[str] = [] - - def track_vector(*args, **kwargs): - call_order.append("vector") - return [] - - def track_exact(*args, **kwargs): - call_order.append("exact") - return [sample_search_result] - - # Create mock graph - mock_graph = create_mock_graph_with_seed_and_related( - sample_search_result, - [sample_code_symbol_node], - ) - - with patch.object( - engine, "_search_vector", side_effect=track_vector - ) as mock_vector, patch.object( - engine, "_search_exact", side_effect=track_exact - ) as mock_exact: - with patch("codexlens.search.hybrid_search.HAS_LSP", True): - # Create mock LspBridge class - mock_bridge_instance = AsyncMock() - mock_bridge_class = MagicMock() - mock_bridge_class.return_value.__aenter__ = AsyncMock( - return_value=mock_bridge_instance - ) - mock_bridge_class.return_value.__aexit__ = AsyncMock( - return_value=None - ) - - # Create mock LspGraphBuilder 
- async def mock_build(seeds, bridge): - return mock_graph - - mock_builder_instance = MagicMock() - mock_builder_instance.build_from_seeds = mock_build - mock_builder_class = MagicMock(return_value=mock_builder_instance) - - with patch( - "codexlens.search.hybrid_search.LspBridge", - mock_bridge_class, - ), patch( - "codexlens.search.hybrid_search.LspGraphBuilder", - mock_builder_class, - ): - results = engine._search_lsp_graph( - index_path=tmp_index_path, - query="init_db", - limit=10, - max_depth=1, - max_nodes=20, - ) - - # Verify fallback chain order: vector -> exact - assert call_order == ["vector", "exact"] - - # Both methods should be called - mock_vector.assert_called_once() - mock_exact.assert_called_once() - - # Should return results from graph expansion (1 related node) - assert len(results) == 1 - - -# ----------------------------------------------------------------------------- -# P1: Important Tests -# ----------------------------------------------------------------------------- - - -class TestP1ImportantLspSearch: - """P1 Important: Edge case tests for LSP graph search.""" - - def test_e2e_lsp_search_no_seeds_found( - self, - tmp_index_path: Path, - ) -> None: - """Test LSP search when no seeds found from any source. 
- - Input: query="non_existent_symbol" - Mock: All seed search methods return [] - Assert: Returns [], LspBridge is not called - """ - engine = HybridSearchEngine() - - with patch.object( - engine, "_search_vector", return_value=[] - ) as mock_vector, patch.object( - engine, "_search_exact", return_value=[] - ) as mock_exact: - with patch("codexlens.search.hybrid_search.HAS_LSP", True): - # LspBridge should NOT be called when no seeds - mock_bridge_class = MagicMock() - - with patch( - "codexlens.search.hybrid_search.LspBridge", - mock_bridge_class, - ): - results = engine._search_lsp_graph( - index_path=tmp_index_path, - query="non_existent_symbol", - limit=10, - max_depth=1, - max_nodes=20, - ) - - # All search methods should be tried - mock_vector.assert_called_once() - mock_exact.assert_called_once() - - # Should return empty list - assert results == [] - - # LspBridge should not be instantiated (no seeds) - mock_bridge_class.assert_not_called() - - def test_e2e_lsp_search_bridge_fails( - self, - tmp_index_path: Path, - sample_search_result: SearchResult, - caplog: pytest.LogCaptureFixture, - ) -> None: - """Test graceful degradation when LspBridge connection fails. 
- - Mock: Seed search returns valid seed - Mock: LspBridge raises exception during expansion - Assert: Returns [], error handled gracefully - """ - engine = HybridSearchEngine() - - with patch.object( - engine, "_search_vector", return_value=[sample_search_result] - ): - with patch("codexlens.search.hybrid_search.HAS_LSP", True): - # Make LspBridge raise an error during async context - mock_bridge_class = MagicMock() - mock_bridge_class.return_value.__aenter__ = AsyncMock( - side_effect=Exception("Connection refused") - ) - mock_bridge_class.return_value.__aexit__ = AsyncMock( - return_value=None - ) - - mock_builder_class = MagicMock() - - with patch( - "codexlens.search.hybrid_search.LspBridge", - mock_bridge_class, - ), patch( - "codexlens.search.hybrid_search.LspGraphBuilder", - mock_builder_class, - ): - with caplog.at_level(logging.DEBUG): - results = engine._search_lsp_graph( - index_path=tmp_index_path, - query="authentication", - limit=10, - max_depth=1, - max_nodes=20, - ) - - # Should return empty list on failure - assert results == [] - - -# ----------------------------------------------------------------------------- -# P2: Supplementary Tests -# ----------------------------------------------------------------------------- - - -class TestP2SupplementaryLspSearch: - """P2 Supplementary: Deduplication and edge cases.""" - - def test_result_deduping_seed_not_returned( - self, - tmp_index_path: Path, - sample_search_result: SearchResult, - ) -> None: - """Test that seed results are deduplicated from final output. 
- - Mock: Seed search returns SearchResult(path="a.py", symbol_name="foo") - Mock: LspBridge also returns same symbol in graph - Assert: Final results do not contain duplicate seed symbol - """ - engine = HybridSearchEngine() - - # Create a different node that should be returned - different_node = CodeSymbolNode( - id="/different/path.py:other_func:100", - name="other_func", - kind="function", - file_path="/different/path.py", - range=Range( - start_line=100, - start_character=0, - end_line=110, - end_character=0, - ), - raw_code="def other_func(): pass", - docstring="Other function", - ) - - # Create mock graph with seed and one different node - mock_graph = create_mock_graph_with_seed_and_related( - sample_search_result, - [different_node], - ) - - with patch.object( - engine, "_search_vector", return_value=[sample_search_result] - ): - with patch("codexlens.search.hybrid_search.HAS_LSP", True): - mock_bridge_instance = AsyncMock() - mock_bridge_class = MagicMock() - mock_bridge_class.return_value.__aenter__ = AsyncMock( - return_value=mock_bridge_instance - ) - mock_bridge_class.return_value.__aexit__ = AsyncMock( - return_value=None - ) - - async def mock_build(seeds, bridge): - return mock_graph - - mock_builder_instance = MagicMock() - mock_builder_instance.build_from_seeds = mock_build - mock_builder_class = MagicMock(return_value=mock_builder_instance) - - with patch( - "codexlens.search.hybrid_search.LspBridge", - mock_bridge_class, - ), patch( - "codexlens.search.hybrid_search.LspGraphBuilder", - mock_builder_class, - ): - results = engine._search_lsp_graph( - index_path=tmp_index_path, - query="test query", - limit=10, - max_depth=1, - max_nodes=20, - ) - - # Should only return 1 result (the different node, not the seed) - assert len(results) == 1 - - # The seed should NOT be in results - result_paths = [r.path for r in results] - assert sample_search_result.path not in result_paths - - # The different node should be in results - assert 
"/different/path.py" in result_paths - - def test_lsp_not_available_returns_empty( - self, - tmp_index_path: Path, - ) -> None: - """Test that _search_lsp_graph returns [] when LSP dependencies unavailable.""" - engine = HybridSearchEngine() - - with patch("codexlens.search.hybrid_search.HAS_LSP", False): - results = engine._search_lsp_graph( - index_path=tmp_index_path, - query="test", - limit=10, - max_depth=1, - max_nodes=20, - ) - - assert results == [] - - def test_graph_with_no_new_nodes_returns_empty( - self, - tmp_index_path: Path, - sample_search_result: SearchResult, - ) -> None: - """Test when graph only contains seed nodes (no expansion).""" - engine = HybridSearchEngine() - - # Create graph with ONLY the seed node (no related nodes) - mock_graph = create_mock_graph_with_seed_and_related( - sample_search_result, - [], # No related nodes - ) - - with patch.object( - engine, "_search_vector", return_value=[sample_search_result] - ): - with patch("codexlens.search.hybrid_search.HAS_LSP", True): - mock_bridge_instance = AsyncMock() - mock_bridge_class = MagicMock() - mock_bridge_class.return_value.__aenter__ = AsyncMock( - return_value=mock_bridge_instance - ) - mock_bridge_class.return_value.__aexit__ = AsyncMock( - return_value=None - ) - - async def mock_build(seeds, bridge): - return mock_graph - - mock_builder_instance = MagicMock() - mock_builder_instance.build_from_seeds = mock_build - mock_builder_class = MagicMock(return_value=mock_builder_instance) - - with patch( - "codexlens.search.hybrid_search.LspBridge", - mock_bridge_class, - ), patch( - "codexlens.search.hybrid_search.LspGraphBuilder", - mock_builder_class, - ): - results = engine._search_lsp_graph( - index_path=tmp_index_path, - query="test", - limit=10, - max_depth=1, - max_nodes=20, - ) - - # Should return empty since all nodes are seeds (filtered out) - assert results == [] diff --git a/codex-lens/tests/lsp/__init__.py b/codex-lens/tests/lsp/__init__.py deleted file mode 100644 index 
5366a486..00000000 --- a/codex-lens/tests/lsp/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Tests package for LSP module.""" diff --git a/codex-lens/tests/lsp/test_hover.py b/codex-lens/tests/lsp/test_hover.py deleted file mode 100644 index 4d77a043..00000000 --- a/codex-lens/tests/lsp/test_hover.py +++ /dev/null @@ -1,477 +0,0 @@ -"""Tests for hover provider.""" - -from __future__ import annotations - -import pytest -from pathlib import Path -from unittest.mock import Mock, MagicMock -import tempfile - -from codexlens.entities import Symbol - - -class TestHoverInfo: - """Test HoverInfo dataclass.""" - - def test_hover_info_import(self): - """HoverInfo can be imported.""" - pytest.importorskip("pygls") - pytest.importorskip("lsprotocol") - - from codexlens.lsp.providers import HoverInfo - - assert HoverInfo is not None - - def test_hover_info_fields(self): - """HoverInfo has all required fields.""" - pytest.importorskip("pygls") - - from codexlens.lsp.providers import HoverInfo - - info = HoverInfo( - name="my_function", - kind="function", - signature="def my_function(x: int) -> str:", - documentation="A test function.", - file_path="/test/file.py", - line_range=(10, 15), - ) - assert info.name == "my_function" - assert info.kind == "function" - assert info.signature == "def my_function(x: int) -> str:" - assert info.documentation == "A test function." 
- assert info.file_path == "/test/file.py" - assert info.line_range == (10, 15) - - def test_hover_info_optional_documentation(self): - """Documentation can be None.""" - pytest.importorskip("pygls") - - from codexlens.lsp.providers import HoverInfo - - info = HoverInfo( - name="func", - kind="function", - signature="def func():", - documentation=None, - file_path="/test.py", - line_range=(1, 2), - ) - assert info.documentation is None - - -class TestHoverProvider: - """Test HoverProvider class.""" - - def test_provider_import(self): - """HoverProvider can be imported.""" - pytest.importorskip("pygls") - - from codexlens.lsp.providers import HoverProvider - - assert HoverProvider is not None - - def test_returns_none_for_unknown_symbol(self): - """Returns None when symbol not found.""" - pytest.importorskip("pygls") - - from codexlens.lsp.providers import HoverProvider - - mock_index = Mock() - mock_index.search.return_value = [] - mock_registry = Mock() - - provider = HoverProvider(mock_index, mock_registry) - result = provider.get_hover_info("unknown_symbol") - - assert result is None - mock_index.search.assert_called_once_with( - name="unknown_symbol", limit=1, prefix_mode=False - ) - - def test_returns_none_for_non_exact_match(self): - """Returns None when search returns non-exact matches.""" - pytest.importorskip("pygls") - - from codexlens.lsp.providers import HoverProvider - - # Return a symbol with different name (prefix match but not exact) - mock_symbol = Mock() - mock_symbol.name = "my_function_extended" - mock_symbol.kind = "function" - mock_symbol.file = "/test/file.py" - mock_symbol.range = (10, 15) - - mock_index = Mock() - mock_index.search.return_value = [mock_symbol] - mock_registry = Mock() - - provider = HoverProvider(mock_index, mock_registry) - result = provider.get_hover_info("my_function") - - assert result is None - - def test_returns_hover_info_for_known_symbol(self): - """Returns HoverInfo for found symbol.""" - 
pytest.importorskip("pygls") - - from codexlens.lsp.providers import HoverProvider - - mock_symbol = Mock() - mock_symbol.name = "my_func" - mock_symbol.kind = "function" - mock_symbol.file = None # No file, will use fallback signature - mock_symbol.range = (10, 15) - - mock_index = Mock() - mock_index.search.return_value = [mock_symbol] - mock_registry = Mock() - - provider = HoverProvider(mock_index, mock_registry) - result = provider.get_hover_info("my_func") - - assert result is not None - assert result.name == "my_func" - assert result.kind == "function" - assert result.line_range == (10, 15) - assert result.signature == "function my_func" - - def test_extracts_signature_from_file(self): - """Extracts signature from actual file content.""" - pytest.importorskip("pygls") - - from codexlens.lsp.providers import HoverProvider - - # Create a temporary file with Python content - with tempfile.NamedTemporaryFile( - mode="w", suffix=".py", delete=False, encoding="utf-8" - ) as f: - f.write("# comment\n") - f.write("def test_function(x: int, y: str) -> bool:\n") - f.write(" return True\n") - temp_path = f.name - - try: - mock_symbol = Mock() - mock_symbol.name = "test_function" - mock_symbol.kind = "function" - mock_symbol.file = temp_path - mock_symbol.range = (2, 3) # Line 2 (1-based) - - mock_index = Mock() - mock_index.search.return_value = [mock_symbol] - - provider = HoverProvider(mock_index, None) - result = provider.get_hover_info("test_function") - - assert result is not None - assert "def test_function(x: int, y: str) -> bool:" in result.signature - finally: - Path(temp_path).unlink(missing_ok=True) - - def test_extracts_multiline_signature(self): - """Extracts multiline function signature.""" - pytest.importorskip("pygls") - - from codexlens.lsp.providers import HoverProvider - - # Create a temporary file with multiline signature - with tempfile.NamedTemporaryFile( - mode="w", suffix=".py", delete=False, encoding="utf-8" - ) as f: - f.write("def 
complex_function(\n") - f.write(" arg1: int,\n") - f.write(" arg2: str,\n") - f.write(") -> bool:\n") - f.write(" return True\n") - temp_path = f.name - - try: - mock_symbol = Mock() - mock_symbol.name = "complex_function" - mock_symbol.kind = "function" - mock_symbol.file = temp_path - mock_symbol.range = (1, 5) # Line 1 (1-based) - - mock_index = Mock() - mock_index.search.return_value = [mock_symbol] - - provider = HoverProvider(mock_index, None) - result = provider.get_hover_info("complex_function") - - assert result is not None - assert "def complex_function(" in result.signature - # Should capture multiline signature - assert "arg1: int" in result.signature - finally: - Path(temp_path).unlink(missing_ok=True) - - def test_handles_nonexistent_file_gracefully(self): - """Returns fallback signature when file doesn't exist.""" - pytest.importorskip("pygls") - - from codexlens.lsp.providers import HoverProvider - - mock_symbol = Mock() - mock_symbol.name = "my_func" - mock_symbol.kind = "function" - mock_symbol.file = "/nonexistent/path/file.py" - mock_symbol.range = (10, 15) - - mock_index = Mock() - mock_index.search.return_value = [mock_symbol] - - provider = HoverProvider(mock_index, None) - result = provider.get_hover_info("my_func") - - assert result is not None - assert result.signature == "function my_func" - - def test_handles_invalid_line_range(self): - """Returns fallback signature when line range is invalid.""" - pytest.importorskip("pygls") - - from codexlens.lsp.providers import HoverProvider - - with tempfile.NamedTemporaryFile( - mode="w", suffix=".py", delete=False, encoding="utf-8" - ) as f: - f.write("def test():\n") - f.write(" pass\n") - temp_path = f.name - - try: - mock_symbol = Mock() - mock_symbol.name = "test" - mock_symbol.kind = "function" - mock_symbol.file = temp_path - mock_symbol.range = (100, 105) # Line beyond file length - - mock_index = Mock() - mock_index.search.return_value = [mock_symbol] - - provider = 
HoverProvider(mock_index, None) - result = provider.get_hover_info("test") - - assert result is not None - assert result.signature == "function test" - finally: - Path(temp_path).unlink(missing_ok=True) - - -class TestFormatHoverMarkdown: - """Test markdown formatting.""" - - def test_format_python_signature(self): - """Formats Python signature with python code fence.""" - pytest.importorskip("pygls") - - from codexlens.lsp.providers import HoverInfo, HoverProvider - - info = HoverInfo( - name="func", - kind="function", - signature="def func(x: int) -> str:", - documentation=None, - file_path="/test/file.py", - line_range=(10, 15), - ) - mock_index = Mock() - provider = HoverProvider(mock_index, None) - - result = provider.format_hover_markdown(info) - - assert "```python" in result - assert "def func(x: int) -> str:" in result - assert "function" in result - assert "file.py" in result - assert "line 10" in result - - def test_format_javascript_signature(self): - """Formats JavaScript signature with javascript code fence.""" - pytest.importorskip("pygls") - - from codexlens.lsp.providers import HoverInfo, HoverProvider - - info = HoverInfo( - name="myFunc", - kind="function", - signature="function myFunc(x) {", - documentation=None, - file_path="/test/file.js", - line_range=(5, 10), - ) - mock_index = Mock() - provider = HoverProvider(mock_index, None) - - result = provider.format_hover_markdown(info) - - assert "```javascript" in result - assert "function myFunc(x) {" in result - - def test_format_typescript_signature(self): - """Formats TypeScript signature with typescript code fence.""" - pytest.importorskip("pygls") - - from codexlens.lsp.providers import HoverInfo, HoverProvider - - info = HoverInfo( - name="myFunc", - kind="function", - signature="function myFunc(x: number): string {", - documentation=None, - file_path="/test/file.ts", - line_range=(5, 10), - ) - mock_index = Mock() - provider = HoverProvider(mock_index, None) - - result = 
provider.format_hover_markdown(info) - - assert "```typescript" in result - - def test_format_with_documentation(self): - """Includes documentation when available.""" - pytest.importorskip("pygls") - - from codexlens.lsp.providers import HoverInfo, HoverProvider - - info = HoverInfo( - name="func", - kind="function", - signature="def func():", - documentation="This is a test function.", - file_path="/test/file.py", - line_range=(10, 15), - ) - mock_index = Mock() - provider = HoverProvider(mock_index, None) - - result = provider.format_hover_markdown(info) - - assert "This is a test function." in result - assert "---" in result # Separator before docs - - def test_format_without_documentation(self): - """Does not include documentation section when None.""" - pytest.importorskip("pygls") - - from codexlens.lsp.providers import HoverInfo, HoverProvider - - info = HoverInfo( - name="func", - kind="function", - signature="def func():", - documentation=None, - file_path="/test/file.py", - line_range=(10, 15), - ) - mock_index = Mock() - provider = HoverProvider(mock_index, None) - - result = provider.format_hover_markdown(info) - - # Should have one separator for location, not two - # The result should not have duplicate doc separator - lines = result.split("\n") - separator_count = sum(1 for line in lines if line.strip() == "---") - assert separator_count == 1 # Only location separator - - def test_format_unknown_extension(self): - """Uses empty code fence for unknown file extensions.""" - pytest.importorskip("pygls") - - from codexlens.lsp.providers import HoverInfo, HoverProvider - - info = HoverInfo( - name="func", - kind="function", - signature="func code here", - documentation=None, - file_path="/test/file.xyz", - line_range=(1, 2), - ) - mock_index = Mock() - provider = HoverProvider(mock_index, None) - - result = provider.format_hover_markdown(info) - - # Should have code fence without language specifier - assert "```\n" in result or "```xyz" not in result - - 
def test_format_class_symbol(self): - """Formats class symbol correctly.""" - pytest.importorskip("pygls") - - from codexlens.lsp.providers import HoverInfo, HoverProvider - - info = HoverInfo( - name="MyClass", - kind="class", - signature="class MyClass:", - documentation=None, - file_path="/test/file.py", - line_range=(1, 20), - ) - mock_index = Mock() - provider = HoverProvider(mock_index, None) - - result = provider.format_hover_markdown(info) - - assert "class MyClass:" in result - assert "*class*" in result - assert "line 1" in result - - def test_format_empty_file_path(self): - """Handles empty file path gracefully.""" - pytest.importorskip("pygls") - - from codexlens.lsp.providers import HoverInfo, HoverProvider - - info = HoverInfo( - name="func", - kind="function", - signature="def func():", - documentation=None, - file_path="", - line_range=(1, 2), - ) - mock_index = Mock() - provider = HoverProvider(mock_index, None) - - result = provider.format_hover_markdown(info) - - assert "unknown" in result or "```" in result - - -class TestHoverProviderRegistry: - """Test HoverProvider with registry integration.""" - - def test_provider_accepts_none_registry(self): - """HoverProvider works without registry.""" - pytest.importorskip("pygls") - - from codexlens.lsp.providers import HoverProvider - - mock_index = Mock() - mock_index.search.return_value = [] - - provider = HoverProvider(mock_index, None) - result = provider.get_hover_info("test") - - assert result is None - assert provider.registry is None - - def test_provider_stores_registry(self): - """HoverProvider stores registry reference.""" - pytest.importorskip("pygls") - - from codexlens.lsp.providers import HoverProvider - - mock_index = Mock() - mock_registry = Mock() - - provider = HoverProvider(mock_index, mock_registry) - - assert provider.global_index is mock_index - assert provider.registry is mock_registry diff --git a/codex-lens/tests/lsp/test_lsp_edge_cases.py 
b/codex-lens/tests/lsp/test_lsp_edge_cases.py deleted file mode 100644 index 796b28cd..00000000 --- a/codex-lens/tests/lsp/test_lsp_edge_cases.py +++ /dev/null @@ -1,101 +0,0 @@ -"""LSP Edge Case Tests. - -This module tests edge cases and error conditions in LSP (Language Server Protocol) -operations, including timeout handling, protocol errors, and connection failures. - -Test Coverage: -- Timeout scenarios for LSP operations -- Protocol errors and malformed responses -- Connection failures and recovery -- Concurrent request handling -""" - -import pytest -from pathlib import Path -from unittest.mock import Mock, patch, MagicMock -import time - - -class TestLSPTimeouts: - """Test timeout handling in LSP operations.""" - - def test_hover_request_timeout(self): - """Test that hover requests timeout appropriately after configured duration.""" - # This is a placeholder for actual timeout testing - # Implementation requires mocking LSP client with delayed response - pytest.skip("Requires LSP server fixture setup") - - def test_definition_request_timeout(self): - """Test that go-to-definition requests timeout appropriately.""" - pytest.skip("Requires LSP server fixture setup") - - def test_references_request_timeout(self): - """Test that find-references requests timeout appropriately.""" - pytest.skip("Requires LSP server fixture setup") - - def test_concurrent_requests_with_timeout(self): - """Test behavior when multiple requests exceed timeout threshold.""" - pytest.skip("Requires LSP server fixture setup") - - -class TestLSPProtocolErrors: - """Test handling of LSP protocol errors.""" - - def test_malformed_json_response(self): - """Test handling of malformed JSON in LSP responses.""" - pytest.skip("Requires LSP client fixture") - - def test_invalid_method_error(self): - """Test handling of unknown/invalid method calls.""" - pytest.skip("Requires LSP client fixture") - - def test_missing_required_params(self): - """Test handling of responses with missing required 
parameters.""" - pytest.skip("Requires LSP client fixture") - - def test_null_result_handling(self): - """Test that null results from LSP are handled gracefully.""" - pytest.skip("Requires LSP client fixture") - - -class TestLSPConnectionFailures: - """Test LSP connection failure scenarios.""" - - def test_server_not_found(self): - """Test behavior when LSP server is not available.""" - pytest.skip("Requires LSP client fixture") - - def test_connection_dropped_mid_request(self): - """Test handling of dropped connections during active requests.""" - pytest.skip("Requires LSP client fixture") - - def test_connection_retry_logic(self): - """Test that connection retry logic works as expected.""" - pytest.skip("Requires LSP client fixture") - - def test_server_startup_failure(self): - """Test handling of LSP server startup failures.""" - pytest.skip("Requires LSP server fixture") - - -class TestLSPResourceLimits: - """Test LSP behavior under resource constraints.""" - - def test_large_file_handling(self): - """Test LSP operations on very large source files.""" - pytest.skip("Requires test file fixtures") - - def test_memory_pressure(self): - """Test LSP behavior under memory pressure.""" - pytest.skip("Requires memory simulation") - - def test_concurrent_request_limits(self): - """Test handling of too many concurrent LSP requests.""" - pytest.skip("Requires LSP client fixture") - - -# TODO: Implement actual tests using pytest fixtures and LSP mock objects -# The test infrastructure needs to be set up with: -# - LSP server fixture (maybe using pygls test server) -# - LSP client fixture with configurable delays/errors -# - Test file fixtures with various code patterns diff --git a/codex-lens/tests/lsp/test_packaging_metadata.py b/codex-lens/tests/lsp/test_packaging_metadata.py deleted file mode 100644 index b51d0d50..00000000 --- a/codex-lens/tests/lsp/test_packaging_metadata.py +++ /dev/null @@ -1,27 +0,0 @@ -"""Packaging metadata tests for codex-lens (LSP/semantic 
extras).""" - -from __future__ import annotations - -from pathlib import Path - - -def _read_pyproject() -> str: - repo_root = Path(__file__).resolve().parents[2] - return (repo_root / "pyproject.toml").read_text(encoding="utf-8") - - -def test_lsp_script_entrypoint_points_to_server_main() -> None: - pyproject = _read_pyproject() - assert 'codexlens-lsp = "codexlens.lsp.server:main"' in pyproject - - -def test_semantic_extras_do_not_pin_yanked_fastembed_020() -> None: - pyproject = _read_pyproject() - assert "fastembed~=0.2.0" not in pyproject - assert "fastembed~=0.2.1" in pyproject - - -def test_click_dependency_is_explicitly_guarded() -> None: - pyproject = _read_pyproject() - assert "click>=8.0.0,<9" in pyproject - diff --git a/codex-lens/tests/lsp/test_references.py b/codex-lens/tests/lsp/test_references.py deleted file mode 100644 index 78e04081..00000000 --- a/codex-lens/tests/lsp/test_references.py +++ /dev/null @@ -1,497 +0,0 @@ -"""Tests for reference search functionality. - -This module tests the ReferenceResult dataclass and search_references method -in ChainSearchEngine, as well as the updated lsp_references handler. 
-""" - -from __future__ import annotations - -import pytest -from pathlib import Path -from unittest.mock import Mock, MagicMock, patch -import sqlite3 -import tempfile -import os - - -class TestReferenceResult: - """Test ReferenceResult dataclass.""" - - def test_reference_result_fields(self): - """ReferenceResult has all required fields.""" - from codexlens.search.chain_search import ReferenceResult - - ref = ReferenceResult( - file_path="/test/file.py", - line=10, - column=5, - context="def foo():", - relationship_type="call", - ) - assert ref.file_path == "/test/file.py" - assert ref.line == 10 - assert ref.column == 5 - assert ref.context == "def foo():" - assert ref.relationship_type == "call" - - def test_reference_result_with_empty_context(self): - """ReferenceResult can have empty context.""" - from codexlens.search.chain_search import ReferenceResult - - ref = ReferenceResult( - file_path="/test/file.py", - line=1, - column=0, - context="", - relationship_type="import", - ) - assert ref.context == "" - - def test_reference_result_different_relationship_types(self): - """ReferenceResult supports different relationship types.""" - from codexlens.search.chain_search import ReferenceResult - - types = ["call", "import", "inheritance", "implementation", "usage"] - for rel_type in types: - ref = ReferenceResult( - file_path="/test/file.py", - line=1, - column=0, - context="test", - relationship_type=rel_type, - ) - assert ref.relationship_type == rel_type - - -class TestExtractContext: - """Test the _extract_context helper method.""" - - def test_extract_context_middle_of_file(self): - """Extract context from middle of file.""" - from codexlens.search.chain_search import ChainSearchEngine, ReferenceResult - - content = "\n".join([ - "line 1", - "line 2", - "line 3", - "line 4", # target line - "line 5", - "line 6", - "line 7", - ]) - - # Create minimal mock engine to test _extract_context - mock_registry = Mock() - mock_mapper = Mock() - - engine = 
ChainSearchEngine(mock_registry, mock_mapper) - context = engine._extract_context(content, line=4, context_lines=2) - - assert "line 2" in context - assert "line 3" in context - assert "line 4" in context - assert "line 5" in context - assert "line 6" in context - - def test_extract_context_start_of_file(self): - """Extract context at start of file.""" - from codexlens.search.chain_search import ChainSearchEngine - - content = "\n".join([ - "line 1", # target - "line 2", - "line 3", - "line 4", - ]) - - mock_registry = Mock() - mock_mapper = Mock() - - engine = ChainSearchEngine(mock_registry, mock_mapper) - context = engine._extract_context(content, line=1, context_lines=2) - - assert "line 1" in context - assert "line 2" in context - assert "line 3" in context - - def test_extract_context_end_of_file(self): - """Extract context at end of file.""" - from codexlens.search.chain_search import ChainSearchEngine - - content = "\n".join([ - "line 1", - "line 2", - "line 3", - "line 4", # target - ]) - - mock_registry = Mock() - mock_mapper = Mock() - - engine = ChainSearchEngine(mock_registry, mock_mapper) - context = engine._extract_context(content, line=4, context_lines=2) - - assert "line 2" in context - assert "line 3" in context - assert "line 4" in context - - def test_extract_context_empty_content(self): - """Extract context from empty content.""" - from codexlens.search.chain_search import ChainSearchEngine - - mock_registry = Mock() - mock_mapper = Mock() - - engine = ChainSearchEngine(mock_registry, mock_mapper) - context = engine._extract_context("", line=1, context_lines=3) - - assert context == "" - - def test_extract_context_invalid_line(self): - """Extract context with invalid line number.""" - from codexlens.search.chain_search import ChainSearchEngine - - content = "line 1\nline 2\nline 3" - - mock_registry = Mock() - mock_mapper = Mock() - - engine = ChainSearchEngine(mock_registry, mock_mapper) - - # Line 0 (invalid) - assert 
engine._extract_context(content, line=0, context_lines=1) == "" - - # Line beyond end - assert engine._extract_context(content, line=100, context_lines=1) == "" - - -class TestSearchReferences: - """Test search_references method.""" - - def test_returns_empty_for_no_source_path_and_no_registry(self): - """Returns empty list when no source path and registry has no mappings.""" - from codexlens.search.chain_search import ChainSearchEngine - - mock_registry = Mock() - mock_registry.list_mappings.return_value = [] - mock_mapper = Mock() - - engine = ChainSearchEngine(mock_registry, mock_mapper) - results = engine.search_references("test_symbol") - - assert results == [] - - def test_returns_empty_for_no_indexes(self): - """Returns empty list when no indexes found.""" - from codexlens.search.chain_search import ChainSearchEngine - - mock_registry = Mock() - mock_mapper = Mock() - mock_mapper.source_to_index_db.return_value = Path("/nonexistent/_index.db") - - engine = ChainSearchEngine(mock_registry, mock_mapper) - - with patch.object(engine, "_find_start_index", return_value=None): - results = engine.search_references("test_symbol", Path("/some/path")) - - assert results == [] - - def test_deduplicates_results(self): - """Removes duplicate file:line references.""" - from codexlens.search.chain_search import ChainSearchEngine, ReferenceResult - - mock_registry = Mock() - mock_mapper = Mock() - - engine = ChainSearchEngine(mock_registry, mock_mapper) - - # Create a temporary database with duplicate relationships - with tempfile.TemporaryDirectory() as tmpdir: - db_path = Path(tmpdir) / "_index.db" - conn = sqlite3.connect(str(db_path)) - conn.executescript(""" - CREATE TABLE files ( - id INTEGER PRIMARY KEY, - path TEXT NOT NULL, - language TEXT NOT NULL, - content TEXT NOT NULL - ); - CREATE TABLE symbols ( - id INTEGER PRIMARY KEY, - file_id INTEGER NOT NULL, - name TEXT NOT NULL, - kind TEXT NOT NULL, - start_line INTEGER NOT NULL, - end_line INTEGER NOT NULL - ); - 
CREATE TABLE code_relationships ( - id INTEGER PRIMARY KEY, - source_symbol_id INTEGER NOT NULL, - target_qualified_name TEXT NOT NULL, - relationship_type TEXT NOT NULL, - source_line INTEGER NOT NULL, - target_file TEXT - ); - - INSERT INTO files VALUES (1, '/test/file.py', 'python', 'def test(): pass'); - INSERT INTO symbols VALUES (1, 1, 'test_func', 'function', 1, 1); - INSERT INTO code_relationships VALUES (1, 1, 'target_func', 'call', 10, NULL); - INSERT INTO code_relationships VALUES (2, 1, 'target_func', 'call', 10, NULL); - """) - conn.commit() - conn.close() - - with patch.object(engine, "_find_start_index", return_value=db_path): - with patch.object(engine, "_collect_index_paths", return_value=[db_path]): - results = engine.search_references("target_func", Path(tmpdir)) - - # Should only have 1 result due to deduplication - assert len(results) == 1 - assert results[0].line == 10 - - def test_sorts_by_file_and_line(self): - """Results sorted by file path then line number.""" - from codexlens.search.chain_search import ChainSearchEngine, ReferenceResult - - mock_registry = Mock() - mock_mapper = Mock() - - engine = ChainSearchEngine(mock_registry, mock_mapper) - - with tempfile.TemporaryDirectory() as tmpdir: - db_path = Path(tmpdir) / "_index.db" - conn = sqlite3.connect(str(db_path)) - conn.executescript(""" - CREATE TABLE files ( - id INTEGER PRIMARY KEY, - path TEXT NOT NULL, - language TEXT NOT NULL, - content TEXT NOT NULL - ); - CREATE TABLE symbols ( - id INTEGER PRIMARY KEY, - file_id INTEGER NOT NULL, - name TEXT NOT NULL, - kind TEXT NOT NULL, - start_line INTEGER NOT NULL, - end_line INTEGER NOT NULL - ); - CREATE TABLE code_relationships ( - id INTEGER PRIMARY KEY, - source_symbol_id INTEGER NOT NULL, - target_qualified_name TEXT NOT NULL, - relationship_type TEXT NOT NULL, - source_line INTEGER NOT NULL, - target_file TEXT - ); - - INSERT INTO files VALUES (1, '/test/b_file.py', 'python', 'content'); - INSERT INTO files VALUES (2, 
'/test/a_file.py', 'python', 'content'); - INSERT INTO symbols VALUES (1, 1, 'func1', 'function', 1, 1); - INSERT INTO symbols VALUES (2, 2, 'func2', 'function', 1, 1); - INSERT INTO code_relationships VALUES (1, 1, 'target', 'call', 20, NULL); - INSERT INTO code_relationships VALUES (2, 1, 'target', 'call', 10, NULL); - INSERT INTO code_relationships VALUES (3, 2, 'target', 'call', 5, NULL); - """) - conn.commit() - conn.close() - - with patch.object(engine, "_find_start_index", return_value=db_path): - with patch.object(engine, "_collect_index_paths", return_value=[db_path]): - results = engine.search_references("target", Path(tmpdir)) - - # Should be sorted: a_file.py:5, b_file.py:10, b_file.py:20 - assert len(results) == 3 - assert results[0].file_path == "/test/a_file.py" - assert results[0].line == 5 - assert results[1].file_path == "/test/b_file.py" - assert results[1].line == 10 - assert results[2].file_path == "/test/b_file.py" - assert results[2].line == 20 - - def test_respects_limit(self): - """Returns at most limit results.""" - from codexlens.search.chain_search import ChainSearchEngine - - mock_registry = Mock() - mock_mapper = Mock() - - engine = ChainSearchEngine(mock_registry, mock_mapper) - - with tempfile.TemporaryDirectory() as tmpdir: - db_path = Path(tmpdir) / "_index.db" - conn = sqlite3.connect(str(db_path)) - conn.executescript(""" - CREATE TABLE files ( - id INTEGER PRIMARY KEY, - path TEXT NOT NULL, - language TEXT NOT NULL, - content TEXT NOT NULL - ); - CREATE TABLE symbols ( - id INTEGER PRIMARY KEY, - file_id INTEGER NOT NULL, - name TEXT NOT NULL, - kind TEXT NOT NULL, - start_line INTEGER NOT NULL, - end_line INTEGER NOT NULL - ); - CREATE TABLE code_relationships ( - id INTEGER PRIMARY KEY, - source_symbol_id INTEGER NOT NULL, - target_qualified_name TEXT NOT NULL, - relationship_type TEXT NOT NULL, - source_line INTEGER NOT NULL, - target_file TEXT - ); - - INSERT INTO files VALUES (1, '/test/file.py', 'python', 'content'); - 
INSERT INTO symbols VALUES (1, 1, 'func', 'function', 1, 1); - """) - # Insert many relationships - for i in range(50): - conn.execute( - "INSERT INTO code_relationships VALUES (?, 1, 'target', 'call', ?, NULL)", - (i + 1, i + 1) - ) - conn.commit() - conn.close() - - with patch.object(engine, "_find_start_index", return_value=db_path): - with patch.object(engine, "_collect_index_paths", return_value=[db_path]): - results = engine.search_references("target", Path(tmpdir), limit=10) - - assert len(results) == 10 - - def test_matches_qualified_name(self): - """Matches symbols by qualified name suffix.""" - from codexlens.search.chain_search import ChainSearchEngine - - mock_registry = Mock() - mock_mapper = Mock() - - engine = ChainSearchEngine(mock_registry, mock_mapper) - - with tempfile.TemporaryDirectory() as tmpdir: - db_path = Path(tmpdir) / "_index.db" - conn = sqlite3.connect(str(db_path)) - conn.executescript(""" - CREATE TABLE files ( - id INTEGER PRIMARY KEY, - path TEXT NOT NULL, - language TEXT NOT NULL, - content TEXT NOT NULL - ); - CREATE TABLE symbols ( - id INTEGER PRIMARY KEY, - file_id INTEGER NOT NULL, - name TEXT NOT NULL, - kind TEXT NOT NULL, - start_line INTEGER NOT NULL, - end_line INTEGER NOT NULL - ); - CREATE TABLE code_relationships ( - id INTEGER PRIMARY KEY, - source_symbol_id INTEGER NOT NULL, - target_qualified_name TEXT NOT NULL, - relationship_type TEXT NOT NULL, - source_line INTEGER NOT NULL, - target_file TEXT - ); - - INSERT INTO files VALUES (1, '/test/file.py', 'python', 'content'); - INSERT INTO symbols VALUES (1, 1, 'caller', 'function', 1, 1); - -- Fully qualified name - INSERT INTO code_relationships VALUES (1, 1, 'module.submodule.target_func', 'call', 10, NULL); - -- Simple name - INSERT INTO code_relationships VALUES (2, 1, 'target_func', 'call', 20, NULL); - """) - conn.commit() - conn.close() - - with patch.object(engine, "_find_start_index", return_value=db_path): - with patch.object(engine, "_collect_index_paths", 
return_value=[db_path]): - results = engine.search_references("target_func", Path(tmpdir)) - - # Should find both references - assert len(results) == 2 - - -class TestLspReferencesHandler: - """Test the LSP references handler.""" - - def test_handler_uses_search_engine(self): - """Handler uses search_engine.search_references when available.""" - pytest.importorskip("pygls") - pytest.importorskip("lsprotocol") - - from lsprotocol import types as lsp - from codexlens.lsp.handlers import _path_to_uri - from codexlens.search.chain_search import ReferenceResult - - # Create mock references - mock_references = [ - ReferenceResult( - file_path="/test/file1.py", - line=10, - column=5, - context="def foo():", - relationship_type="call", - ), - ReferenceResult( - file_path="/test/file2.py", - line=20, - column=0, - context="import foo", - relationship_type="import", - ), - ] - - # Verify conversion to LSP Location - locations = [] - for ref in mock_references: - locations.append( - lsp.Location( - uri=_path_to_uri(ref.file_path), - range=lsp.Range( - start=lsp.Position( - line=max(0, ref.line - 1), - character=ref.column, - ), - end=lsp.Position( - line=max(0, ref.line - 1), - character=ref.column + len("foo"), - ), - ), - ) - ) - - assert len(locations) == 2 - # First reference at line 10 (0-indexed = 9) - assert locations[0].range.start.line == 9 - assert locations[0].range.start.character == 5 - # Second reference at line 20 (0-indexed = 19) - assert locations[1].range.start.line == 19 - assert locations[1].range.start.character == 0 - - def test_handler_falls_back_to_global_index(self): - """Handler falls back to global_index when search_engine unavailable.""" - pytest.importorskip("pygls") - pytest.importorskip("lsprotocol") - - from codexlens.lsp.handlers import symbol_to_location - from codexlens.entities import Symbol - - # Test fallback path converts Symbol to Location - symbol = Symbol( - name="test_func", - kind="function", - range=(10, 15), - 
file="/test/file.py", - ) - - location = symbol_to_location(symbol) - assert location is not None - # LSP uses 0-based lines - assert location.range.start.line == 9 - assert location.range.end.line == 14 diff --git a/codex-lens/tests/lsp/test_server.py b/codex-lens/tests/lsp/test_server.py deleted file mode 100644 index a1b8000d..00000000 --- a/codex-lens/tests/lsp/test_server.py +++ /dev/null @@ -1,210 +0,0 @@ -"""Tests for codex-lens LSP server.""" - -from __future__ import annotations - -import pytest -from pathlib import Path -from unittest.mock import MagicMock, patch - -from codexlens.entities import Symbol - - -class TestCodexLensLanguageServer: - """Tests for CodexLensLanguageServer.""" - - def test_server_import(self): - """Test that server module can be imported.""" - pytest.importorskip("pygls") - pytest.importorskip("lsprotocol") - - from codexlens.lsp.server import CodexLensLanguageServer, server - - assert CodexLensLanguageServer is not None - assert server is not None - assert server.name == "codexlens-lsp" - - def test_server_initialization(self): - """Test server instance creation.""" - pytest.importorskip("pygls") - - from codexlens.lsp.server import CodexLensLanguageServer - - ls = CodexLensLanguageServer() - assert ls.registry is None - assert ls.mapper is None - assert ls.global_index is None - assert ls.search_engine is None - assert ls.workspace_root is None - - -class TestDefinitionHandler: - """Tests for definition handler.""" - - def test_definition_lookup(self): - """Test definition lookup returns location for known symbol.""" - pytest.importorskip("pygls") - pytest.importorskip("lsprotocol") - - from lsprotocol import types as lsp - from codexlens.lsp.handlers import symbol_to_location - - symbol = Symbol( - name="test_function", - kind="function", - range=(10, 15), - file="/path/to/file.py", - ) - - location = symbol_to_location(symbol) - - assert location is not None - assert isinstance(location, lsp.Location) - # LSP uses 0-based 
lines - assert location.range.start.line == 9 - assert location.range.end.line == 14 - - def test_definition_no_file(self): - """Test definition lookup returns None for symbol without file.""" - pytest.importorskip("pygls") - - from codexlens.lsp.handlers import symbol_to_location - - symbol = Symbol( - name="test_function", - kind="function", - range=(10, 15), - file=None, - ) - - location = symbol_to_location(symbol) - assert location is None - - -class TestCompletionHandler: - """Tests for completion handler.""" - - def test_get_prefix_at_position(self): - """Test extracting prefix at cursor position.""" - pytest.importorskip("pygls") - - from codexlens.lsp.handlers import _get_prefix_at_position - - document_text = "def hello_world():\n print(hel" - - # Cursor at end of "hel" - prefix = _get_prefix_at_position(document_text, 1, 14) - assert prefix == "hel" - - # Cursor at beginning of line (after whitespace) - prefix = _get_prefix_at_position(document_text, 1, 4) - assert prefix == "" - - # Cursor after "he" in "hello_world" - returns text before cursor - prefix = _get_prefix_at_position(document_text, 0, 6) - assert prefix == "he" - - # Cursor at end of "hello_world" - prefix = _get_prefix_at_position(document_text, 0, 15) - assert prefix == "hello_world" - - def test_get_word_at_position(self): - """Test extracting word at cursor position.""" - pytest.importorskip("pygls") - - from codexlens.lsp.handlers import _get_word_at_position - - document_text = "def hello_world():\n print(msg)" - - # Cursor on "hello_world" - word = _get_word_at_position(document_text, 0, 6) - assert word == "hello_world" - - # Cursor on "print" - word = _get_word_at_position(document_text, 1, 6) - assert word == "print" - - # Cursor on "msg" - word = _get_word_at_position(document_text, 1, 11) - assert word == "msg" - - def test_symbol_kind_mapping(self): - """Test symbol kind to completion kind mapping.""" - pytest.importorskip("pygls") - pytest.importorskip("lsprotocol") - - from 
lsprotocol import types as lsp - from codexlens.lsp.handlers import _symbol_kind_to_completion_kind - - assert _symbol_kind_to_completion_kind("function") == lsp.CompletionItemKind.Function - assert _symbol_kind_to_completion_kind("class") == lsp.CompletionItemKind.Class - assert _symbol_kind_to_completion_kind("method") == lsp.CompletionItemKind.Method - assert _symbol_kind_to_completion_kind("variable") == lsp.CompletionItemKind.Variable - - # Unknown kind should default to Text - assert _symbol_kind_to_completion_kind("unknown") == lsp.CompletionItemKind.Text - - -class TestWorkspaceSymbolHandler: - """Tests for workspace symbol handler.""" - - def test_symbol_kind_to_lsp(self): - """Test symbol kind to LSP SymbolKind mapping.""" - pytest.importorskip("pygls") - pytest.importorskip("lsprotocol") - - from lsprotocol import types as lsp - from codexlens.lsp.handlers import _symbol_kind_to_lsp - - assert _symbol_kind_to_lsp("function") == lsp.SymbolKind.Function - assert _symbol_kind_to_lsp("class") == lsp.SymbolKind.Class - assert _symbol_kind_to_lsp("method") == lsp.SymbolKind.Method - assert _symbol_kind_to_lsp("interface") == lsp.SymbolKind.Interface - - # Unknown kind should default to Variable - assert _symbol_kind_to_lsp("unknown") == lsp.SymbolKind.Variable - - -class TestUriConversion: - """Tests for URI path conversion.""" - - def test_path_to_uri(self): - """Test path to URI conversion.""" - pytest.importorskip("pygls") - - from codexlens.lsp.handlers import _path_to_uri - - # Unix path - uri = _path_to_uri("/home/user/file.py") - assert uri.startswith("file://") - assert "file.py" in uri - - def test_uri_to_path(self): - """Test URI to path conversion.""" - pytest.importorskip("pygls") - - from codexlens.lsp.handlers import _uri_to_path - - # Basic URI - path = _uri_to_path("file:///home/user/file.py") - assert path.name == "file.py" - - -class TestMainEntryPoint: - """Tests for main entry point.""" - - def test_main_help(self): - """Test that main 
shows help without errors.""" - pytest.importorskip("pygls") - - import sys - from unittest.mock import patch - - # Patch sys.argv to show help - with patch.object(sys, 'argv', ['codexlens-lsp', '--help']): - from codexlens.lsp.server import main - - with pytest.raises(SystemExit) as exc_info: - main() - - # Help exits with 0 - assert exc_info.value.code == 0 diff --git a/codex-lens/tests/lsp/test_standalone_manager_defaults.py b/codex-lens/tests/lsp/test_standalone_manager_defaults.py deleted file mode 100644 index fe0a9cb6..00000000 --- a/codex-lens/tests/lsp/test_standalone_manager_defaults.py +++ /dev/null @@ -1,31 +0,0 @@ -"""Tests for StandaloneLspManager default config behavior.""" - -from __future__ import annotations - -import asyncio -import logging -from pathlib import Path - -import pytest - -from codexlens.lsp.standalone_manager import StandaloneLspManager - - -def test_loads_builtin_defaults_when_no_config_found( - tmp_path: Path, caplog: pytest.LogCaptureFixture -) -> None: - manager = StandaloneLspManager(workspace_root=str(tmp_path)) - - with caplog.at_level(logging.INFO): - asyncio.run(manager.start()) - - assert manager._configs # type: ignore[attr-defined] - assert manager.get_language_id(str(tmp_path / "example.py")) == "python" - - expected_root = str(tmp_path / "lsp-servers.json") - expected_codexlens = str(tmp_path / ".codexlens" / "lsp-servers.json") - - assert "using built-in defaults" in caplog.text.lower() - assert expected_root in caplog.text - assert expected_codexlens in caplog.text - diff --git a/codex-lens/tests/lsp/test_standalone_manager_paths.py b/codex-lens/tests/lsp/test_standalone_manager_paths.py deleted file mode 100644 index 39b74584..00000000 --- a/codex-lens/tests/lsp/test_standalone_manager_paths.py +++ /dev/null @@ -1,48 +0,0 @@ -"""Tests for StandaloneLspManager path normalization (Windows URI handling).""" - -from __future__ import annotations - -import platform - -from codexlens.lsp.standalone_manager import 
StandaloneLspManager - - -def test_normalize_file_uri_percent_encoded_windows_drive() -> None: - if platform.system() != "Windows": - return - - manager = StandaloneLspManager(workspace_root="D:/Claude_dms3/codex-lens") - - raw = "file:///d%3A/Claude_dms3/codex-lens/src/codexlens/lsp/standalone_manager.py" - normalized = manager._normalize_file_path(raw) - - assert normalized.lower().startswith("d:/") - assert "%3a" not in normalized.lower() - assert "d%3a" not in normalized.lower() - assert "/d%3a" not in normalized.lower() - - -def test_normalize_uri_path_percent_encoded_windows_drive() -> None: - if platform.system() != "Windows": - return - - manager = StandaloneLspManager(workspace_root="D:/Claude_dms3/codex-lens") - - raw = "/d%3A/Claude_dms3/codex-lens/src/codexlens/lsp/standalone_manager.py" - normalized = manager._normalize_file_path(raw) - - assert normalized.lower().startswith("d:/") - assert "%3a" not in normalized.lower() - - -def test_normalize_plain_windows_path_is_unchanged() -> None: - if platform.system() != "Windows": - return - - manager = StandaloneLspManager(workspace_root="D:/Claude_dms3/codex-lens") - - raw = r"D:\Claude_dms3\codex-lens\src\codexlens\lsp\standalone_manager.py" - normalized = manager._normalize_file_path(raw) - - assert normalized == raw - diff --git a/codex-lens/tests/mcp/__init__.py b/codex-lens/tests/mcp/__init__.py deleted file mode 100644 index 2fa2b8ff..00000000 --- a/codex-lens/tests/mcp/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Tests for MCP (Model Context Protocol) module.""" diff --git a/codex-lens/tests/mcp/test_hooks.py b/codex-lens/tests/mcp/test_hooks.py deleted file mode 100644 index 4a650ed1..00000000 --- a/codex-lens/tests/mcp/test_hooks.py +++ /dev/null @@ -1,208 +0,0 @@ -"""Tests for MCP hooks module.""" - -import pytest -from unittest.mock import Mock, patch -from pathlib import Path - -from codexlens.mcp.hooks import HookManager, create_context_for_prompt -from codexlens.mcp.schema import MCPContext, 
SymbolInfo - - -class TestHookManager: - """Test HookManager class.""" - - @pytest.fixture - def mock_provider(self): - """Create a mock MCP provider.""" - provider = Mock() - provider.build_context.return_value = MCPContext( - symbol=SymbolInfo("test_func", "function", "/test.py", 1, 10), - context_type="symbol_explanation", - ) - provider.build_context_for_file.return_value = MCPContext( - context_type="file_overview", - ) - return provider - - @pytest.fixture - def hook_manager(self, mock_provider): - """Create a HookManager with mocked provider.""" - return HookManager(mock_provider) - - def test_default_hooks_registered(self, hook_manager): - """Default hooks are registered on initialization.""" - assert "explain" in hook_manager._pre_hooks - assert "refactor" in hook_manager._pre_hooks - assert "document" in hook_manager._pre_hooks - - def test_execute_pre_hook_returns_context(self, hook_manager, mock_provider): - """execute_pre_hook returns MCPContext for registered hook.""" - result = hook_manager.execute_pre_hook("explain", {"symbol": "my_func"}) - - assert result is not None - assert isinstance(result, MCPContext) - mock_provider.build_context.assert_called_once() - - def test_execute_pre_hook_returns_none_for_unknown_action(self, hook_manager): - """execute_pre_hook returns None for unregistered action.""" - result = hook_manager.execute_pre_hook("unknown_action", {"symbol": "test"}) - - assert result is None - - def test_execute_pre_hook_handles_exception(self, hook_manager, mock_provider): - """execute_pre_hook handles provider exceptions gracefully.""" - mock_provider.build_context.side_effect = Exception("Provider failed") - - result = hook_manager.execute_pre_hook("explain", {"symbol": "my_func"}) - - assert result is None - - def test_execute_post_hook_no_error_for_unregistered(self, hook_manager): - """execute_post_hook doesn't error for unregistered action.""" - # Should not raise - hook_manager.execute_post_hook("unknown", {"result": "data"}) - 
- def test_pre_explain_hook_calls_build_context(self, hook_manager, mock_provider): - """_pre_explain_hook calls build_context correctly.""" - hook_manager.execute_pre_hook("explain", {"symbol": "my_func"}) - - mock_provider.build_context.assert_called_with( - symbol_name="my_func", - context_type="symbol_explanation", - include_references=True, - include_related=True, - ) - - def test_pre_explain_hook_returns_none_without_symbol(self, hook_manager, mock_provider): - """_pre_explain_hook returns None when symbol param missing.""" - result = hook_manager.execute_pre_hook("explain", {}) - - assert result is None - mock_provider.build_context.assert_not_called() - - def test_pre_refactor_hook_calls_build_context(self, hook_manager, mock_provider): - """_pre_refactor_hook calls build_context with refactor settings.""" - hook_manager.execute_pre_hook("refactor", {"symbol": "my_class"}) - - mock_provider.build_context.assert_called_with( - symbol_name="my_class", - context_type="refactor_context", - include_references=True, - include_related=True, - max_references=20, - ) - - def test_pre_refactor_hook_returns_none_without_symbol(self, hook_manager, mock_provider): - """_pre_refactor_hook returns None when symbol param missing.""" - result = hook_manager.execute_pre_hook("refactor", {}) - - assert result is None - mock_provider.build_context.assert_not_called() - - def test_pre_document_hook_with_symbol(self, hook_manager, mock_provider): - """_pre_document_hook uses build_context when symbol provided.""" - hook_manager.execute_pre_hook("document", {"symbol": "my_func"}) - - mock_provider.build_context.assert_called_with( - symbol_name="my_func", - context_type="documentation_context", - include_references=False, - include_related=True, - ) - - def test_pre_document_hook_with_file_path(self, hook_manager, mock_provider): - """_pre_document_hook uses build_context_for_file when file_path provided.""" - hook_manager.execute_pre_hook("document", {"file_path": 
"/src/module.py"}) - - mock_provider.build_context_for_file.assert_called_once() - call_args = mock_provider.build_context_for_file.call_args - assert call_args[0][0] == Path("/src/module.py") - assert call_args[1].get("context_type") == "file_documentation" - - def test_pre_document_hook_prefers_symbol_over_file(self, hook_manager, mock_provider): - """_pre_document_hook prefers symbol when both provided.""" - hook_manager.execute_pre_hook( - "document", {"symbol": "my_func", "file_path": "/src/module.py"} - ) - - mock_provider.build_context.assert_called_once() - mock_provider.build_context_for_file.assert_not_called() - - def test_pre_document_hook_returns_none_without_params(self, hook_manager, mock_provider): - """_pre_document_hook returns None when neither symbol nor file_path provided.""" - result = hook_manager.execute_pre_hook("document", {}) - - assert result is None - mock_provider.build_context.assert_not_called() - mock_provider.build_context_for_file.assert_not_called() - - def test_register_pre_hook(self, hook_manager): - """register_pre_hook adds custom hook.""" - custom_hook = Mock(return_value=MCPContext()) - - hook_manager.register_pre_hook("custom_action", custom_hook) - - assert "custom_action" in hook_manager._pre_hooks - hook_manager.execute_pre_hook("custom_action", {"data": "value"}) - custom_hook.assert_called_once_with({"data": "value"}) - - def test_register_post_hook(self, hook_manager): - """register_post_hook adds custom hook.""" - custom_hook = Mock() - - hook_manager.register_post_hook("custom_action", custom_hook) - - assert "custom_action" in hook_manager._post_hooks - hook_manager.execute_post_hook("custom_action", {"result": "data"}) - custom_hook.assert_called_once_with({"result": "data"}) - - def test_execute_post_hook_handles_exception(self, hook_manager): - """execute_post_hook handles hook exceptions gracefully.""" - failing_hook = Mock(side_effect=Exception("Hook failed")) - hook_manager.register_post_hook("failing", 
failing_hook) - - # Should not raise - hook_manager.execute_post_hook("failing", {"data": "value"}) - - -class TestCreateContextForPrompt: - """Test create_context_for_prompt function.""" - - def test_returns_prompt_injection_string(self): - """create_context_for_prompt returns formatted string.""" - mock_provider = Mock() - mock_provider.build_context.return_value = MCPContext( - symbol=SymbolInfo("test_func", "function", "/test.py", 1, 10), - definition="def test_func(): pass", - ) - - result = create_context_for_prompt( - mock_provider, "explain", {"symbol": "test_func"} - ) - - assert isinstance(result, str) - assert "" in result - assert "test_func" in result - assert "" in result - - def test_returns_empty_string_when_no_context(self): - """create_context_for_prompt returns empty string when no context built.""" - mock_provider = Mock() - mock_provider.build_context.return_value = None - - result = create_context_for_prompt( - mock_provider, "explain", {"symbol": "nonexistent"} - ) - - assert result == "" - - def test_returns_empty_string_for_unknown_action(self): - """create_context_for_prompt returns empty string for unregistered action.""" - mock_provider = Mock() - - result = create_context_for_prompt( - mock_provider, "unknown_action", {"data": "value"} - ) - - assert result == "" - mock_provider.build_context.assert_not_called() diff --git a/codex-lens/tests/mcp/test_provider.py b/codex-lens/tests/mcp/test_provider.py deleted file mode 100644 index 4f5004a6..00000000 --- a/codex-lens/tests/mcp/test_provider.py +++ /dev/null @@ -1,383 +0,0 @@ -"""Tests for MCP provider.""" - -import pytest -from unittest.mock import Mock, MagicMock, patch -from pathlib import Path -import tempfile -import os - -from codexlens.mcp.provider import MCPProvider -from codexlens.mcp.schema import MCPContext, SymbolInfo, ReferenceInfo - - -class TestMCPProvider: - """Test MCPProvider class.""" - - @pytest.fixture - def mock_global_index(self): - """Create a mock global 
index.""" - return Mock() - - @pytest.fixture - def mock_search_engine(self): - """Create a mock search engine.""" - return Mock() - - @pytest.fixture - def mock_registry(self): - """Create a mock registry.""" - return Mock() - - @pytest.fixture - def provider(self, mock_global_index, mock_search_engine, mock_registry): - """Create an MCPProvider with mocked dependencies.""" - return MCPProvider(mock_global_index, mock_search_engine, mock_registry) - - def test_build_context_returns_none_for_unknown_symbol(self, provider, mock_global_index): - """build_context returns None when symbol is not found.""" - mock_global_index.search.return_value = [] - - result = provider.build_context("unknown_symbol") - - assert result is None - mock_global_index.search.assert_called_once_with( - "unknown_symbol", prefix_mode=False, limit=1 - ) - - def test_build_context_returns_mcp_context( - self, provider, mock_global_index, mock_search_engine - ): - """build_context returns MCPContext for known symbol.""" - mock_symbol = Mock() - mock_symbol.name = "my_func" - mock_symbol.kind = "function" - mock_symbol.file = "/test.py" - mock_symbol.range = (10, 20) - - mock_global_index.search.return_value = [mock_symbol] - mock_search_engine.search_references.return_value = [] - - result = provider.build_context("my_func") - - assert result is not None - assert isinstance(result, MCPContext) - assert result.symbol is not None - assert result.symbol.name == "my_func" - assert result.symbol.kind == "function" - assert result.context_type == "symbol_explanation" - - def test_build_context_with_custom_context_type( - self, provider, mock_global_index, mock_search_engine - ): - """build_context respects custom context_type.""" - mock_symbol = Mock() - mock_symbol.name = "my_func" - mock_symbol.kind = "function" - mock_symbol.file = "/test.py" - mock_symbol.range = (10, 20) - - mock_global_index.search.return_value = [mock_symbol] - mock_search_engine.search_references.return_value = [] - - result = 
provider.build_context("my_func", context_type="refactor_context") - - assert result is not None - assert result.context_type == "refactor_context" - - def test_build_context_includes_references( - self, provider, mock_global_index, mock_search_engine - ): - """build_context includes references when include_references=True.""" - mock_symbol = Mock() - mock_symbol.name = "my_func" - mock_symbol.kind = "function" - mock_symbol.file = "/test.py" - mock_symbol.range = (10, 20) - - mock_ref = Mock() - mock_ref.file_path = "/caller.py" - mock_ref.line = 25 - mock_ref.column = 4 - mock_ref.context = "result = my_func()" - mock_ref.relationship_type = "call" - - mock_global_index.search.return_value = [mock_symbol] - mock_search_engine.search_references.return_value = [mock_ref] - - result = provider.build_context("my_func", include_references=True) - - assert result is not None - assert len(result.references) == 1 - assert result.references[0].file_path == "/caller.py" - assert result.references[0].line == 25 - assert result.references[0].relationship_type == "call" - - def test_build_context_excludes_references_when_disabled( - self, provider, mock_global_index, mock_search_engine - ): - """build_context excludes references when include_references=False.""" - mock_symbol = Mock() - mock_symbol.name = "my_func" - mock_symbol.kind = "function" - mock_symbol.file = "/test.py" - mock_symbol.range = (10, 20) - - mock_global_index.search.return_value = [mock_symbol] - mock_search_engine.search_references.return_value = [] - - # Disable both references and related to avoid any search_references calls - result = provider.build_context( - "my_func", include_references=False, include_related=False - ) - - assert result is not None - assert len(result.references) == 0 - mock_search_engine.search_references.assert_not_called() - - def test_build_context_respects_max_references( - self, provider, mock_global_index, mock_search_engine - ): - """build_context passes max_references to 
search engine.""" - mock_symbol = Mock() - mock_symbol.name = "my_func" - mock_symbol.kind = "function" - mock_symbol.file = "/test.py" - mock_symbol.range = (10, 20) - - mock_global_index.search.return_value = [mock_symbol] - mock_search_engine.search_references.return_value = [] - - # Disable include_related to test only the references call - provider.build_context("my_func", max_references=5, include_related=False) - - mock_search_engine.search_references.assert_called_once_with( - "my_func", limit=5 - ) - - def test_build_context_includes_metadata( - self, provider, mock_global_index, mock_search_engine - ): - """build_context includes source metadata.""" - mock_symbol = Mock() - mock_symbol.name = "my_func" - mock_symbol.kind = "function" - mock_symbol.file = "/test.py" - mock_symbol.range = (10, 20) - - mock_global_index.search.return_value = [mock_symbol] - mock_search_engine.search_references.return_value = [] - - result = provider.build_context("my_func") - - assert result is not None - assert result.metadata.get("source") == "codex-lens" - - def test_extract_definition_with_valid_file(self, provider): - """_extract_definition reads file content correctly.""" - # Create a temporary file with some content - with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f: - f.write("# Line 1\n") - f.write("# Line 2\n") - f.write("def my_func():\n") # Line 3 - f.write(" pass\n") # Line 4 - f.write("# Line 5\n") - temp_path = f.name - - try: - mock_symbol = Mock() - mock_symbol.file = temp_path - mock_symbol.range = (3, 4) # 1-based line numbers - - definition = provider._extract_definition(mock_symbol) - - assert definition is not None - assert "def my_func():" in definition - assert "pass" in definition - finally: - os.unlink(temp_path) - - def test_extract_definition_returns_none_for_missing_file(self, provider): - """_extract_definition returns None for non-existent file.""" - mock_symbol = Mock() - mock_symbol.file = 
"/nonexistent/path/file.py" - mock_symbol.range = (1, 5) - - definition = provider._extract_definition(mock_symbol) - - assert definition is None - - def test_extract_definition_returns_none_for_none_file(self, provider): - """_extract_definition returns None when symbol.file is None.""" - mock_symbol = Mock() - mock_symbol.file = None - mock_symbol.range = (1, 5) - - definition = provider._extract_definition(mock_symbol) - - assert definition is None - - def test_build_context_for_file_returns_context( - self, provider, mock_global_index - ): - """build_context_for_file returns MCPContext.""" - mock_global_index.search.return_value = [] - - result = provider.build_context_for_file( - Path("/test/file.py"), - context_type="file_overview", - ) - - assert result is not None - assert isinstance(result, MCPContext) - assert result.context_type == "file_overview" - assert result.metadata.get("file_path") == str(Path("/test/file.py")) - - def test_build_context_for_file_includes_symbols( - self, provider, mock_global_index - ): - """build_context_for_file includes symbols from the file.""" - # Create temp file to get resolved path - with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f: - f.write("def func(): pass\n") - temp_path = f.name - - try: - mock_symbol = Mock() - mock_symbol.name = "func" - mock_symbol.kind = "function" - mock_symbol.file = temp_path - mock_symbol.range = (1, 1) - - mock_global_index.search.return_value = [mock_symbol] - - result = provider.build_context_for_file(Path(temp_path)) - - assert result is not None - # Symbols from this file should be in related_symbols - assert len(result.related_symbols) >= 0 # May be 0 if filtering doesn't match - finally: - os.unlink(temp_path) - - -class TestMCPProviderRelatedSymbols: - """Test related symbols functionality.""" - - @pytest.fixture - def provider(self): - """Create provider with mocks.""" - mock_global_index = Mock() - mock_search_engine = Mock() - mock_registry = Mock() - 
return MCPProvider(mock_global_index, mock_search_engine, mock_registry) - - def test_get_related_symbols_from_references(self, provider): - """_get_related_symbols extracts symbols from references.""" - mock_symbol = Mock() - mock_symbol.name = "my_func" - mock_symbol.file = "/test.py" - - mock_ref1 = Mock() - mock_ref1.file_path = "/caller1.py" - mock_ref1.relationship_type = "call" - - mock_ref2 = Mock() - mock_ref2.file_path = "/caller2.py" - mock_ref2.relationship_type = "import" - - provider.search_engine.search_references.return_value = [mock_ref1, mock_ref2] - - related = provider._get_related_symbols(mock_symbol) - - assert len(related) == 2 - assert related[0].relationship == "call" - assert related[1].relationship == "import" - - def test_get_related_symbols_limits_results(self, provider): - """_get_related_symbols limits to 10 unique relationship types.""" - mock_symbol = Mock() - mock_symbol.name = "my_func" - mock_symbol.file = "/test.py" - - # Create 15 references with unique relationship types - refs = [] - for i in range(15): - ref = Mock() - ref.file_path = f"/file{i}.py" - ref.relationship_type = f"type{i}" - refs.append(ref) - - provider.search_engine.search_references.return_value = refs - - related = provider._get_related_symbols(mock_symbol) - - assert len(related) <= 10 - - def test_get_related_symbols_handles_exception(self, provider): - """_get_related_symbols handles exceptions gracefully.""" - mock_symbol = Mock() - mock_symbol.name = "my_func" - mock_symbol.file = "/test.py" - - provider.search_engine.search_references.side_effect = Exception("Search failed") - - related = provider._get_related_symbols(mock_symbol) - - assert related == [] - - -class TestMCPProviderIntegration: - """Integration-style tests for MCPProvider.""" - - def test_full_context_workflow(self): - """Test complete context building workflow.""" - # Create temp file - with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f: - f.write("def 
my_function(arg1, arg2):\n") - f.write(" '''This is my function.'''\n") - f.write(" return arg1 + arg2\n") - temp_path = f.name - - try: - # Setup mocks - mock_global_index = Mock() - mock_search_engine = Mock() - mock_registry = Mock() - - mock_symbol = Mock() - mock_symbol.name = "my_function" - mock_symbol.kind = "function" - mock_symbol.file = temp_path - mock_symbol.range = (1, 3) - - mock_ref = Mock() - mock_ref.file_path = "/user.py" - mock_ref.line = 10 - mock_ref.column = 4 - mock_ref.context = "result = my_function(1, 2)" - mock_ref.relationship_type = "call" - - mock_global_index.search.return_value = [mock_symbol] - mock_search_engine.search_references.return_value = [mock_ref] - - provider = MCPProvider(mock_global_index, mock_search_engine, mock_registry) - context = provider.build_context("my_function") - - assert context is not None - assert context.symbol.name == "my_function" - assert context.definition is not None - assert "def my_function" in context.definition - assert len(context.references) == 1 - assert context.references[0].relationship_type == "call" - - # Test serialization - json_str = context.to_json() - assert "my_function" in json_str - - # Test prompt injection - prompt = context.to_prompt_injection() - assert "" in prompt - assert "my_function" in prompt - assert "" in prompt - - finally: - os.unlink(temp_path) diff --git a/codex-lens/tests/mcp/test_schema.py b/codex-lens/tests/mcp/test_schema.py deleted file mode 100644 index e5914593..00000000 --- a/codex-lens/tests/mcp/test_schema.py +++ /dev/null @@ -1,288 +0,0 @@ -"""Tests for MCP schema.""" - -import pytest -import json - -from codexlens.mcp.schema import ( - MCPContext, - SymbolInfo, - ReferenceInfo, - RelatedSymbol, -) - - -class TestSymbolInfo: - """Test SymbolInfo dataclass.""" - - def test_to_dict_includes_all_fields(self): - """SymbolInfo.to_dict() includes all non-None fields.""" - info = SymbolInfo( - name="func", - kind="function", - file_path="/test.py", - 
line_start=10, - line_end=20, - signature="def func():", - documentation="Test doc", - ) - d = info.to_dict() - assert d["name"] == "func" - assert d["kind"] == "function" - assert d["file_path"] == "/test.py" - assert d["line_start"] == 10 - assert d["line_end"] == 20 - assert d["signature"] == "def func():" - assert d["documentation"] == "Test doc" - - def test_to_dict_excludes_none(self): - """SymbolInfo.to_dict() excludes None fields.""" - info = SymbolInfo( - name="func", - kind="function", - file_path="/test.py", - line_start=10, - line_end=20, - ) - d = info.to_dict() - assert "signature" not in d - assert "documentation" not in d - assert "name" in d - assert "kind" in d - - def test_basic_creation(self): - """SymbolInfo can be created with required fields only.""" - info = SymbolInfo( - name="MyClass", - kind="class", - file_path="/src/module.py", - line_start=1, - line_end=50, - ) - assert info.name == "MyClass" - assert info.kind == "class" - assert info.signature is None - assert info.documentation is None - - -class TestReferenceInfo: - """Test ReferenceInfo dataclass.""" - - def test_to_dict(self): - """ReferenceInfo.to_dict() returns all fields.""" - ref = ReferenceInfo( - file_path="/src/main.py", - line=25, - column=4, - context="result = func()", - relationship_type="call", - ) - d = ref.to_dict() - assert d["file_path"] == "/src/main.py" - assert d["line"] == 25 - assert d["column"] == 4 - assert d["context"] == "result = func()" - assert d["relationship_type"] == "call" - - def test_all_fields_required(self): - """ReferenceInfo requires all fields.""" - ref = ReferenceInfo( - file_path="/test.py", - line=10, - column=0, - context="import module", - relationship_type="import", - ) - assert ref.file_path == "/test.py" - assert ref.relationship_type == "import" - - -class TestRelatedSymbol: - """Test RelatedSymbol dataclass.""" - - def test_to_dict_includes_all_fields(self): - """RelatedSymbol.to_dict() includes all non-None fields.""" - sym = 
RelatedSymbol( - name="BaseClass", - kind="class", - relationship="inherits", - file_path="/src/base.py", - ) - d = sym.to_dict() - assert d["name"] == "BaseClass" - assert d["kind"] == "class" - assert d["relationship"] == "inherits" - assert d["file_path"] == "/src/base.py" - - def test_to_dict_excludes_none(self): - """RelatedSymbol.to_dict() excludes None file_path.""" - sym = RelatedSymbol( - name="helper", - kind="function", - relationship="calls", - ) - d = sym.to_dict() - assert "file_path" not in d - assert d["name"] == "helper" - assert d["relationship"] == "calls" - - -class TestMCPContext: - """Test MCPContext dataclass.""" - - def test_to_dict_basic(self): - """MCPContext.to_dict() returns basic structure.""" - ctx = MCPContext(context_type="test") - d = ctx.to_dict() - assert d["version"] == "1.0" - assert d["context_type"] == "test" - assert d["metadata"] == {} - - def test_to_dict_with_symbol(self): - """MCPContext.to_dict() includes symbol when present.""" - ctx = MCPContext( - context_type="test", - symbol=SymbolInfo("f", "function", "/t.py", 1, 2), - ) - d = ctx.to_dict() - assert "symbol" in d - assert d["symbol"]["name"] == "f" - assert d["symbol"]["kind"] == "function" - - def test_to_dict_with_references(self): - """MCPContext.to_dict() includes references when present.""" - ctx = MCPContext( - context_type="test", - references=[ - ReferenceInfo("/a.py", 10, 0, "call()", "call"), - ReferenceInfo("/b.py", 20, 5, "import x", "import"), - ], - ) - d = ctx.to_dict() - assert "references" in d - assert len(d["references"]) == 2 - assert d["references"][0]["line"] == 10 - - def test_to_dict_with_related_symbols(self): - """MCPContext.to_dict() includes related_symbols when present.""" - ctx = MCPContext( - context_type="test", - related_symbols=[ - RelatedSymbol("Base", "class", "inherits"), - RelatedSymbol("helper", "function", "calls"), - ], - ) - d = ctx.to_dict() - assert "related_symbols" in d - assert len(d["related_symbols"]) == 2 - - def 
test_to_json(self): - """MCPContext.to_json() returns valid JSON.""" - ctx = MCPContext(context_type="test") - j = ctx.to_json() - parsed = json.loads(j) - assert parsed["version"] == "1.0" - assert parsed["context_type"] == "test" - - def test_to_json_with_indent(self): - """MCPContext.to_json() respects indent parameter.""" - ctx = MCPContext(context_type="test") - j = ctx.to_json(indent=4) - # Check it's properly indented - assert " " in j - - def test_to_prompt_injection_basic(self): - """MCPContext.to_prompt_injection() returns formatted string.""" - ctx = MCPContext( - symbol=SymbolInfo("my_func", "function", "/test.py", 10, 20), - definition="def my_func(): pass", - ) - prompt = ctx.to_prompt_injection() - assert "" in prompt - assert "my_func" in prompt - assert "def my_func()" in prompt - assert "" in prompt - - def test_to_prompt_injection_with_references(self): - """MCPContext.to_prompt_injection() includes references.""" - ctx = MCPContext( - symbol=SymbolInfo("func", "function", "/test.py", 1, 5), - references=[ - ReferenceInfo("/a.py", 10, 0, "func()", "call"), - ReferenceInfo("/b.py", 20, 0, "from x import func", "import"), - ], - ) - prompt = ctx.to_prompt_injection() - assert "References (2 found)" in prompt - assert "/a.py:10" in prompt - assert "call" in prompt - - def test_to_prompt_injection_limits_references(self): - """MCPContext.to_prompt_injection() limits references to 5.""" - refs = [ - ReferenceInfo(f"/file{i}.py", i, 0, f"ref{i}", "call") - for i in range(10) - ] - ctx = MCPContext( - symbol=SymbolInfo("func", "function", "/test.py", 1, 5), - references=refs, - ) - prompt = ctx.to_prompt_injection() - # Should show "10 found" but only include 5 - assert "References (10 found)" in prompt - assert "/file0.py" in prompt - assert "/file4.py" in prompt - assert "/file5.py" not in prompt - - def test_to_prompt_injection_with_related_symbols(self): - """MCPContext.to_prompt_injection() includes related symbols.""" - ctx = MCPContext( - 
symbol=SymbolInfo("MyClass", "class", "/test.py", 1, 50), - related_symbols=[ - RelatedSymbol("BaseClass", "class", "inherits"), - RelatedSymbol("helper", "function", "calls"), - ], - ) - prompt = ctx.to_prompt_injection() - assert "Related Symbols" in prompt - assert "BaseClass (inherits)" in prompt - assert "helper (calls)" in prompt - - def test_to_prompt_injection_limits_related_symbols(self): - """MCPContext.to_prompt_injection() limits related symbols to 10.""" - related = [ - RelatedSymbol(f"sym{i}", "function", "calls") - for i in range(15) - ] - ctx = MCPContext( - symbol=SymbolInfo("func", "function", "/test.py", 1, 5), - related_symbols=related, - ) - prompt = ctx.to_prompt_injection() - assert "sym0 (calls)" in prompt - assert "sym9 (calls)" in prompt - assert "sym10 (calls)" not in prompt - - def test_empty_context(self): - """MCPContext works with minimal data.""" - ctx = MCPContext() - d = ctx.to_dict() - assert d["version"] == "1.0" - assert d["context_type"] == "code_context" - - prompt = ctx.to_prompt_injection() - assert "" in prompt - assert "" in prompt - - def test_metadata_preserved(self): - """MCPContext preserves custom metadata.""" - ctx = MCPContext( - context_type="custom", - metadata={ - "source": "codex-lens", - "indexed_at": "2024-01-01", - "custom_key": "custom_value", - }, - ) - d = ctx.to_dict() - assert d["metadata"]["source"] == "codex-lens" - assert d["metadata"]["custom_key"] == "custom_value" diff --git a/codex-lens/tests/parsers/__init__.py b/codex-lens/tests/parsers/__init__.py deleted file mode 100644 index 0e066f0e..00000000 --- a/codex-lens/tests/parsers/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Tests for codexlens.parsers modules.""" diff --git a/codex-lens/tests/parsers/test_astgrep_extraction.py b/codex-lens/tests/parsers/test_astgrep_extraction.py deleted file mode 100644 index 41f0d2ea..00000000 --- a/codex-lens/tests/parsers/test_astgrep_extraction.py +++ /dev/null @@ -1,444 +0,0 @@ -"""Tests for dedicated 
extraction methods: extract_inherits, extract_calls, extract_imports. - -Tests pattern-based relationship extraction from Python source code -using ast-grep-py bindings for INHERITS, CALL, and IMPORTS relationships. -""" - -from pathlib import Path - -import pytest - -from codexlens.parsers.astgrep_processor import ( - AstGrepPythonProcessor, - is_astgrep_processor_available, -) -from codexlens.entities import RelationshipType - - -# Check if ast-grep is available for conditional test skipping -ASTGREP_AVAILABLE = is_astgrep_processor_available() - - -@pytest.mark.skipif(not ASTGREP_AVAILABLE, reason="ast-grep-py not installed") -class TestExtractInherits: - """Tests for extract_inherits method - INHERITS relationship extraction.""" - - def test_single_inheritance(self): - """Test extraction of single inheritance relationship.""" - processor = AstGrepPythonProcessor() - code = """ -class Animal: - pass - -class Dog(Animal): - pass -""" - relationships = processor.extract_inherits(code, "test.py") - - assert len(relationships) == 1 - rel = relationships[0] - assert rel.source_symbol == "Dog" - assert rel.target_symbol == "Animal" - assert rel.relationship_type == RelationshipType.INHERITS - - def test_multiple_inheritance(self): - """Test extraction of multiple inheritance relationships.""" - processor = AstGrepPythonProcessor() - code = """ -class A: - pass - -class B: - pass - -class C(A, B): - pass -""" - relationships = processor.extract_inherits(code, "test.py") - - # Should have 2 relationships: C->A and C->B - assert len(relationships) == 2 - targets = {r.target_symbol for r in relationships} - assert "A" in targets - assert "B" in targets - for rel in relationships: - assert rel.source_symbol == "C" - - def test_no_inheritance(self): - """Test that classes without inheritance return empty list.""" - processor = AstGrepPythonProcessor() - code = """ -class Standalone: - pass -""" - relationships = processor.extract_inherits(code, "test.py") - - assert 
len(relationships) == 0 - - def test_nested_class_inheritance(self): - """Test extraction of inheritance in nested classes.""" - processor = AstGrepPythonProcessor() - code = """ -class Outer: - class Inner(Base): - pass -""" - relationships = processor.extract_inherits(code, "test.py") - - assert len(relationships) == 1 - assert relationships[0].source_symbol == "Inner" - assert relationships[0].target_symbol == "Base" - - def test_inheritance_with_complex_bases(self): - """Test extraction with generic or complex base classes.""" - processor = AstGrepPythonProcessor() - code = """ -class Service(BaseService, mixins.Loggable): - pass -""" - relationships = processor.extract_inherits(code, "test.py") - - assert len(relationships) == 2 - targets = {r.target_symbol for r in relationships} - assert "BaseService" in targets - assert "mixins.Loggable" in targets - - -@pytest.mark.skipif(not ASTGREP_AVAILABLE, reason="ast-grep-py not installed") -class TestExtractCalls: - """Tests for extract_calls method - CALL relationship extraction.""" - - def test_simple_function_call(self): - """Test extraction of simple function calls.""" - processor = AstGrepPythonProcessor() - code = """ -def main(): - print("hello") - len([1, 2, 3]) -""" - relationships = processor.extract_calls(code, "test.py", "main") - - targets = {r.target_symbol for r in relationships} - assert "print" in targets - assert "len" in targets - - def test_method_call(self): - """Test extraction of method calls.""" - processor = AstGrepPythonProcessor() - code = """ -def process(): - obj.method() - items.append(1) -""" - relationships = processor.extract_calls(code, "test.py", "process") - - targets = {r.target_symbol for r in relationships} - assert "obj.method" in targets - assert "items.append" in targets - - def test_skips_self_calls(self): - """Test that self.method() calls are filtered.""" - processor = AstGrepPythonProcessor() - code = """ -class Service: - def process(self): - self.internal() - 
external_func() -""" - relationships = processor.extract_calls(code, "test.py", "Service") - - targets = {r.target_symbol for r in relationships} - # self.internal should be filtered - assert "self.internal" not in targets - assert "internal" not in targets - assert "external_func" in targets - - def test_skips_cls_calls(self): - """Test that cls.method() calls are filtered.""" - processor = AstGrepPythonProcessor() - code = """ -class Factory: - @classmethod - def create(cls): - cls.helper() - other_func() -""" - relationships = processor.extract_calls(code, "test.py", "Factory") - - targets = {r.target_symbol for r in relationships} - assert "cls.helper" not in targets - assert "other_func" in targets - - def test_alias_resolution(self): - """Test call alias resolution using import map.""" - processor = AstGrepPythonProcessor() - code = """ -def main(): - np.array([1, 2, 3]) -""" - alias_map = {"np": "numpy"} - relationships = processor.extract_calls(code, "test.py", "main", alias_map) - - assert len(relationships) >= 1 - # Should resolve np.array to numpy.array - assert any("numpy.array" in r.target_symbol for r in relationships) - - def test_no_calls(self): - """Test that code without calls returns empty list.""" - processor = AstGrepPythonProcessor() - code = """ -x = 1 -y = x + 2 -""" - relationships = processor.extract_calls(code, "test.py") - - assert len(relationships) == 0 - - -@pytest.mark.skipif(not ASTGREP_AVAILABLE, reason="ast-grep-py not installed") -class TestExtractImports: - """Tests for extract_imports method - IMPORTS relationship extraction.""" - - def test_simple_import(self): - """Test extraction of simple import statements.""" - processor = AstGrepPythonProcessor() - code = "import os" - - relationships, alias_map = processor.extract_imports(code, "test.py") - - assert len(relationships) == 1 - assert relationships[0].target_symbol == "os" - assert relationships[0].relationship_type == RelationshipType.IMPORTS - assert alias_map.get("os") 
== "os" - - def test_import_with_alias(self): - """Test extraction of import with alias.""" - processor = AstGrepPythonProcessor() - code = "import numpy as np" - - relationships, alias_map = processor.extract_imports(code, "test.py") - - assert len(relationships) == 1 - assert relationships[0].target_symbol == "numpy" - assert alias_map.get("np") == "numpy" - - def test_from_import(self): - """Test extraction of from-import statements.""" - processor = AstGrepPythonProcessor() - code = "from typing import List, Dict" - - relationships, alias_map = processor.extract_imports(code, "test.py") - - assert len(relationships) == 1 - assert relationships[0].target_symbol == "typing" - assert alias_map.get("List") == "typing.List" - assert alias_map.get("Dict") == "typing.Dict" - - def test_from_import_with_alias(self): - """Test extraction of from-import with alias.""" - processor = AstGrepPythonProcessor() - code = "from collections import defaultdict as dd" - - relationships, alias_map = processor.extract_imports(code, "test.py") - - assert len(relationships) == 1 - # The alias map should map dd to collections.defaultcount - assert "dd" in alias_map - assert "defaultdict" in alias_map.get("dd", "") - - def test_star_import(self): - """Test extraction of star imports.""" - processor = AstGrepPythonProcessor() - code = "from module import *" - - relationships, alias_map = processor.extract_imports(code, "test.py") - - assert len(relationships) >= 1 - # Star import should be recorded - star_imports = [r for r in relationships if "*" in r.target_symbol] - assert len(star_imports) >= 1 - - def test_relative_import(self): - """Test extraction of relative imports.""" - processor = AstGrepPythonProcessor() - code = "from .utils import helper" - - relationships, alias_map = processor.extract_imports(code, "test.py") - - # Should capture the relative import - assert len(relationships) >= 1 - rel_imports = [r for r in relationships if r.target_symbol.startswith(".")] - assert 
len(rel_imports) >= 1 - - def test_multiple_imports(self): - """Test extraction of multiple import types.""" - processor = AstGrepPythonProcessor() - code = """ -import os -import sys -from typing import List -from collections import defaultdict as dd -""" - - relationships, alias_map = processor.extract_imports(code, "test.py") - - assert len(relationships) >= 4 - targets = {r.target_symbol for r in relationships} - assert "os" in targets - assert "sys" in targets - assert "typing" in targets - assert "collections" in targets - - def test_no_imports(self): - """Test that code without imports returns empty list.""" - processor = AstGrepPythonProcessor() - code = """ -x = 1 -def foo(): - pass -""" - relationships, alias_map = processor.extract_imports(code, "test.py") - - assert len(relationships) == 0 - assert len(alias_map) == 0 - - -@pytest.mark.skipif(not ASTGREP_AVAILABLE, reason="ast-grep-py not installed") -class TestExtractMethodsIntegration: - """Integration tests combining multiple extraction methods.""" - - def test_full_file_extraction(self): - """Test extracting all relationships from a complete file.""" - processor = AstGrepPythonProcessor() - code = """ -import os -from typing import List, Optional - -class Base: - pass - -class Service(Base): - def __init__(self): - self.data = [] - - def process(self): - result = os.path.join("a", "b") - items = List([1, 2, 3]) - return result - -def main(): - svc = Service() - svc.process() -""" - source_file = "test.py" - - # Extract all relationship types - imports, alias_map = processor.extract_imports(code, source_file) - inherits = processor.extract_inherits(code, source_file) - calls = processor.extract_calls(code, source_file, alias_map=alias_map) - - # Verify we got all expected relationships - assert len(imports) >= 2 # os and typing - assert len(inherits) == 1 # Service -> Base - assert len(calls) >= 2 # os.path.join and others - - # Verify inheritance - assert any(r.source_symbol == "Service" and 
r.target_symbol == "Base" - for r in inherits) - - def test_alias_propagation(self): - """Test that import aliases propagate to call resolution.""" - processor = AstGrepPythonProcessor() - code = """ -import numpy as np - -def compute(): - arr = np.array([1, 2, 3]) - return np.sum(arr) -""" - source_file = "test.py" - - imports, alias_map = processor.extract_imports(code, source_file) - calls = processor.extract_calls(code, source_file, alias_map=alias_map) - - # Alias map should have np -> numpy - assert alias_map.get("np") == "numpy" - - # Calls should resolve np.array and np.sum - resolved_targets = {r.target_symbol for r in calls} - # At minimum, np.array and np.sum should be captured - np_calls = [t for t in resolved_targets if "np" in t or "numpy" in t] - assert len(np_calls) >= 2 - - -class TestExtractMethodFallback: - """Tests for fallback behavior when ast-grep unavailable.""" - - def test_extract_inherits_empty_when_unavailable(self): - """Test extract_inherits returns empty list when unavailable.""" - processor = AstGrepPythonProcessor() - if not processor.is_available(): - code = "class Dog(Animal): pass" - relationships = processor.extract_inherits(code, "test.py") - assert relationships == [] - - def test_extract_calls_empty_when_unavailable(self): - """Test extract_calls returns empty list when unavailable.""" - processor = AstGrepPythonProcessor() - if not processor.is_available(): - code = "print('hello')" - relationships = processor.extract_calls(code, "test.py") - assert relationships == [] - - def test_extract_imports_empty_when_unavailable(self): - """Test extract_imports returns empty tuple when unavailable.""" - processor = AstGrepPythonProcessor() - if not processor.is_available(): - code = "import os" - relationships, alias_map = processor.extract_imports(code, "test.py") - assert relationships == [] - assert alias_map == {} - - -class TestHelperMethods: - """Tests for internal helper methods.""" - - def 
test_parse_base_classes_single(self): - """Test _parse_base_classes with single base.""" - processor = AstGrepPythonProcessor() - result = processor._parse_base_classes("BaseClass") - assert result == ["BaseClass"] - - def test_parse_base_classes_multiple(self): - """Test _parse_base_classes with multiple bases.""" - processor = AstGrepPythonProcessor() - result = processor._parse_base_classes("A, B, C") - assert result == ["A", "B", "C"] - - def test_parse_base_classes_with_generics(self): - """Test _parse_base_classes with generic types.""" - processor = AstGrepPythonProcessor() - result = processor._parse_base_classes("Generic[T], Mixin") - assert "Generic[T]" in result - assert "Mixin" in result - - def test_resolve_call_alias_simple(self): - """Test _resolve_call_alias with simple name.""" - processor = AstGrepPythonProcessor() - alias_map = {"np": "numpy"} - result = processor._resolve_call_alias("np", alias_map) - assert result == "numpy" - - def test_resolve_call_alias_qualified(self): - """Test _resolve_call_alias with qualified name.""" - processor = AstGrepPythonProcessor() - alias_map = {"np": "numpy"} - result = processor._resolve_call_alias("np.array", alias_map) - assert result == "numpy.array" - - def test_resolve_call_alias_no_match(self): - """Test _resolve_call_alias when no alias exists.""" - processor = AstGrepPythonProcessor() - alias_map = {} - result = processor._resolve_call_alias("myfunc", alias_map) - assert result == "myfunc" diff --git a/codex-lens/tests/parsers/test_astgrep_processor.py b/codex-lens/tests/parsers/test_astgrep_processor.py deleted file mode 100644 index ba10b83e..00000000 --- a/codex-lens/tests/parsers/test_astgrep_processor.py +++ /dev/null @@ -1,402 +0,0 @@ -"""Tests for AstGrepPythonProcessor. - -Tests pattern-based relationship extraction from Python source code -using ast-grep-py bindings. 
-""" - -from pathlib import Path - -import pytest - -from codexlens.parsers.astgrep_processor import ( - AstGrepPythonProcessor, - BaseAstGrepProcessor, - is_astgrep_processor_available, -) -from codexlens.parsers.patterns.python import ( - PYTHON_PATTERNS, - METAVARS, - RELATIONSHIP_PATTERNS, - get_pattern, - get_patterns_for_relationship, - get_metavar, -) - - -# Check if ast-grep is available for conditional test skipping -ASTGREP_AVAILABLE = is_astgrep_processor_available() - - -class TestPatternDefinitions: - """Tests for Python pattern definitions.""" - - def test_python_patterns_exist(self): - """Verify all expected patterns are defined.""" - expected_patterns = [ - "class_def", - "class_with_bases", - "func_def", - "async_func_def", - "import_stmt", - "import_from", - "call", - "method_call", - ] - for pattern_name in expected_patterns: - assert pattern_name in PYTHON_PATTERNS, f"Missing pattern: {pattern_name}" - - def test_get_pattern_returns_correct_pattern(self): - """Test get_pattern returns expected pattern strings.""" - # Note: ast-grep-py 0.40+ uses $$$ for zero-or-more multi-match - assert get_pattern("class_def") == "class $NAME $$$BODY" - assert get_pattern("func_def") == "def $NAME($$$PARAMS): $$$BODY" - assert get_pattern("import_stmt") == "import $MODULE" - - def test_get_pattern_raises_for_unknown(self): - """Test get_pattern raises KeyError for unknown patterns.""" - with pytest.raises(KeyError): - get_pattern("nonexistent_pattern") - - def test_metavars_defined(self): - """Verify metavariable mappings are defined.""" - expected_metavars = [ - "class_name", - "func_name", - "import_module", - "call_func", - ] - for var in expected_metavars: - assert var in METAVARS, f"Missing metavar: {var}" - - def test_get_metavar(self): - """Test get_metavar returns correct values.""" - assert get_metavar("class_name") == "NAME" - assert get_metavar("func_name") == "NAME" - assert get_metavar("import_module") == "MODULE" - - def 
test_relationship_patterns_mapping(self): - """Test relationship type to pattern mapping.""" - assert "class_with_bases" in get_patterns_for_relationship("inheritance") - assert "import_stmt" in get_patterns_for_relationship("imports") - assert "import_from" in get_patterns_for_relationship("imports") - assert "call" in get_patterns_for_relationship("calls") - - -class TestAstGrepPythonProcessorAvailability: - """Tests for processor availability.""" - - def test_is_available_returns_bool(self): - """Test is_available returns a boolean.""" - processor = AstGrepPythonProcessor() - assert isinstance(processor.is_available(), bool) - - def test_is_available_matches_global_check(self): - """Test is_available matches is_astgrep_processor_available.""" - processor = AstGrepPythonProcessor() - assert processor.is_available() == is_astgrep_processor_available() - - def test_module_level_check(self): - """Test module-level availability function.""" - assert isinstance(is_astgrep_processor_available(), bool) - - -@pytest.mark.skipif(not ASTGREP_AVAILABLE, reason="ast-grep-py not installed") -class TestAstGrepPythonProcessorParsing: - """Tests for Python parsing with ast-grep.""" - - def test_parse_simple_function(self): - """Test parsing a simple function definition.""" - processor = AstGrepPythonProcessor() - code = "def hello():\n pass" - result = processor.parse(code, Path("test.py")) - - assert result is not None - assert result.language == "python" - assert len(result.symbols) == 1 - assert result.symbols[0].name == "hello" - assert result.symbols[0].kind == "function" - - def test_parse_class(self): - """Test parsing a class definition.""" - processor = AstGrepPythonProcessor() - code = "class MyClass:\n pass" - result = processor.parse(code, Path("test.py")) - - assert result is not None - assert len(result.symbols) == 1 - assert result.symbols[0].name == "MyClass" - assert result.symbols[0].kind == "class" - - def test_parse_async_function(self): - """Test parsing an 
async function definition.""" - processor = AstGrepPythonProcessor() - code = "async def fetch_data():\n pass" - result = processor.parse(code, Path("test.py")) - - assert result is not None - assert len(result.symbols) == 1 - assert result.symbols[0].name == "fetch_data" - - def test_parse_class_with_inheritance(self): - """Test parsing class with inheritance.""" - processor = AstGrepPythonProcessor() - code = """ -class Base: - pass - -class Child(Base): - pass -""" - result = processor.parse(code, Path("test.py")) - - assert result is not None - names = [s.name for s in result.symbols] - assert "Base" in names - assert "Child" in names - - # Check inheritance relationship - inherits = [ - r for r in result.relationships - if r.relationship_type.value == "inherits" - ] - assert any(r.source_symbol == "Child" for r in inherits) - - def test_parse_imports(self): - """Test parsing import statements.""" - processor = AstGrepPythonProcessor() - code = """ -import os -from sys import path -""" - result = processor.parse(code, Path("test.py")) - - assert result is not None - imports = [ - r for r in result.relationships - if r.relationship_type.value == "imports" - ] - assert len(imports) >= 1 - targets = {r.target_symbol for r in imports} - assert "os" in targets - - def test_parse_function_calls(self): - """Test parsing function calls.""" - processor = AstGrepPythonProcessor() - code = """ -def main(): - print("hello") - len([1, 2, 3]) -""" - result = processor.parse(code, Path("test.py")) - - assert result is not None - calls = [ - r for r in result.relationships - if r.relationship_type.value == "calls" - ] - targets = {r.target_symbol for r in calls} - assert "print" in targets - assert "len" in targets - - def test_parse_empty_file(self): - """Test parsing an empty file.""" - processor = AstGrepPythonProcessor() - result = processor.parse("", Path("test.py")) - - assert result is not None - assert len(result.symbols) == 0 - - def 
test_parse_returns_indexed_file(self): - """Test that parse returns proper IndexedFile structure.""" - processor = AstGrepPythonProcessor() - code = "def test():\n pass" - result = processor.parse(code, Path("test.py")) - - assert result is not None - assert result.path.endswith("test.py") - assert result.language == "python" - assert isinstance(result.symbols, list) - assert isinstance(result.chunks, list) - assert isinstance(result.relationships, list) - - -@pytest.mark.skipif(not ASTGREP_AVAILABLE, reason="ast-grep-py not installed") -class TestAstGrepPythonProcessorRelationships: - """Tests for relationship extraction.""" - - def test_inheritance_extraction(self): - """Test extraction of inheritance relationships.""" - processor = AstGrepPythonProcessor() - code = """ -class Animal: - pass - -class Dog(Animal): - pass - -class Cat(Animal): - pass -""" - result = processor.parse(code, Path("test.py")) - - assert result is not None - inherits = [ - r for r in result.relationships - if r.relationship_type.value == "inherits" - ] - # Should have 2 inheritance relationships - assert len(inherits) >= 2 - sources = {r.source_symbol for r in inherits} - assert "Dog" in sources - assert "Cat" in sources - - def test_call_extraction_skips_self(self): - """Test that self.method() calls are filtered.""" - processor = AstGrepPythonProcessor() - code = """ -class Service: - def process(self): - self.internal() - external_call() - -def external_call(): - pass -""" - result = processor.parse(code, Path("test.py")) - - assert result is not None - calls = [ - r for r in result.relationships - if r.relationship_type.value == "calls" - ] - targets = {r.target_symbol for r in calls} - # self.internal should be filtered - assert "self.internal" not in targets - assert "external_call" in targets - - def test_import_with_alias_resolution(self): - """Test import alias resolution in calls.""" - processor = AstGrepPythonProcessor() - code = """ -import os.path as osp - -def main(): - 
osp.join("a", "b") -""" - result = processor.parse(code, Path("test.py")) - - assert result is not None - calls = [ - r for r in result.relationships - if r.relationship_type.value == "calls" - ] - targets = {r.target_symbol for r in calls} - # Should resolve osp to os.path - assert any("os.path" in t for t in targets) - - -@pytest.mark.skipif(not ASTGREP_AVAILABLE, reason="ast-grep-py not installed") -class TestAstGrepPythonProcessorRunAstGrep: - """Tests for run_ast_grep method.""" - - def test_run_ast_grep_returns_list(self): - """Test run_ast_grep returns a list.""" - processor = AstGrepPythonProcessor() - code = "def hello():\n pass" - processor._binding.parse(code) if processor._binding else None - - matches = processor.run_ast_grep(code, "def $NAME($$$PARAMS) $$$BODY") - assert isinstance(matches, list) - - def test_run_ast_grep_finds_matches(self): - """Test run_ast_grep finds expected matches.""" - processor = AstGrepPythonProcessor() - code = "def hello():\n pass" - - matches = processor.run_ast_grep(code, "def $NAME($$$PARAMS) $$$BODY") - assert len(matches) >= 1 - - def test_run_ast_grep_empty_code(self): - """Test run_ast_grep with empty code.""" - processor = AstGrepPythonProcessor() - matches = processor.run_ast_grep("", "def $NAME($$$PARAMS) $$$BODY") - assert matches == [] - - def test_run_ast_grep_no_matches(self): - """Test run_ast_grep when pattern doesn't match.""" - processor = AstGrepPythonProcessor() - code = "x = 1" - matches = processor.run_ast_grep(code, "class $NAME $$$BODY") - assert matches == [] - - -class TestAstGrepPythonProcessorFallback: - """Tests for fallback behavior when ast-grep unavailable.""" - - def test_parse_returns_none_when_unavailable(self): - """Test parse returns None when ast-grep unavailable.""" - # This test runs regardless of availability - # When unavailable, should gracefully return None - processor = AstGrepPythonProcessor() - if not processor.is_available(): - code = "def test():\n pass" - result = 
processor.parse(code, Path("test.py")) - assert result is None - - def test_run_ast_grep_empty_when_unavailable(self): - """Test run_ast_grep returns empty list when unavailable.""" - processor = AstGrepPythonProcessor() - if not processor.is_available(): - matches = processor.run_ast_grep("code", "pattern") - assert matches == [] - - -class TestBaseAstGrepProcessor: - """Tests for abstract base class.""" - - def test_cannot_instantiate_base_class(self): - """Test that BaseAstGrepProcessor cannot be instantiated directly.""" - with pytest.raises(TypeError): - BaseAstGrepProcessor("python") # type: ignore[abstract] - - def test_subclass_implements_abstract_methods(self): - """Test that AstGrepPythonProcessor implements all abstract methods.""" - processor = AstGrepPythonProcessor() - # Should have process_matches method - assert hasattr(processor, "process_matches") - # Should have parse method - assert hasattr(processor, "parse") - # Check methods are callable - assert callable(processor.process_matches) - assert callable(processor.parse) - - -class TestPatternIntegration: - """Tests for pattern module integration with processor.""" - - def test_processor_uses_pattern_module(self): - """Verify processor uses patterns from pattern module.""" - # The processor should import and use patterns from patterns/python/ - from codexlens.parsers.astgrep_processor import get_pattern - - # Verify pattern access works - assert get_pattern("class_def") is not None - assert get_pattern("func_def") is not None - - def test_pattern_consistency(self): - """Test pattern definitions are consistent.""" - # Patterns used by processor should exist in pattern module - patterns_needed = [ - "class_def", - "class_with_bases", - "func_def", - "async_func_def", - "import_stmt", - "import_from", - "call", - ] - for pattern_name in patterns_needed: - # Should not raise KeyError - pattern = get_pattern(pattern_name) - assert pattern is not None - assert len(pattern) > 0 diff --git 
a/codex-lens/tests/parsers/test_comparison.py b/codex-lens/tests/parsers/test_comparison.py deleted file mode 100644 index 9c9840a0..00000000 --- a/codex-lens/tests/parsers/test_comparison.py +++ /dev/null @@ -1,525 +0,0 @@ -"""Comparison tests for tree-sitter vs ast-grep Python relationship extraction. - -Validates that both parsers produce consistent output for Python relationship -extraction (INHERITS, CALL, IMPORTS). -""" - -from __future__ import annotations - -from pathlib import Path -from typing import List, Set, Tuple - -import pytest - -from codexlens.config import Config -from codexlens.entities import CodeRelationship, RelationshipType -from codexlens.parsers.treesitter_parser import TreeSitterSymbolParser - - -# Sample Python code for testing relationship extraction -SAMPLE_PYTHON_CODE = ''' -"""Module docstring.""" -import os -import sys -from typing import List, Dict, Optional -from collections import defaultdict as dd -from pathlib import Path as PPath - -class BaseClass: - """Base class.""" - - def base_method(self): - pass - - def another_method(self): - return self.base_method() - - -class Mixin: - """Mixin class.""" - - def mixin_func(self): - return "mixin" - - -class ChildClass(BaseClass, Mixin): - """Child class with multiple inheritance.""" - - def __init__(self): - super().__init__() - self.data = dd(list) - - def process(self, items: List[str]) -> Dict[str, int]: - result = {} - for item in items: - result[item] = len(item) - return result - - def call_external(self, path: str) -> Optional[str]: - p = PPath(path) - if p.exists(): - return str(p.read_text()) - return None - - -def standalone_function(): - """Standalone function.""" - data = [1, 2, 3] - return sum(data) - - -async def async_function(): - """Async function.""" - import asyncio - await asyncio.sleep(1) -''' - - -def relationship_to_tuple(rel: CodeRelationship) -> Tuple[str, str, str, int]: - """Convert relationship to a comparable tuple. 
- - Returns: - (source_symbol, target_symbol, relationship_type, source_line) - """ - return ( - rel.source_symbol, - rel.target_symbol, - rel.relationship_type.value, - rel.source_line, - ) - - -def extract_relationship_tuples( - relationships: List[CodeRelationship], -) -> Set[Tuple[str, str, str]]: - """Extract relationship tuples without line numbers for comparison. - - Returns: - Set of (source_symbol, target_symbol, relationship_type) tuples - """ - return { - (rel.source_symbol, rel.target_symbol, rel.relationship_type.value) - for rel in relationships - } - - -def filter_by_type( - relationships: List[CodeRelationship], - rel_type: RelationshipType, -) -> List[CodeRelationship]: - """Filter relationships by type.""" - return [r for r in relationships if r.relationship_type == rel_type] - - -class TestTreeSitterVsAstGrepComparison: - """Compare tree-sitter and ast-grep Python relationship extraction.""" - - @pytest.fixture - def sample_path(self, tmp_path: Path) -> Path: - """Create a temporary Python file with sample code.""" - py_file = tmp_path / "sample.py" - py_file.write_text(SAMPLE_PYTHON_CODE) - return py_file - - @pytest.fixture - def ts_parser_default(self) -> TreeSitterSymbolParser: - """Create tree-sitter parser with default config (use_astgrep=False).""" - config = Config() - assert config.use_astgrep is False - return TreeSitterSymbolParser("python", config=config) - - @pytest.fixture - def ts_parser_astgrep(self) -> TreeSitterSymbolParser: - """Create tree-sitter parser with ast-grep enabled.""" - config = Config() - config.use_astgrep = True - return TreeSitterSymbolParser("python", config=config) - - def test_parser_availability(self, ts_parser_default: TreeSitterSymbolParser) -> None: - """Test that tree-sitter parser is available.""" - assert ts_parser_default.is_available() - - def test_astgrep_processor_initialization( - self, ts_parser_astgrep: TreeSitterSymbolParser - ) -> None: - """Test that ast-grep processor is initialized when 
config enables it.""" - # The processor should be initialized (may be None if ast-grep-py not installed) - # This test just verifies the initialization path works - assert ts_parser_astgrep._config is not None - assert ts_parser_astgrep._config.use_astgrep is True - - def _skip_if_astgrep_unavailable( - self, ts_parser_astgrep: TreeSitterSymbolParser - ) -> None: - """Skip test if ast-grep is not available.""" - if ts_parser_astgrep._astgrep_processor is None: - pytest.skip("ast-grep-py not installed") - - def test_parse_returns_valid_result( - self, - ts_parser_default: TreeSitterSymbolParser, - sample_path: Path, - ) -> None: - """Test that parsing returns a valid IndexedFile.""" - source_code = sample_path.read_text() - result = ts_parser_default.parse(source_code, sample_path) - - assert result is not None - assert result.language == "python" - assert len(result.symbols) > 0 - assert len(result.relationships) > 0 - - def test_extracted_symbols_match( - self, - ts_parser_default: TreeSitterSymbolParser, - ts_parser_astgrep: TreeSitterSymbolParser, - sample_path: Path, - ) -> None: - """Test that both parsers extract similar symbols.""" - self._skip_if_astgrep_unavailable(ts_parser_astgrep) - - source_code = sample_path.read_text() - - result_ts = ts_parser_default.parse(source_code, sample_path) - result_astgrep = ts_parser_astgrep.parse(source_code, sample_path) - - assert result_ts is not None - assert result_astgrep is not None - - # Compare symbol names - ts_symbols = {s.name for s in result_ts.symbols} - astgrep_symbols = {s.name for s in result_astgrep.symbols} - - # Should have the same symbols (classes, functions, methods) - assert ts_symbols == astgrep_symbols - - def test_inheritance_relationships( - self, - ts_parser_default: TreeSitterSymbolParser, - ts_parser_astgrep: TreeSitterSymbolParser, - sample_path: Path, - ) -> None: - """Test INHERITS relationship extraction consistency.""" - self._skip_if_astgrep_unavailable(ts_parser_astgrep) - - 
source_code = sample_path.read_text() - - result_ts = ts_parser_default.parse(source_code, sample_path) - result_astgrep = ts_parser_astgrep.parse(source_code, sample_path) - - assert result_ts is not None - assert result_astgrep is not None - - # Extract inheritance relationships - ts_inherits = filter_by_type(result_ts.relationships, RelationshipType.INHERITS) - astgrep_inherits = filter_by_type( - result_astgrep.relationships, RelationshipType.INHERITS - ) - - ts_tuples = extract_relationship_tuples(ts_inherits) - astgrep_tuples = extract_relationship_tuples(astgrep_inherits) - - # Both should detect ChildClass(BaseClass, Mixin) - assert ts_tuples == astgrep_tuples - - # Verify specific inheritance relationships - expected_inherits = { - ("ChildClass", "BaseClass", "inherits"), - ("ChildClass", "Mixin", "inherits"), - } - assert ts_tuples == expected_inherits - - def test_import_relationships( - self, - ts_parser_default: TreeSitterSymbolParser, - ts_parser_astgrep: TreeSitterSymbolParser, - sample_path: Path, - ) -> None: - """Test IMPORTS relationship extraction consistency.""" - self._skip_if_astgrep_unavailable(ts_parser_astgrep) - - source_code = sample_path.read_text() - - result_ts = ts_parser_default.parse(source_code, sample_path) - result_astgrep = ts_parser_astgrep.parse(source_code, sample_path) - - assert result_ts is not None - assert result_astgrep is not None - - # Extract import relationships - ts_imports = filter_by_type(result_ts.relationships, RelationshipType.IMPORTS) - astgrep_imports = filter_by_type( - result_astgrep.relationships, RelationshipType.IMPORTS - ) - - ts_tuples = extract_relationship_tuples(ts_imports) - astgrep_tuples = extract_relationship_tuples(astgrep_imports) - - # Compare - should be similar (may differ in exact module representation) - # At minimum, both should detect the top-level imports - ts_modules = {t[1].split(".")[0] for t in ts_tuples} - astgrep_modules = {t[1].split(".")[0] for t in astgrep_tuples} - - # 
Should have imports from: os, sys, typing, collections, pathlib - expected_modules = {"os", "sys", "typing", "collections", "pathlib", "asyncio"} - assert ts_modules >= expected_modules or astgrep_modules >= expected_modules - - def test_call_relationships( - self, - ts_parser_default: TreeSitterSymbolParser, - ts_parser_astgrep: TreeSitterSymbolParser, - sample_path: Path, - ) -> None: - """Test CALL relationship extraction consistency.""" - self._skip_if_astgrep_unavailable(ts_parser_astgrep) - - source_code = sample_path.read_text() - - result_ts = ts_parser_default.parse(source_code, sample_path) - result_astgrep = ts_parser_astgrep.parse(source_code, sample_path) - - assert result_ts is not None - assert result_astgrep is not None - - # Extract call relationships - ts_calls = filter_by_type(result_ts.relationships, RelationshipType.CALL) - astgrep_calls = filter_by_type( - result_astgrep.relationships, RelationshipType.CALL - ) - - # Calls may differ due to scope tracking differences - # Just verify both parsers find call relationships - assert len(ts_calls) > 0 - assert len(astgrep_calls) > 0 - - # Verify specific calls that should be detected - ts_call_targets = {r.target_symbol for r in ts_calls} - astgrep_call_targets = {r.target_symbol for r in astgrep_calls} - - # Both should detect at least some common calls - # (exact match not required due to scope tracking differences) - common_targets = ts_call_targets & astgrep_call_targets - assert len(common_targets) > 0 - - def test_relationship_count_similarity( - self, - ts_parser_default: TreeSitterSymbolParser, - ts_parser_astgrep: TreeSitterSymbolParser, - sample_path: Path, - ) -> None: - """Test that relationship counts are similar (>95% consistency).""" - self._skip_if_astgrep_unavailable(ts_parser_astgrep) - - source_code = sample_path.read_text() - - result_ts = ts_parser_default.parse(source_code, sample_path) - result_astgrep = ts_parser_astgrep.parse(source_code, sample_path) - - assert result_ts is 
not None - assert result_astgrep is not None - - ts_count = len(result_ts.relationships) - astgrep_count = len(result_astgrep.relationships) - - # Calculate consistency percentage - if max(ts_count, astgrep_count) == 0: - consistency = 100.0 - else: - consistency = ( - min(ts_count, astgrep_count) / max(ts_count, astgrep_count) * 100 - ) - - # Require >95% consistency - assert consistency >= 95.0, ( - f"Relationship consistency {consistency:.1f}% below 95% threshold " - f"(tree-sitter: {ts_count}, ast-grep: {astgrep_count})" - ) - - def test_config_switch_affects_parser( - self, sample_path: Path - ) -> None: - """Test that config.use_astgrep affects which parser is used.""" - config_default = Config() - config_astgrep = Config() - config_astgrep.use_astgrep = True - - parser_default = TreeSitterSymbolParser("python", config=config_default) - parser_astgrep = TreeSitterSymbolParser("python", config=config_astgrep) - - # Default parser should not have ast-grep processor - assert parser_default._astgrep_processor is None - - # Ast-grep parser may have processor if ast-grep-py is installed - # (could be None if not installed, which is fine) - if parser_astgrep._astgrep_processor is not None: - # If available, verify it's the right type - from codexlens.parsers.astgrep_processor import AstGrepPythonProcessor - - assert isinstance( - parser_astgrep._astgrep_processor, AstGrepPythonProcessor - ) - - def test_fallback_to_treesitter_on_astgrep_failure( - self, - ts_parser_astgrep: TreeSitterSymbolParser, - sample_path: Path, - ) -> None: - """Test that parser falls back to tree-sitter if ast-grep fails.""" - source_code = sample_path.read_text() - - # Even with use_astgrep=True, should get valid results - result = ts_parser_astgrep.parse(source_code, sample_path) - - # Should always return a valid result (either from ast-grep or tree-sitter fallback) - assert result is not None - assert result.language == "python" - assert len(result.relationships) > 0 - - -class 
TestSimpleCodeSamples: - """Test with simple code samples for precise comparison.""" - - def test_simple_inheritance(self) -> None: - """Test simple single inheritance.""" - code = """ -class Parent: - pass - -class Child(Parent): - pass -""" - self._compare_parsers(code, expected_inherits={("Child", "Parent")}) - - def test_multiple_inheritance(self) -> None: - """Test multiple inheritance.""" - code = """ -class A: - pass - -class B: - pass - -class C(A, B): - pass -""" - self._compare_parsers( - code, expected_inherits={("C", "A"), ("C", "B")} - ) - - def test_simple_imports(self) -> None: - """Test simple import statements.""" - code = """ -import os -import sys -""" - config_ts = Config() - config_ag = Config() - config_ag.use_astgrep = True - - parser_ts = TreeSitterSymbolParser("python", config=config_ts) - parser_ag = TreeSitterSymbolParser("python", config=config_ag) - - tmp_path = Path("test.py") - result_ts = parser_ts.parse(code, tmp_path) - result_ag = parser_ag.parse(code, tmp_path) - - assert result_ts is not None - # ast-grep result may be None if not installed - - if result_ag is not None: - ts_imports = { - r.target_symbol - for r in result_ts.relationships - if r.relationship_type == RelationshipType.IMPORTS - } - ag_imports = { - r.target_symbol - for r in result_ag.relationships - if r.relationship_type == RelationshipType.IMPORTS - } - assert ts_imports == ag_imports - - def test_imports_inside_function(self) -> None: - """Test simple import inside a function scope is recorded. - - Note: module-level imports are recorded under a synthetic "" scope. - This test ensures imports inside a function scope are also recorded. 
- """ - code = """ -def my_function(): - import collections - return collections -""" - config_ts = Config() - config_ag = Config() - config_ag.use_astgrep = True - - parser_ts = TreeSitterSymbolParser("python", config=config_ts) - parser_ag = TreeSitterSymbolParser("python", config=config_ag) - - tmp_path = Path("test.py") - result_ts = parser_ts.parse(code, tmp_path) - result_ag = parser_ag.parse(code, tmp_path) - - assert result_ts is not None - - # Get import relationship targets - ts_imports = [ - r.target_symbol - for r in result_ts.relationships - if r.relationship_type == RelationshipType.IMPORTS - ] - - # Should have collections - ts_has_collections = any("collections" in t for t in ts_imports) - assert ts_has_collections, f"Expected collections import, got: {ts_imports}" - - # If ast-grep is available, verify it also finds the imports - if result_ag is not None: - ag_imports = [ - r.target_symbol - for r in result_ag.relationships - if r.relationship_type == RelationshipType.IMPORTS - ] - ag_has_collections = any("collections" in t for t in ag_imports) - assert ag_has_collections, f"Expected collections import in ast-grep, got: {ag_imports}" - - def _compare_parsers( - self, - code: str, - expected_inherits: Set[Tuple[str, str]], - ) -> None: - """Helper to compare parser outputs for inheritance.""" - config_ts = Config() - config_ag = Config() - config_ag.use_astgrep = True - - parser_ts = TreeSitterSymbolParser("python", config=config_ts) - parser_ag = TreeSitterSymbolParser("python", config=config_ag) - - tmp_path = Path("test.py") - result_ts = parser_ts.parse(code, tmp_path) - - assert result_ts is not None - - # Verify tree-sitter finds expected inheritance - ts_inherits = { - (r.source_symbol, r.target_symbol) - for r in result_ts.relationships - if r.relationship_type == RelationshipType.INHERITS - } - assert ts_inherits == expected_inherits - - # If ast-grep is available, verify it matches - result_ag = parser_ag.parse(code, tmp_path) - if 
result_ag is not None: - ag_inherits = { - (r.source_symbol, r.target_symbol) - for r in result_ag.relationships - if r.relationship_type == RelationshipType.INHERITS - } - assert ag_inherits == expected_inherits - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) diff --git a/codex-lens/tests/parsers/test_comparison_js_ts.py b/codex-lens/tests/parsers/test_comparison_js_ts.py deleted file mode 100644 index a5a1d1b6..00000000 --- a/codex-lens/tests/parsers/test_comparison_js_ts.py +++ /dev/null @@ -1,150 +0,0 @@ -"""Comparison tests for tree-sitter vs ast-grep JS/TS relationship extraction. - -These tests focus on stable, high-signal relationship types used by the -static graph pipeline: -- IMPORTS -- INHERITS - -If ast-grep-py is not installed, tests are skipped. -""" - -from __future__ import annotations - -from pathlib import Path -from typing import List, Set, Tuple - -import pytest - -from codexlens.config import Config -from codexlens.entities import CodeRelationship, RelationshipType -from codexlens.parsers.treesitter_parser import TreeSitterSymbolParser - - -SAMPLE_JS_CODE = """ -import React, { useEffect as useEf } from "react"; -import { foo } from "./foo"; -import "./styles.css"; -const fs = require("fs"); - -class Base {} -class Child extends Base { - method() { - console.log("hi"); - } -} -""" - - -SAMPLE_TS_CODE = """ -import type { Foo } from "pkg"; -import { bar as baz } from "./bar"; - -interface MyInterface extends Foo {} - -class Base {} -class Child extends Base {} -""" - - -def extract_relationship_tuples( - relationships: List[CodeRelationship], - *, - only_types: Set[RelationshipType], -) -> Set[Tuple[str, str, str]]: - return { - (rel.source_symbol, rel.target_symbol, rel.relationship_type.value) - for rel in relationships - if rel.relationship_type in only_types - } - - -def _skip_if_astgrep_unavailable(parser: TreeSitterSymbolParser) -> None: - if parser._astgrep_processor is None or not 
parser._astgrep_processor.is_available(): # type: ignore[attr-defined] - pytest.skip("ast-grep-py not installed or language not supported") - - -def test_js_imports_and_inherits_match(tmp_path: Path) -> None: - js_file = tmp_path / "sample.js" - js_file.write_text(SAMPLE_JS_CODE, encoding="utf-8") - source = js_file.read_text(encoding="utf-8") - - config_default = Config() - config_default.use_astgrep = False - ts_default = TreeSitterSymbolParser("javascript", js_file, config=config_default) - - config_ast = Config() - config_ast.use_astgrep = True - ts_ast = TreeSitterSymbolParser("javascript", js_file, config=config_ast) - _skip_if_astgrep_unavailable(ts_ast) - - result_ts = ts_default.parse(source, js_file) - result_ast = ts_ast.parse(source, js_file) - - assert result_ts is not None - assert result_ast is not None - - ts_imports = extract_relationship_tuples( - result_ts.relationships, - only_types={RelationshipType.IMPORTS}, - ) - ast_imports = extract_relationship_tuples( - result_ast.relationships, - only_types={RelationshipType.IMPORTS}, - ) - assert ast_imports == ts_imports - - ts_inherits = extract_relationship_tuples( - result_ts.relationships, - only_types={RelationshipType.INHERITS}, - ) - ast_inherits = extract_relationship_tuples( - result_ast.relationships, - only_types={RelationshipType.INHERITS}, - ) - # Ast-grep may include inheritance edges that the tree-sitter extractor does not currently emit. 
- assert ts_inherits.issubset(ast_inherits) - assert ("Child", "Base", "inherits") in ast_inherits - - -def test_ts_imports_match_and_inherits_superset(tmp_path: Path) -> None: - ts_file = tmp_path / "sample.ts" - ts_file.write_text(SAMPLE_TS_CODE, encoding="utf-8") - source = ts_file.read_text(encoding="utf-8") - - config_default = Config() - config_default.use_astgrep = False - ts_default = TreeSitterSymbolParser("typescript", ts_file, config=config_default) - - config_ast = Config() - config_ast.use_astgrep = True - ts_ast = TreeSitterSymbolParser("typescript", ts_file, config=config_ast) - _skip_if_astgrep_unavailable(ts_ast) - - result_ts = ts_default.parse(source, ts_file) - result_ast = ts_ast.parse(source, ts_file) - - assert result_ts is not None - assert result_ast is not None - - ts_imports = extract_relationship_tuples( - result_ts.relationships, - only_types={RelationshipType.IMPORTS}, - ) - ast_imports = extract_relationship_tuples( - result_ast.relationships, - only_types={RelationshipType.IMPORTS}, - ) - assert ast_imports == ts_imports - - ts_inherits = extract_relationship_tuples( - result_ts.relationships, - only_types={RelationshipType.INHERITS}, - ) - ast_inherits = extract_relationship_tuples( - result_ast.relationships, - only_types={RelationshipType.INHERITS}, - ) - # Ast-grep may include additional TypeScript inheritance edges (e.g., interface extends). - assert ts_inherits.issubset(ast_inherits) - # But at minimum, class inheritance should be present. - assert ("Child", "Base", "inherits") in ast_inherits diff --git a/codex-lens/tests/real/__init__.py b/codex-lens/tests/real/__init__.py deleted file mode 100644 index da6c5ff3..00000000 --- a/codex-lens/tests/real/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -"""Real interface tests for LSP integration. - -These tests require VSCode Bridge to be running. -See test_lsp_real_interface.py for details. 
-""" diff --git a/codex-lens/tests/real/comparison_test.py b/codex-lens/tests/real/comparison_test.py deleted file mode 100644 index da19a601..00000000 --- a/codex-lens/tests/real/comparison_test.py +++ /dev/null @@ -1,162 +0,0 @@ -#!/usr/bin/env python -"""Direct comparison: standalone manager vs direct subprocess.""" - -import asyncio -import json -import sys -from pathlib import Path - -sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src")) - -async def test_direct(): - """Direct subprocess test that WORKS.""" - print("\n=== DIRECT SUBPROCESS TEST ===") - - process = await asyncio.create_subprocess_exec( - 'pyright-langserver', '--stdio', - stdin=asyncio.subprocess.PIPE, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - cwd=str(Path(__file__).parent.parent.parent), - ) - - def encode(msg): - body = json.dumps(msg).encode('utf-8') - header = f'Content-Length: {len(body)}\r\n\r\n'.encode('ascii') - return header + body - - async def read_message(timeout=5.0): - content_length = 0 - while True: - try: - line = await asyncio.wait_for(process.stdout.readline(), timeout=timeout) - except asyncio.TimeoutError: - return None - if not line: - return None - line_str = line.decode('ascii').strip() - if not line_str: - break - if line_str.lower().startswith('content-length:'): - content_length = int(line_str.split(':')[1].strip()) - if content_length == 0: - return None - body = await process.stdout.readexactly(content_length) - return json.loads(body.decode('utf-8')) - - # Initialize - init = { - 'jsonrpc': '2.0', 'id': 1, 'method': 'initialize', - 'params': { - 'processId': 12345, - 'rootUri': 'file:///D:/Claude_dms3/codex-lens', - 'rootPath': 'D:/Claude_dms3/codex-lens', - 'capabilities': { - 'textDocument': { - 'synchronization': {'dynamicRegistration': False}, - 'documentSymbol': {'hierarchicalDocumentSymbolSupport': True}, - }, - 'workspace': {'configuration': True, 'workspaceFolders': True}, - }, - 'workspaceFolders': [{'uri': 
'file:///D:/Claude_dms3/codex-lens', 'name': 'codex-lens'}], - 'initializationOptions': {}, - } - } - process.stdin.write(encode(init)) - await process.stdin.drain() - - while True: - msg = await read_message(5.0) - if msg is None or msg.get('id') == 1: - print(f" Got initialize response") - break - - # Initialized - process.stdin.write(encode({'jsonrpc': '2.0', 'method': 'initialized', 'params': {}})) - await process.stdin.drain() - print(" Sent initialized") - - # didOpen with simple content - did_open = { - 'jsonrpc': '2.0', 'method': 'textDocument/didOpen', - 'params': { - 'textDocument': { - 'uri': 'file:///D:/Claude_dms3/codex-lens/simple.py', - 'languageId': 'python', - 'version': 1, - 'text': 'def hello():\n pass\n' - } - } - } - process.stdin.write(encode(did_open)) - await process.stdin.drain() - print(" Sent didOpen") - - # Read and respond to configuration requests - print(" Waiting for messages...") - for i in range(15): - msg = await read_message(2.0) - if msg is None: - continue - method = msg.get('method') - print(f" RECV: id={msg.get('id')}, method={method}") - if method == 'workspace/configuration': - process.stdin.write(encode({'jsonrpc': '2.0', 'id': msg['id'], 'result': [{}]})) - await process.stdin.drain() - if method == 'textDocument/publishDiagnostics': - break - - # documentSymbol - doc_sym = { - 'jsonrpc': '2.0', 'id': 2, 'method': 'textDocument/documentSymbol', - 'params': {'textDocument': {'uri': 'file:///D:/Claude_dms3/codex-lens/simple.py'}} - } - process.stdin.write(encode(doc_sym)) - await process.stdin.drain() - print(" Sent documentSymbol") - - for i in range(5): - msg = await read_message(3.0) - if msg is None: - continue - if msg.get('id') == 2: - result = msg.get('result', []) - print(f" GOT {len(result)} SYMBOLS!") - break - - process.terminate() - await process.wait() - - -async def test_manager(): - """Standalone manager test that FAILS.""" - print("\n=== STANDALONE MANAGER TEST ===") - - from codexlens.lsp.standalone_manager 
import StandaloneLspManager - - workspace = Path(__file__).parent.parent.parent - manager = StandaloneLspManager( - workspace_root=str(workspace), - timeout=30.0 - ) - - await manager.start() - - simple_file = workspace / "simple.py" - simple_file.write_text('def hello():\n pass\n') - - try: - symbols = await manager.get_document_symbols(str(simple_file)) - print(f" GOT {len(symbols)} SYMBOLS!") - finally: - simple_file.unlink(missing_ok=True) - await manager.stop() - - -async def main(): - await test_direct() - await test_manager() - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/codex-lens/tests/real/concurrent_test.py b/codex-lens/tests/real/concurrent_test.py deleted file mode 100644 index 08ba4162..00000000 --- a/codex-lens/tests/real/concurrent_test.py +++ /dev/null @@ -1,63 +0,0 @@ -#!/usr/bin/env python -"""Test concurrent read loop behavior.""" - -import asyncio -import sys -from pathlib import Path - -sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src")) - -import logging -logging.basicConfig(level=logging.DEBUG, format='%(name)s - %(levelname)s - %(message)s') - -from codexlens.lsp.standalone_manager import StandaloneLspManager - -async def test(): - workspace = Path(__file__).parent.parent.parent - manager = StandaloneLspManager( - workspace_root=str(workspace), - timeout=30.0 - ) - - await manager.start() - - # Get server for a simple file - simple_content = "def hello():\n pass\n" - simple_file = workspace / "test_simple.py" - simple_file.write_text(simple_content) - - try: - print("\n=== Getting server ===") - state = await manager._get_server(str(simple_file)) - print(f"Server state: initialized={state.initialized if state else 'None'}") - - print("\n=== Sending didOpen ===") - await manager._send_notification(state, "textDocument/didOpen", { - "textDocument": { - "uri": simple_file.as_uri(), - "languageId": "python", - "version": 1, - "text": simple_content, - } - }) - - print("\n=== Waiting 5 seconds - watch for 
server requests ===") - for i in range(5): - print(f" Tick {i+1}...") - await asyncio.sleep(1.0) - - print("\n=== Sending documentSymbol ===") - result = await manager._send_request( - state, - "textDocument/documentSymbol", - {"textDocument": {"uri": simple_file.as_uri()}}, - timeout=10.0 - ) - print(f"Result: {result}") - - finally: - simple_file.unlink(missing_ok=True) - await manager.stop() - -if __name__ == "__main__": - asyncio.run(test()) diff --git a/codex-lens/tests/real/debug_compare.py b/codex-lens/tests/real/debug_compare.py deleted file mode 100644 index 77f3b022..00000000 --- a/codex-lens/tests/real/debug_compare.py +++ /dev/null @@ -1,149 +0,0 @@ -#!/usr/bin/env python -"""Compare manager read behavior vs direct read.""" - -import asyncio -import json -import sys -from pathlib import Path - -sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src")) - -from codexlens.lsp.standalone_manager import StandaloneLspManager - - -async def direct_test(): - """Direct communication - this works.""" - workspace = Path(__file__).parent.parent.parent - print("\n=== DIRECT TEST ===") - - process = await asyncio.create_subprocess_exec( - "pyright-langserver", "--stdio", - stdin=asyncio.subprocess.PIPE, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - cwd=str(workspace), - ) - - def encode_message(content): - body = json.dumps(content).encode("utf-8") - header = f"Content-Length: {len(body)}\r\n\r\n" - return header.encode("ascii") + body - - async def send(message): - encoded = encode_message(message) - process.stdin.write(encoded) - await process.stdin.drain() - msg_desc = message.get('method') or f"response id={message.get('id')}" - print(f" SENT: {msg_desc}") - - async def read_one(): - content_length = 0 - while True: - line = await asyncio.wait_for(process.stdout.readline(), timeout=3.0) - if not line: - return None - line_str = line.decode("ascii").strip() - if not line_str: - break - if 
line_str.lower().startswith("content-length:"): - content_length = int(line_str.split(":")[1].strip()) - if content_length == 0: - return None - body = await process.stdout.readexactly(content_length) - return json.loads(body.decode("utf-8")) - - # Initialize - print(" Sending initialize...") - await send({ - "jsonrpc": "2.0", "id": 1, "method": "initialize", - "params": { - "processId": None, - "rootUri": workspace.as_uri(), - "capabilities": {"workspace": {"configuration": True}}, - "workspaceFolders": [{"uri": workspace.as_uri(), "name": workspace.name}], - }, - }) - - # Read until response - while True: - msg = await read_one() - if msg and msg.get("id") == 1: - print(f" Initialize response OK") - break - elif msg: - print(f" Notification: {msg.get('method')}") - - # Send initialized - print(" Sending initialized...") - await send({"jsonrpc": "2.0", "method": "initialized", "params": {}}) - - # Check for workspace/configuration - print(" Checking for workspace/configuration (3s timeout)...") - try: - for i in range(10): - msg = await read_one() - if msg: - method = msg.get("method") - msg_id = msg.get("id") - print(f" RECV: {method or 'response'} (id={msg_id})") - if method == "workspace/configuration": - print(" SUCCESS: workspace/configuration received!") - break - except asyncio.TimeoutError: - print(" TIMEOUT: No more messages") - - process.terminate() - await process.wait() - - -async def manager_test(): - """Manager communication - investigating why this doesn't work.""" - workspace = Path(__file__).parent.parent.parent - print("\n=== MANAGER TEST ===") - - manager = StandaloneLspManager( - workspace_root=str(workspace), - timeout=60.0 - ) - await manager.start() - - # Just check if server initialized - state = manager._servers.get("python") - if state: - print(f" Server initialized: {state.initialized}") - print(f" Capabilities: {len(state.capabilities)} keys") - else: - # Force initialization by getting server for a Python file - print(" Getting server 
for Python file...") - test_file = workspace / "tests" / "real" / "debug_compare.py" - state = await manager._get_server(str(test_file)) - if state: - print(f" Server initialized: {state.initialized}") - - # Try to read directly from state.reader - if state: - print("\n Direct read test from state.reader:") - print(f" state.reader is: {type(state.reader)}") - print(f" state.reader at_eof: {state.reader.at_eof()}") - - # Check if there's data available - try: - line = await asyncio.wait_for(state.reader.readline(), timeout=1.0) - if line: - print(f" Got line: {line[:50]}...") - else: - print(f" readline returned empty (EOF)") - except asyncio.TimeoutError: - print(f" readline timed out (no data)") - - await manager.stop() - - -async def main(): - await direct_test() - await manager_test() - print("\n=== DONE ===") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/codex-lens/tests/real/debug_config.py b/codex-lens/tests/real/debug_config.py deleted file mode 100644 index 90fae268..00000000 --- a/codex-lens/tests/real/debug_config.py +++ /dev/null @@ -1,216 +0,0 @@ -#!/usr/bin/env python -"""Test if pyright sends workspace/configuration after initialized.""" - -import asyncio -import json -import sys -from pathlib import Path - -# Add source to path -sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src")) - - -async def read_message_direct(reader): - """Read a JSON-RPC message - direct blocking read, no timeout.""" - content_length = 0 - while True: - line = await reader.readline() - if not line: - return None - line_str = line.decode("ascii").strip() - if not line_str: - break - if line_str.lower().startswith("content-length:"): - content_length = int(line_str.split(":")[1].strip()) - - if content_length == 0: - return None - - body = await reader.readexactly(content_length) - return json.loads(body.decode("utf-8")) - - -async def main(): - workspace = Path(__file__).parent.parent.parent - print(f"Workspace: {workspace}") - - # Start 
pyright - exactly like in direct test - process = await asyncio.create_subprocess_exec( - "pyright-langserver", "--stdio", - stdin=asyncio.subprocess.PIPE, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - cwd=str(workspace), - ) - - def encode_message(content): - body = json.dumps(content).encode("utf-8") - header = f"Content-Length: {len(body)}\r\n\r\n" - return header.encode("ascii") + body - - async def send(message): - encoded = encode_message(message) - process.stdin.write(encoded) - await process.stdin.drain() - method_or_resp = message.get('method') or f"response id={message.get('id')}" - print(f"SENT: {method_or_resp} ({len(encoded)} bytes)") - - # Start stderr reader - async def read_stderr(): - while True: - line = await process.stderr.readline() - if not line: - break - print(f"[stderr] {line.decode('utf-8', errors='replace').rstrip()}") - asyncio.create_task(read_stderr()) - - print("\n=== INITIALIZE ===") - await send({ - "jsonrpc": "2.0", - "id": 1, - "method": "initialize", - "params": { - "processId": None, - "rootUri": workspace.as_uri(), - "rootPath": str(workspace), - "capabilities": { - "workspace": {"configuration": True}, - }, - "workspaceFolders": [{"uri": workspace.as_uri(), "name": workspace.name}], - }, - }) - - # Read until we get initialize response - print("Reading initialize response...") - while True: - msg = await asyncio.wait_for(read_message_direct(process.stdout), timeout=10) - if msg is None: - break - method = msg.get("method") - msg_id = msg.get("id") - if method: - print(f"RECV: {method} (notification)") - else: - print(f"RECV: response id={msg_id}") - if msg_id == 1: - print("Initialize OK!") - break - - print("\n=== SEND INITIALIZED ===") - await send({ - "jsonrpc": "2.0", - "method": "initialized", - "params": {}, - }) - - # Now, here's the key test - will we receive workspace/configuration? 
- print("\n=== WAIT FOR workspace/configuration ===") - print("Reading with 5 second timeout...") - - try: - for i in range(10): - msg = await asyncio.wait_for(read_message_direct(process.stdout), timeout=2) - if msg is None: - print("EOF") - break - method = msg.get("method") - msg_id = msg.get("id") - print(f"RECV: method={method}, id={msg_id}") - - # Respond to server requests - if msg_id is not None and method: - if method == "workspace/configuration": - print(" -> Got workspace/configuration! Responding...") - await send({ - "jsonrpc": "2.0", - "id": msg_id, - "result": [{} for _ in msg.get("params", {}).get("items", [])], - }) - else: - print(f" -> Responding to {method}") - await send({"jsonrpc": "2.0", "id": msg_id, "result": None}) - except asyncio.TimeoutError: - print("No more messages (timeout)") - - print("\n=== Now start background read task like manager does ===") - - # Store references like manager does - reader = process.stdout # This is how manager does it - writer = process.stdin - - # Start background read task - async def bg_read_loop(): - print("[BG] Read loop started") - try: - while True: - await asyncio.sleep(0) - try: - msg = await asyncio.wait_for(read_message_direct(reader), timeout=1.0) - if msg is None: - print("[BG] Stream closed") - break - bg_method = msg.get('method') or f"response id={msg.get('id')}" - print(f"[BG] RECV: {bg_method}") - - # Handle server requests - method = msg.get("method") - msg_id = msg.get("id") - if msg_id is not None and method: - print(f"[BG] Responding to {method}") - await send({"jsonrpc": "2.0", "id": msg_id, "result": None}) - except asyncio.TimeoutError: - print("[BG] timeout") - except asyncio.CancelledError: - print("[BG] Cancelled") - - read_task = asyncio.create_task(bg_read_loop()) - - # Wait a moment - await asyncio.sleep(0.5) - - # Now send didOpen and documentSymbol like manager does - print("\n=== SEND didOpen ===") - test_file = workspace / "tests" / "real" / "debug_config.py" - await send({ 
- "jsonrpc": "2.0", - "method": "textDocument/didOpen", - "params": { - "textDocument": { - "uri": test_file.as_uri(), - "languageId": "python", - "version": 1, - "text": test_file.read_text(), - }, - }, - }) - - # Wait for processing - await asyncio.sleep(2) - - print("\n=== SEND documentSymbol ===") - await send({ - "jsonrpc": "2.0", - "id": 2, - "method": "textDocument/documentSymbol", - "params": {"textDocument": {"uri": test_file.as_uri()}}, - }) - - # Wait for response - print("Waiting for documentSymbol response (max 30s)...") - deadline = asyncio.get_event_loop().time() + 30 - while asyncio.get_event_loop().time() < deadline: - await asyncio.sleep(0.5) - # The background task will print when it receives the response - - read_task.cancel() - try: - await read_task - except asyncio.CancelledError: - pass - - process.terminate() - print("\nDone!") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/codex-lens/tests/real/debug_direct.py b/codex-lens/tests/real/debug_direct.py deleted file mode 100644 index 99e4b992..00000000 --- a/codex-lens/tests/real/debug_direct.py +++ /dev/null @@ -1,320 +0,0 @@ -#!/usr/bin/env python -"""Minimal direct test of pyright LSP communication.""" - -import asyncio -import json -import sys -from pathlib import Path - - -async def send_message(writer, message): - """Send a JSON-RPC message.""" - body = json.dumps(message).encode("utf-8") - header = f"Content-Length: {len(body)}\r\n\r\n".encode("ascii") - writer.write(header + body) - await writer.drain() - print(f"SENT: {message.get('method', 'response')} (id={message.get('id', 'N/A')})") - - -async def read_message(reader): - """Read a JSON-RPC message.""" - # Read headers - content_length = 0 - while True: - line = await reader.readline() - if not line: - return None - line_str = line.decode("ascii").strip() - if not line_str: - break - if line_str.lower().startswith("content-length:"): - content_length = int(line_str.split(":")[1].strip()) - - if content_length == 
0: - return None - - # Read body - body = await reader.readexactly(content_length) - return json.loads(body.decode("utf-8")) - - -async def main(): - workspace = Path(__file__).parent.parent.parent - test_file = workspace / "tests" / "real" / "debug_direct.py" - - print(f"Workspace: {workspace}") - print(f"Test file: {test_file}") - print() - - # Start pyright - print("Starting pyright-langserver...") - process = await asyncio.create_subprocess_exec( - "pyright-langserver", "--stdio", - stdin=asyncio.subprocess.PIPE, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - cwd=str(workspace), - ) - - # Start stderr reader - async def read_stderr(): - while True: - line = await process.stderr.readline() - if not line: - break - print(f"[stderr] {line.decode('utf-8', errors='replace').rstrip()}") - - stderr_task = asyncio.create_task(read_stderr()) - - try: - # 1. Send initialize - print("\n=== INITIALIZE ===") - await send_message(process.stdin, { - "jsonrpc": "2.0", - "id": 1, - "method": "initialize", - "params": { - "processId": None, - "rootUri": workspace.as_uri(), - "rootPath": str(workspace), - "capabilities": { - "textDocument": { - "documentSymbol": { - "hierarchicalDocumentSymbolSupport": True, - }, - }, - "workspace": { - "configuration": True, - }, - }, - "workspaceFolders": [{"uri": workspace.as_uri(), "name": workspace.name}], - }, - }) - - # Read all messages until we get initialize response - print("\n=== READING RESPONSES ===") - init_done = False - for i in range(20): - try: - msg = await asyncio.wait_for(read_message(process.stdout), timeout=5.0) - if msg is None: - print("EOF") - break - - method = msg.get("method", "") - msg_id = msg.get("id", "N/A") - - if method: - print(f"RECV: {method} (id={msg_id})") - - # Handle server requests - if msg_id != "N/A": - if method == "workspace/configuration": - print(" -> Responding to workspace/configuration") - items = msg.get("params", {}).get("items", []) - await send_message(process.stdin, 
{ - "jsonrpc": "2.0", - "id": msg_id, - "result": [{"pythonPath": "python"} for _ in items], - }) - elif method == "client/registerCapability": - print(" -> Responding to client/registerCapability") - await send_message(process.stdin, { - "jsonrpc": "2.0", - "id": msg_id, - "result": None, - }) - elif method == "window/workDoneProgress/create": - print(" -> Responding to window/workDoneProgress/create") - await send_message(process.stdin, { - "jsonrpc": "2.0", - "id": msg_id, - "result": None, - }) - else: - print(f"RECV: response (id={msg_id})") - if msg_id == 1: - print(" -> Initialize response received!") - caps = list(msg.get("result", {}).get("capabilities", {}).keys()) - print(f" -> Capabilities: {caps[:5]}...") - init_done = True - break - - except asyncio.TimeoutError: - print(f" Timeout waiting for message {i+1}") - break - - if not init_done: - print("ERROR: Initialize failed") - return - - # 2. Send initialized notification - print("\n=== INITIALIZED ===") - await send_message(process.stdin, { - "jsonrpc": "2.0", - "method": "initialized", - "params": {}, - }) - - # Read any messages pyright sends after initialized - print("\n=== READING POST-INITIALIZED MESSAGES ===") - for i in range(10): - try: - msg = await asyncio.wait_for(read_message(process.stdout), timeout=2.0) - if msg is None: - break - - method = msg.get("method", "") - msg_id = msg.get("id", "N/A") - - print(f"RECV: {method or 'response'} (id={msg_id})") - - # Handle server requests - if msg_id != "N/A" and method: - if method == "workspace/configuration": - print(" -> Responding to workspace/configuration") - items = msg.get("params", {}).get("items", []) - await send_message(process.stdin, { - "jsonrpc": "2.0", - "id": msg_id, - "result": [{"pythonPath": "python"} for _ in items], - }) - elif method == "client/registerCapability": - print(" -> Responding to client/registerCapability") - await send_message(process.stdin, { - "jsonrpc": "2.0", - "id": msg_id, - "result": None, - }) - elif 
method == "window/workDoneProgress/create": - print(" -> Responding to window/workDoneProgress/create") - await send_message(process.stdin, { - "jsonrpc": "2.0", - "id": msg_id, - "result": None, - }) - - except asyncio.TimeoutError: - print(f" No more messages (timeout)") - break - - # 3. Send didOpen - print("\n=== DIDOPEN ===") - content = test_file.read_text(encoding="utf-8") - await send_message(process.stdin, { - "jsonrpc": "2.0", - "method": "textDocument/didOpen", - "params": { - "textDocument": { - "uri": test_file.as_uri(), - "languageId": "python", - "version": 1, - "text": content, - }, - }, - }) - - # Read any messages - print("\n=== READING POST-DIDOPEN MESSAGES ===") - for i in range(10): - try: - msg = await asyncio.wait_for(read_message(process.stdout), timeout=2.0) - if msg is None: - break - - method = msg.get("method", "") - msg_id = msg.get("id", "N/A") - - print(f"RECV: {method or 'response'} (id={msg_id})") - - # Handle server requests - if msg_id != "N/A" and method: - if method == "workspace/configuration": - print(" -> Responding to workspace/configuration") - items = msg.get("params", {}).get("items", []) - await send_message(process.stdin, { - "jsonrpc": "2.0", - "id": msg_id, - "result": [{"pythonPath": "python"} for _ in items], - }) - else: - print(f" -> Responding with null to {method}") - await send_message(process.stdin, { - "jsonrpc": "2.0", - "id": msg_id, - "result": None, - }) - - except asyncio.TimeoutError: - print(f" No more messages (timeout)") - break - - # 4. 
Send documentSymbol request - print("\n=== DOCUMENTSYMBOL ===") - await send_message(process.stdin, { - "jsonrpc": "2.0", - "id": 2, - "method": "textDocument/documentSymbol", - "params": { - "textDocument": {"uri": test_file.as_uri()}, - }, - }) - - # Wait for response - print("\n=== READING DOCUMENTSYMBOL RESPONSE ===") - for i in range(20): - try: - msg = await asyncio.wait_for(read_message(process.stdout), timeout=5.0) - if msg is None: - break - - method = msg.get("method", "") - msg_id = msg.get("id", "N/A") - - if method: - print(f"RECV: {method} (id={msg_id})") - - # Handle server requests - if msg_id != "N/A": - if method == "workspace/configuration": - print(" -> Responding to workspace/configuration") - items = msg.get("params", {}).get("items", []) - await send_message(process.stdin, { - "jsonrpc": "2.0", - "id": msg_id, - "result": [{"pythonPath": "python"} for _ in items], - }) - else: - print(f" -> Responding with null to {method}") - await send_message(process.stdin, { - "jsonrpc": "2.0", - "id": msg_id, - "result": None, - }) - else: - print(f"RECV: response (id={msg_id})") - if msg_id == 2: - result = msg.get("result", []) - print(f" -> DocumentSymbol response: {len(result)} symbols") - for sym in result[:5]: - print(f" - {sym.get('name')} ({sym.get('kind')})") - break - - except asyncio.TimeoutError: - print(f" Timeout {i+1}") - if i >= 5: - break - - print("\n=== DONE ===") - - finally: - stderr_task.cancel() - process.terminate() - try: - await asyncio.wait_for(process.wait(), timeout=5.0) - except asyncio.TimeoutError: - process.kill() - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/codex-lens/tests/real/debug_lsp.py b/codex-lens/tests/real/debug_lsp.py deleted file mode 100644 index 8bf15f1c..00000000 --- a/codex-lens/tests/real/debug_lsp.py +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env python -"""Debug script to check pyright LSP configuration requests.""" - -import asyncio -import logging -import sys -from pathlib import 
Path - -# Enable DEBUG logging -logging.basicConfig( - level=logging.DEBUG, - format='%(name)s - %(levelname)s - %(message)s', - stream=sys.stdout -) - -# Add source to path -sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src")) - -from codexlens.lsp.standalone_manager import StandaloneLspManager - -async def test(): - workspace = Path(__file__).parent.parent.parent - manager = StandaloneLspManager( - workspace_root=str(workspace), - timeout=60.0 - ) - await manager.start() - - # Wait a bit after start to see if any requests come in - print("Waiting 3 seconds after start to see server requests...") - await asyncio.sleep(3) - - # Try to get symbols for a simpler file - test_file = str(workspace / "tests" / "real" / "debug_lsp.py") - print(f"Testing with: {test_file}") - - # Let's see if we can check what pyright sees - print("Checking server state...") - state = manager._servers.get("python") - if state: - print(f" - Process running: {state.process.returncode is None}") - print(f" - Initialized: {state.initialized}") - print(f" - Pending requests: {list(state.pending_requests.keys())}") - - try: - symbols = await manager.get_document_symbols(test_file) - print(f"Got {len(symbols)} symbols") - for s in symbols[:5]: - print(f" - {s}") - except Exception as e: - print(f"Error: {e}") - import traceback - traceback.print_exc() - - await manager.stop() - -if __name__ == "__main__": - asyncio.run(test()) diff --git a/codex-lens/tests/real/debug_manager.py b/codex-lens/tests/real/debug_manager.py deleted file mode 100644 index 3d53ca89..00000000 --- a/codex-lens/tests/real/debug_manager.py +++ /dev/null @@ -1,63 +0,0 @@ -#!/usr/bin/env python -"""Debug script to test StandaloneLspManager directly.""" - -import asyncio -import logging -import sys -from pathlib import Path - -# Add source to path -sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src")) - -# Enable debug logging -logging.basicConfig(level=logging.DEBUG, format="%(levelname)s: 
%(name)s: %(message)s") - -from codexlens.lsp.standalone_manager import StandaloneLspManager - - -async def test_standalone_manager(): - """Test StandaloneLspManager directly.""" - workspace = Path(__file__).parent.parent.parent - test_file = workspace / "src" / "codexlens" / "lsp" / "lsp_bridge.py" - - print(f"Workspace: {workspace}") - print(f"Test file: {test_file}") - print() - - manager = StandaloneLspManager(workspace_root=str(workspace), timeout=30.0) - - print("Starting manager...") - await manager.start() - - print(f"Configs loaded: {list(manager._configs.keys())}") - print(f"Servers running: {list(manager._servers.keys())}") - - # Try to get the server for the test file - print(f"\nGetting server for {test_file.name}...") - server = await manager._get_server(str(test_file)) - - if server: - print(f"Server: {server.config.display_name}") - print(f"Initialized: {server.initialized}") - print(f"Capabilities: {list(server.capabilities.keys())}") - else: - print("Failed to get server!") - - # Try to get document symbols - print(f"\nGetting document symbols for {test_file.name}...") - try: - symbols = await manager.get_document_symbols(str(test_file)) - print(f"Found {len(symbols)} symbols") - for sym in symbols[:5]: - print(f" - {sym.get('name', '?')} ({sym.get('kind', '?')})") - except Exception as e: - print(f"Error getting symbols: {e}") - - print("\nStopping manager...") - await manager.stop() - - print("Done!") - - -if __name__ == "__main__": - asyncio.run(test_standalone_manager()) diff --git a/codex-lens/tests/real/debug_reads.py b/codex-lens/tests/real/debug_reads.py deleted file mode 100644 index 56048c73..00000000 --- a/codex-lens/tests/real/debug_reads.py +++ /dev/null @@ -1,123 +0,0 @@ -#!/usr/bin/env python -"""Debug exactly what's happening with reads after initialized.""" - -import asyncio -import json -from pathlib import Path - - -async def main(): - workspace = Path(__file__).parent.parent.parent - print(f"Workspace: {workspace}") - - # Start 
pyright - process = await asyncio.create_subprocess_exec( - "pyright-langserver", "--stdio", - stdin=asyncio.subprocess.PIPE, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - cwd=str(workspace), - ) - - # Helper to encode message - def encode(content): - body = json.dumps(content).encode("utf-8") - header = f"Content-Length: {len(body)}\r\n\r\n" - return header.encode("ascii") + body - - # Helper to send - async def send(msg): - encoded = encode(msg) - process.stdin.write(encoded) - await process.stdin.drain() - method = msg.get("method") or f"response-{msg.get('id')}" - print(f"SENT: {method}") - - # Helper to read one message - async def read_one(timeout=3.0): - content_length = 0 - while True: - try: - print(f" readline(timeout={timeout})...") - line = await asyncio.wait_for(process.stdout.readline(), timeout=timeout) - print(f" got line: {repr(line[:50] if len(line) > 50 else line)}") - except asyncio.TimeoutError: - print(f" TIMEOUT on readline") - return None - - if not line: - print(f" EOF") - return None - - line_str = line.decode("ascii").strip() - if not line_str: - break # End of headers - - if line_str.lower().startswith("content-length:"): - content_length = int(line_str.split(":")[1].strip()) - - if content_length == 0: - return None - - body = await process.stdout.readexactly(content_length) - return json.loads(body.decode("utf-8")) - - # Start stderr reader - async def read_stderr(): - while True: - line = await process.stderr.readline() - if not line: - break - print(f"[stderr] {line.decode('utf-8', errors='replace').rstrip()}") - asyncio.create_task(read_stderr()) - - print("\n=== INITIALIZE ===") - await send({ - "jsonrpc": "2.0", "id": 1, "method": "initialize", - "params": { - "processId": None, - "rootUri": workspace.as_uri(), - "capabilities": {"workspace": {"configuration": True}}, - "workspaceFolders": [{"uri": workspace.as_uri(), "name": workspace.name}], - }, - }) - - # Read until initialize response - print("\n=== 
READING UNTIL INITIALIZE RESPONSE ===") - while True: - msg = await read_one() - if msg and msg.get("id") == 1 and "method" not in msg: - print(f"Got initialize response") - break - elif msg: - print(f"Got notification: {msg.get('method')}") - - print("\n=== SEND INITIALIZED ===") - await send({"jsonrpc": "2.0", "method": "initialized", "params": {}}) - - print("\n=== NOW TRY TO READ WORKSPACE/CONFIGURATION ===") - print("Attempting reads with 2s timeout each...") - - for i in range(3): - print(f"\n--- Read attempt {i+1} ---") - msg = await read_one(timeout=2.0) - if msg: - method = msg.get("method", "") - msg_id = msg.get("id") - print(f"SUCCESS: method={method}, id={msg_id}") - if method and msg_id is not None: - # Respond to server request - print(f"Responding to {method}") - await send({"jsonrpc": "2.0", "id": msg_id, "result": [{}]}) - else: - print(f"No message (timeout or EOF)") - break - - print("\n=== CLEANUP ===") - process.terminate() - await process.wait() - print("Done") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/codex-lens/tests/real/direct_pyright_test.py b/codex-lens/tests/real/direct_pyright_test.py deleted file mode 100644 index 75fd45bf..00000000 --- a/codex-lens/tests/real/direct_pyright_test.py +++ /dev/null @@ -1,93 +0,0 @@ -#!/usr/bin/env python -"""Direct test of pyright-langserver communication.""" - -import asyncio -import json -import sys - -async def test_pyright(): - print("Starting pyright-langserver...") - - process = await asyncio.create_subprocess_exec( - "pyright-langserver", "--stdio", - stdin=asyncio.subprocess.PIPE, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - - # Build initialize request - init_msg = { - "jsonrpc": "2.0", - "id": 1, - "method": "initialize", - "params": { - "processId": 1234, - "rootUri": "file:///D:/Claude_dms3/codex-lens", - "rootPath": "D:/Claude_dms3/codex-lens", - "capabilities": { - "textDocument": { - "documentSymbol": 
{"hierarchicalDocumentSymbolSupport": True} - }, - "workspace": {"configuration": True} - }, - "workspaceFolders": [ - {"uri": "file:///D:/Claude_dms3/codex-lens", "name": "codex-lens"} - ] - } - } - - body = json.dumps(init_msg).encode("utf-8") - header = f"Content-Length: {len(body)}\r\n\r\n".encode("ascii") - - print(f"Sending initialize request ({len(body)} bytes)...") - process.stdin.write(header + body) - await process.stdin.drain() - - # Read responses - print("Reading responses...") - for i in range(20): - try: - line = await asyncio.wait_for(process.stdout.readline(), timeout=2.0) - if not line: - print(" (empty line - stream closed)") - break - line_str = line.decode("ascii").strip() - print(f" Header: {line_str}") - - if line_str.lower().startswith("content-length:"): - content_length = int(line_str.split(":")[1].strip()) - # Read empty line - await process.stdout.readline() - # Read body - body_data = await process.stdout.readexactly(content_length) - msg = json.loads(body_data.decode("utf-8")) - print(f" Message: id={msg.get('id', 'none')}, method={msg.get('method', 'none')}") - if msg.get("id") == 1: - print(f" >>> GOT INITIALIZE RESPONSE!") - print(f" >>> Capabilities: {list(msg.get('result', {}).get('capabilities', {}).keys())[:10]}...") - - # Send initialized notification - print("\nSending 'initialized' notification...") - init_notif = {"jsonrpc": "2.0", "method": "initialized", "params": {}} - body2 = json.dumps(init_notif).encode("utf-8") - header2 = f"Content-Length: {len(body2)}\r\n\r\n".encode("ascii") - process.stdin.write(header2 + body2) - await process.stdin.drain() - - # Wait a moment for any server requests - print("Waiting for server requests...") - await asyncio.sleep(1.0) - continue # Keep reading to see if workspace/configuration comes - if msg.get("method") == "workspace/configuration": - print(f" >>> GOT workspace/configuration REQUEST!") - print(f" >>> Params: {msg.get('params')}") - except asyncio.TimeoutError: - print(" 
(timeout waiting for more data)") - break - - process.terminate() - await process.wait() - print("Done.") - -if __name__ == "__main__": - asyncio.run(test_pyright()) diff --git a/codex-lens/tests/real/minimal_test.py b/codex-lens/tests/real/minimal_test.py deleted file mode 100644 index c95c8b7b..00000000 --- a/codex-lens/tests/real/minimal_test.py +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env python -"""Minimal test that mimics the working direct test.""" - -import asyncio -import json -import sys -from pathlib import Path - -# Add source to path -sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src")) - - -async def test_minimal(): - """Minimal test using the standalone manager.""" - from codexlens.lsp.standalone_manager import StandaloneLspManager - - workspace = Path(__file__).parent.parent.parent - manager = StandaloneLspManager( - workspace_root=str(workspace), - timeout=60.0 - ) - - await manager.start() - - # Get server state - server_state = await manager._get_server(str(workspace / "tests" / "real" / "minimal_test.py")) - - if not server_state: - print("Failed to get server state") - await manager.stop() - return - - print(f"Server initialized: {server_state.initialized}") - print(f"Server capabilities: {list(server_state.capabilities.keys())[:5]}...") - - # Wait for any background messages - print("Waiting 5 seconds for background messages...") - await asyncio.sleep(5) - - # Now send a documentSymbol request manually - print("Sending documentSymbol request...") - result = await manager._send_request( - server_state, - "textDocument/documentSymbol", - {"textDocument": {"uri": (workspace / "tests" / "real" / "minimal_test.py").resolve().as_uri()}}, - timeout=30.0 - ) - - print(f"Result: {result}") - - await manager.stop() - - -if __name__ == "__main__": - import logging - logging.basicConfig(level=logging.INFO, format='%(name)s - %(levelname)s - %(message)s') - - asyncio.run(test_minimal()) diff --git a/codex-lens/tests/real/quick_test.py 
b/codex-lens/tests/real/quick_test.py deleted file mode 100644 index c70a5374..00000000 --- a/codex-lens/tests/real/quick_test.py +++ /dev/null @@ -1,313 +0,0 @@ -#!/usr/bin/env python -"""Quick real interface test script for LSP Bridge (Standalone Mode). - -Usage: - python tests/real/quick_test.py - -Requires: pyright-langserver installed (npm install -g pyright) -""" - -import asyncio -import shutil -import sys -from pathlib import Path - -# Add source to path -sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src")) - -from codexlens.lsp.lsp_bridge import LspBridge -from codexlens.lsp.lsp_graph_builder import LspGraphBuilder -from codexlens.hybrid_search.data_structures import CodeSymbolNode, Range - - -# Test file - the LSP bridge source itself -TEST_FILE = Path(__file__).parent.parent.parent / "src" / "codexlens" / "lsp" / "lsp_bridge.py" -WORKSPACE_ROOT = Path(__file__).parent.parent.parent # codex-lens root - - -def check_pyright(): - """Check if pyright-langserver is available.""" - return shutil.which("pyright-langserver") is not None - - -async def test_get_definition(): - """Test get_definition.""" - print("\n" + "=" * 60) - print("TEST: get_definition") - print("=" * 60) - - symbol = CodeSymbolNode( - id=f"{TEST_FILE}:LspBridge:96", - name="LspBridge", - kind="class", - file_path=str(TEST_FILE), - range=Range(start_line=96, start_character=6, end_line=96, end_character=15), - ) - - print(f"Symbol: {symbol.name}") - print(f"File: {symbol.file_path}") - print(f"Position: line {symbol.range.start_line}, char {symbol.range.start_character}") - - async with LspBridge(workspace_root=str(WORKSPACE_ROOT), timeout=30.0) as bridge: - result = await bridge.get_definition(symbol) - - if result: - print(f"\n[OK] SUCCESS: Definition found at {result.file_path}:{result.line}") - else: - print(f"\n[WARN] No definition found (may be expected for class declaration)") - - return result is not None - - -async def test_get_references(): - """Test 
get_references.""" - print("\n" + "=" * 60) - print("TEST: get_references") - print("=" * 60) - - symbol = CodeSymbolNode( - id=f"{TEST_FILE}:get_references:200", - name="get_references", - kind="method", - file_path=str(TEST_FILE), - range=Range(start_line=200, start_character=10, end_line=200, end_character=24), - ) - - print(f"Symbol: {symbol.name}") - print(f"File: {Path(symbol.file_path).name}") - print(f"Position: line {symbol.range.start_line}") - - async with LspBridge(workspace_root=str(WORKSPACE_ROOT), timeout=30.0) as bridge: - refs = await bridge.get_references(symbol) - - print(f"\n[OK] Found {len(refs)} references:") - for i, ref in enumerate(refs[:10]): - print(f" [{i+1}] {Path(ref.file_path).name}:{ref.line}") - if len(refs) > 10: - print(f" ... and {len(refs) - 10} more") - - return len(refs) >= 0 - - -async def test_get_hover(): - """Test get_hover.""" - print("\n" + "=" * 60) - print("TEST: get_hover") - print("=" * 60) - - symbol = CodeSymbolNode( - id=f"{TEST_FILE}:LspBridge:96", - name="LspBridge", - kind="class", - file_path=str(TEST_FILE), - range=Range(start_line=96, start_character=6, end_line=96, end_character=15), - ) - - print(f"Symbol: {symbol.name}") - - async with LspBridge(workspace_root=str(WORKSPACE_ROOT), timeout=30.0) as bridge: - hover = await bridge.get_hover(symbol) - - if hover: - preview = hover[:300].replace('\n', '\n ') - print(f"\n[OK] Hover info ({len(hover)} chars):") - print(f" {preview}...") - else: - print(f"\n[WARN] No hover info available") - - return hover is not None - - -async def test_get_document_symbols(): - """Test get_document_symbols.""" - print("\n" + "=" * 60) - print("TEST: get_document_symbols") - print("=" * 60) - - file_path = str(TEST_FILE) - print(f"File: {Path(file_path).name}") - - async with LspBridge(workspace_root=str(WORKSPACE_ROOT), timeout=30.0) as bridge: - symbols = await bridge.get_document_symbols(file_path) - - print(f"\n[OK] Found {len(symbols)} symbols:") - - # Group by kind - 
by_kind = {} - for sym in symbols: - kind = sym.get("kind", "unknown") - by_kind[kind] = by_kind.get(kind, 0) + 1 - - for kind, count in sorted(by_kind.items()): - print(f" {kind}: {count}") - - print("\nSample symbols:") - for sym in symbols[:15]: - name = sym.get("name", "?") - kind = sym.get("kind", "?") - range_data = sym.get("range", {}) - start = range_data.get("start", {}) - line = start.get("line", 0) + 1 - print(f" - {name} ({kind}) at line {line}") - - return len(symbols) > 0 - - -async def test_graph_expansion(): - """Test graph expansion.""" - print("\n" + "=" * 60) - print("TEST: Graph Expansion (LspGraphBuilder)") - print("=" * 60) - - seed = CodeSymbolNode( - id=f"{TEST_FILE}:LspBridge:96", - name="LspBridge", - kind="class", - file_path=str(TEST_FILE), - range=Range(start_line=96, start_character=6, end_line=96, end_character=15), - ) - - print(f"Seed: {seed.name} in {Path(seed.file_path).name}:{seed.range.start_line}") - print("Settings: max_depth=1, max_nodes=20") - - builder = LspGraphBuilder(max_depth=1, max_nodes=20) - - async with LspBridge(workspace_root=str(WORKSPACE_ROOT), timeout=30.0) as bridge: - graph = await builder.build_from_seeds([seed], bridge) - - print(f"\n[OK] Graph expansion complete:") - print(f" Nodes: {len(graph.nodes)}") - print(f" Edges: {len(graph.edges)}") - - if graph.nodes: - print("\nNodes found:") - for node_id, node in list(graph.nodes.items())[:15]: - print(f" - {node.name} ({node.kind}) in {Path(node.file_path).name}:{node.range.start_line}") - - if graph.edges: - print(f"\nEdges (first 10):") - for edge in list(graph.edges)[:10]: - src = graph.nodes.get(edge.source_id) - tgt = graph.nodes.get(edge.target_id) - src_name = src.name if src else edge.source_id[:20] - tgt_name = tgt.name if tgt else edge.target_id[:20] - print(f" - {src_name} --[{edge.relation}]--> {tgt_name}") - - return len(graph.nodes) >= 1 - - -async def test_cache_performance(): - """Test cache performance.""" - print("\n" + "=" * 60) - 
print("TEST: Cache Performance") - print("=" * 60) - - symbol = CodeSymbolNode( - id=f"{TEST_FILE}:LspBridge:96", - name="LspBridge", - kind="class", - file_path=str(TEST_FILE), - range=Range(start_line=96, start_character=6, end_line=96, end_character=15), - ) - - import time - - async with LspBridge(workspace_root=str(WORKSPACE_ROOT), timeout=30.0) as bridge: - # First call - cache miss - start = time.perf_counter() - await bridge.get_references(symbol) - first_time = (time.perf_counter() - start) * 1000 - - # Second call - cache hit - start = time.perf_counter() - await bridge.get_references(symbol) - second_time = (time.perf_counter() - start) * 1000 - - print(f"\nFirst call (cache miss): {first_time:.2f}ms") - print(f"Second call (cache hit): {second_time:.2f}ms") - print(f"Speedup: {first_time/max(second_time, 0.001):.1f}x") - print(f"Cache entries: {len(bridge.cache)}") - - if second_time < first_time: - print("\n[OK] Cache is working correctly") - else: - print("\n[WARN] Cache may not be effective") - - return second_time < first_time - - -async def run_all_tests(): - """Run all tests.""" - print("=" * 60) - print("CODEX-LENS LSP REAL INTERFACE TESTS (Standalone Mode)") - print("=" * 60) - print(f"Test file: {TEST_FILE}") - print(f"Workspace: {WORKSPACE_ROOT}") - print(f"Mode: Standalone (direct language server communication)") - - results = {} - - tests = [ - ("get_definition", test_get_definition), - ("get_references", test_get_references), - ("get_hover", test_get_hover), - ("get_document_symbols", test_get_document_symbols), - ("graph_expansion", test_graph_expansion), - ("cache_performance", test_cache_performance), - ] - - for name, test_fn in tests: - try: - results[name] = await test_fn() - except Exception as e: - print(f"\n[FAIL] FAILED: {e}") - import traceback - traceback.print_exc() - results[name] = False - - # Summary - print("\n" + "=" * 60) - print("SUMMARY") - print("=" * 60) - - passed = sum(1 for v in results.values() if v) - total = 
len(results) - - for name, result in results.items(): - status = "[PASS]" if result else "[FAIL]" - print(f" {status}: {name}") - - print(f"\nResult: {passed}/{total} tests passed") - - return passed == total - - -def main(): - """Main entry point.""" - print("Checking pyright-langserver availability...") - - if not check_pyright(): - print("\n" + "=" * 60) - print("ERROR: pyright-langserver not available") - print("=" * 60) - print() - print("To run these tests:") - print(" 1. Install pyright: npm install -g pyright") - print(" 2. Verify: pyright-langserver --version") - print(" 3. Run this script again") - print() - sys.exit(1) - - print("[OK] pyright-langserver is available!") - print() - - # Run tests - # Note: On Windows, we use the default ProactorEventLoop (not SelectorEventLoop) - # because ProactorEventLoop supports subprocess creation which is required for LSP - - success = asyncio.run(run_all_tests()) - sys.exit(0 if success else 1) - - -if __name__ == "__main__": - main() diff --git a/codex-lens/tests/real/test_lsp_real_interface.py b/codex-lens/tests/real/test_lsp_real_interface.py deleted file mode 100644 index 587d3f90..00000000 --- a/codex-lens/tests/real/test_lsp_real_interface.py +++ /dev/null @@ -1,424 +0,0 @@ -"""Real interface tests for LSP Bridge using Standalone Mode. - -These tests require: -1. Language servers installed (pyright-langserver, typescript-language-server) -2. 
A Python/TypeScript project in the workspace - -Run with: pytest tests/real/ -v -s -""" - -import asyncio -import os -import sys -import pytest -from pathlib import Path - -# Add source to path -sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src")) - -from codexlens.lsp.lsp_bridge import LspBridge, Location, HAS_AIOHTTP -from codexlens.lsp.lsp_graph_builder import LspGraphBuilder -from codexlens.hybrid_search.data_structures import CodeSymbolNode, Range - - -# Test configuration - adjust these paths to match your setup -TEST_PYTHON_FILE = Path(__file__).parent.parent.parent / "src" / "codexlens" / "lsp" / "lsp_bridge.py" -TEST_TYPESCRIPT_FILE = Path(__file__).parent.parent.parent.parent / "ccw-vscode-bridge" / "src" / "extension.ts" - -WORKSPACE_ROOT = Path(__file__).parent.parent.parent # codex-lens root - - -def is_pyright_available() -> bool: - """Check if pyright-langserver is installed.""" - import shutil - return shutil.which("pyright-langserver") is not None - - -def is_typescript_server_available() -> bool: - """Check if typescript-language-server is installed.""" - import shutil - return shutil.which("typescript-language-server") is not None - - -# Skip all tests if pyright not available -pytestmark = pytest.mark.skipif( - not is_pyright_available(), - reason="pyright-langserver not installed. 
Install with: npm install -g pyright" -) - - -class TestRealLspBridgeStandalone: - """Real interface tests for LspBridge in Standalone Mode.""" - - @pytest.fixture - def bridge(self): - """Create real LspBridge instance in standalone mode.""" - return LspBridge( - workspace_root=str(WORKSPACE_ROOT), - timeout=30.0, - use_vscode_bridge=False, # Use standalone mode - ) - - @pytest.fixture - def python_symbol(self): - """Create a symbol pointing to LspBridge class.""" - return CodeSymbolNode( - id=f"{TEST_PYTHON_FILE}:LspBridge:96", - name="LspBridge", - kind="class", - file_path=str(TEST_PYTHON_FILE), - range=Range(start_line=96, start_character=6, end_line=96, end_character=15), - ) - - @pytest.fixture - def python_method_symbol(self): - """Create a symbol pointing to get_references method.""" - return CodeSymbolNode( - id=f"{TEST_PYTHON_FILE}:get_references:200", - name="get_references", - kind="method", - file_path=str(TEST_PYTHON_FILE), - range=Range(start_line=200, start_character=10, end_line=200, end_character=24), - ) - - @pytest.mark.asyncio - async def test_real_get_definition(self, bridge, python_symbol): - """Test get_definition against real Python file.""" - print(f"\n>>> Testing get_definition for {python_symbol.name}") - print(f" File: {python_symbol.file_path}") - print(f" Position: line {python_symbol.range.start_line}, char {python_symbol.range.start_character}") - - async with bridge: - definition = await bridge.get_definition(python_symbol) - - print(f" Result: {definition}") - - # Definition should exist (class definition) - if definition: - print(f" ✓ Found definition at {definition.file_path}:{definition.line}") - assert definition.file_path.endswith(".py") - assert definition.line > 0 - else: - print(" ⚠ No definition found (may be expected for class declarations)") - - @pytest.mark.asyncio - async def test_real_get_references(self, bridge, python_method_symbol): - """Test get_references against real Python file.""" - print(f"\n>>> Testing 
get_references for {python_method_symbol.name}") - print(f" File: {python_method_symbol.file_path}") - print(f" Position: line {python_method_symbol.range.start_line}") - - async with bridge: - refs = await bridge.get_references(python_method_symbol) - - print(f" Found {len(refs)} references:") - for i, ref in enumerate(refs[:5]): # Show first 5 - print(f" [{i+1}] {Path(ref.file_path).name}:{ref.line}") - if len(refs) > 5: - print(f" ... and {len(refs) - 5} more") - - # Should find at least the definition itself - assert len(refs) >= 0, "References query should succeed (may be empty)" - - @pytest.mark.asyncio - async def test_real_get_hover(self, bridge, python_symbol): - """Test get_hover against real Python file.""" - print(f"\n>>> Testing get_hover for {python_symbol.name}") - - async with bridge: - hover = await bridge.get_hover(python_symbol) - - if hover: - print(f" ✓ Hover info ({len(hover)} chars):") - preview = hover[:200].replace('\n', '\\n') - print(f" {preview}...") - assert len(hover) > 0 - else: - print(" ⚠ No hover info available") - - @pytest.mark.asyncio - async def test_real_get_document_symbols(self, bridge): - """Test get_document_symbols against real Python file.""" - file_path = str(TEST_PYTHON_FILE) - print(f"\n>>> Testing get_document_symbols") - print(f" File: {file_path}") - - async with bridge: - symbols = await bridge.get_document_symbols(file_path) - - print(f" Found {len(symbols)} symbols:") - - # Group by kind - by_kind = {} - for sym in symbols: - kind = sym.get("kind", "unknown") - by_kind[kind] = by_kind.get(kind, 0) + 1 - - for kind, count in sorted(by_kind.items()): - print(f" {kind}: {count}") - - # Show some sample symbols - print(" Sample symbols:") - for sym in symbols[:10]: - name = sym.get("name", "?") - kind = sym.get("kind", "?") - range_data = sym.get("range", {}) - start = range_data.get("start", {}) - line = start.get("line", 0) + 1 - print(f" - {name} ({kind}) at line {line}") - - assert len(symbols) > 0, "Should find 
symbols in Python file" - - @pytest.mark.asyncio - async def test_real_get_call_hierarchy(self, bridge, python_method_symbol): - """Test get_call_hierarchy against real Python file.""" - print(f"\n>>> Testing get_call_hierarchy for {python_method_symbol.name}") - - async with bridge: - calls = await bridge.get_call_hierarchy(python_method_symbol) - - print(f" Found {len(calls)} call hierarchy items:") - for i, call in enumerate(calls[:10]): - print(f" [{i+1}] {call.name} in {Path(call.file_path).name}:{call.range.start_line}") - - # May be empty if call hierarchy not supported or no callers - print(f" ✓ Call hierarchy query completed") - - @pytest.mark.asyncio - async def test_real_cache_behavior(self, bridge, python_symbol): - """Test that cache actually works with real requests.""" - print(f"\n>>> Testing cache behavior") - - async with bridge: - # First call - should hit language server - print(" First call (cache miss expected)...") - refs1 = await bridge.get_references(python_symbol) - cache_size_after_first = len(bridge.cache) - print(f" Cache size after first call: {cache_size_after_first}") - - # Second call - should hit cache - print(" Second call (cache hit expected)...") - refs2 = await bridge.get_references(python_symbol) - cache_size_after_second = len(bridge.cache) - print(f" Cache size after second call: {cache_size_after_second}") - - assert cache_size_after_first > 0, "Cache should have entries after first call" - assert cache_size_after_second == cache_size_after_first, "Cache size should not change on hit" - assert refs1 == refs2, "Results should be identical" - print(" ✓ Cache working correctly") - - -class TestRealLspGraphBuilderStandalone: - """Real interface tests for LspGraphBuilder with Standalone Mode.""" - - @pytest.fixture - def seed_node(self): - """Create a seed node for graph expansion.""" - return CodeSymbolNode( - id=f"{TEST_PYTHON_FILE}:LspBridge:96", - name="LspBridge", - kind="class", - file_path=str(TEST_PYTHON_FILE), - 
range=Range(start_line=96, start_character=6, end_line=96, end_character=15), - ) - - @pytest.mark.asyncio - async def test_real_graph_expansion(self, seed_node): - """Test real graph expansion from a Python class.""" - print(f"\n>>> Testing graph expansion from {seed_node.name}") - print(f" Seed: {seed_node.file_path}:{seed_node.range.start_line}") - - builder = LspGraphBuilder(max_depth=1, max_nodes=20) - - async with LspBridge( - workspace_root=str(WORKSPACE_ROOT), - timeout=30.0, - ) as bridge: - graph = await builder.build_from_seeds([seed_node], bridge) - - print(f" Graph results:") - print(f" Nodes: {len(graph.nodes)}") - print(f" Edges: {len(graph.edges)}") - - if graph.nodes: - print(f" Node details:") - for node_id, node in list(graph.nodes.items())[:10]: - print(f" - {node.name} ({node.kind}) in {Path(node.file_path).name}:{node.range.start_line}") - - if graph.edges: - print(f" Edge details:") - for edge in list(graph.edges)[:10]: - print(f" - {edge.source_id[:30]}... --[{edge.relation}]--> {edge.target_id[:30]}...") - - # We should have at least the seed node - assert len(graph.nodes) >= 1, "Graph should contain at least the seed node" - print(" ✓ Graph expansion completed") - - @pytest.mark.asyncio - async def test_real_multi_seed_expansion(self): - """Test graph expansion from multiple seeds.""" - print(f"\n>>> Testing multi-seed graph expansion") - - seeds = [ - CodeSymbolNode( - id=f"{TEST_PYTHON_FILE}:Location:35", - name="Location", - kind="class", - file_path=str(TEST_PYTHON_FILE), - range=Range(start_line=35, start_character=6, end_line=35, end_character=14), - ), - CodeSymbolNode( - id=f"{TEST_PYTHON_FILE}:CacheEntry:81", - name="CacheEntry", - kind="class", - file_path=str(TEST_PYTHON_FILE), - range=Range(start_line=81, start_character=6, end_line=81, end_character=16), - ), - ] - - print(f" Seeds: {[s.name for s in seeds]}") - - builder = LspGraphBuilder(max_depth=1, max_nodes=30) - - async with LspBridge( - 
workspace_root=str(WORKSPACE_ROOT), - timeout=30.0, - ) as bridge: - graph = await builder.build_from_seeds(seeds, bridge) - - print(f" Graph results:") - print(f" Nodes: {len(graph.nodes)}") - print(f" Edges: {len(graph.edges)}") - - # Should have at least the seed nodes - assert len(graph.nodes) >= len(seeds), f"Graph should contain at least {len(seeds)} seed nodes" - print(" ✓ Multi-seed expansion completed") - - -class TestRealHybridSearchIntegrationStandalone: - """Real integration tests with HybridSearchEngine.""" - - @pytest.mark.asyncio - async def test_real_lsp_search_pipeline(self): - """Test the full LSP search pipeline with real LSP.""" - print(f"\n>>> Testing full LSP search pipeline") - - # Create mock seeds (normally from vector/FTS search) - seeds = [ - CodeSymbolNode( - id=f"{TEST_PYTHON_FILE}:LspBridge:96", - name="LspBridge", - kind="class", - file_path=str(TEST_PYTHON_FILE), - range=Range(start_line=96, start_character=6, end_line=96, end_character=15), - ), - ] - - print(f" Starting with {len(seeds)} seed(s)") - - builder = LspGraphBuilder(max_depth=2, max_nodes=50) - - async with LspBridge( - workspace_root=str(WORKSPACE_ROOT), - timeout=30.0, - ) as bridge: - graph = await builder.build_from_seeds(seeds, bridge) - - print(f" Expanded to {len(graph.nodes)} nodes") - - # Simulate conversion to SearchResult format - results = [] - for node_id, node in graph.nodes.items(): - if node.id not in [s.id for s in seeds]: # Exclude seeds - results.append({ - "path": node.file_path, - "symbol_name": node.name, - "symbol_kind": node.kind, - "start_line": node.range.start_line, - "end_line": node.range.end_line, - }) - - print(f" Generated {len(results)} search results (excluding seeds)") - - if results: - print(" Sample results:") - for r in results[:5]: - print(f" - {r['symbol_name']} ({r['symbol_kind']}) at {Path(r['path']).name}:{r['start_line']}") - - print(" ✓ Full pipeline completed") - - -# TypeScript tests (if available) -@pytest.mark.skipif( - 
not is_typescript_server_available() or not TEST_TYPESCRIPT_FILE.exists(), - reason="TypeScript language server or test file not available" -) -class TestRealTypescriptLspStandalone: - """Real tests against TypeScript files.""" - - @pytest.fixture - def ts_symbol(self): - """Create a symbol in the TypeScript extension file.""" - return CodeSymbolNode( - id=f"{TEST_TYPESCRIPT_FILE}:activate:12", - name="activate", - kind="function", - file_path=str(TEST_TYPESCRIPT_FILE), - range=Range(start_line=12, start_character=16, end_line=12, end_character=24), - ) - - @pytest.mark.asyncio - async def test_real_typescript_definition(self, ts_symbol): - """Test LSP definition lookup in TypeScript.""" - print(f"\n>>> Testing TypeScript definition for {ts_symbol.name}") - - async with LspBridge( - workspace_root=str(TEST_TYPESCRIPT_FILE.parent.parent), - timeout=30.0, - ) as bridge: - definition = await bridge.get_definition(ts_symbol) - - if definition: - print(f" ✓ Found: {definition.file_path}:{definition.line}") - else: - print(" ⚠ No definition found (TypeScript LSP may not be active)") - - @pytest.mark.asyncio - async def test_real_typescript_document_symbols(self): - """Test document symbols in TypeScript.""" - print(f"\n>>> Testing TypeScript document symbols") - - async with LspBridge( - workspace_root=str(TEST_TYPESCRIPT_FILE.parent.parent), - timeout=30.0, - ) as bridge: - symbols = await bridge.get_document_symbols(str(TEST_TYPESCRIPT_FILE)) - - print(f" Found {len(symbols)} symbols") - for sym in symbols[:5]: - print(f" - {sym.get('name')} ({sym.get('kind')})") - - # TypeScript files should have symbols - if symbols: - print(" ✓ TypeScript symbols retrieved") - else: - print(" ⚠ No symbols found (TypeScript LSP may not be active)") - - -if __name__ == "__main__": - # Allow running directly - if is_pyright_available(): - print("Pyright language server is available") - print("Running tests...") - pytest.main([__file__, "-v", "-s"]) - else: - print("=" * 60) - 
print("Pyright language server NOT available") - print("=" * 60) - print() - print("To run these tests:") - print("1. Install pyright: npm install -g pyright") - print("2. Install typescript-language-server: npm install -g typescript-language-server") - print("3. Run: pytest tests/real/ -v -s") - print() - sys.exit(1) diff --git a/codex-lens/tests/simple_validation.py b/codex-lens/tests/simple_validation.py deleted file mode 100644 index 5d881bba..00000000 --- a/codex-lens/tests/simple_validation.py +++ /dev/null @@ -1,218 +0,0 @@ -""" -Simple validation for performance optimizations (Windows-safe). -""" -import sys -sys.stdout.reconfigure(encoding='utf-8') - -import json -import sqlite3 -import tempfile -import time -from pathlib import Path - -from codexlens.storage.dir_index import DirIndexStore -from codexlens.storage.registry import RegistryStore - - -def main(): - print("=" * 60) - print("CodexLens Performance Optimizations - Simple Validation") - print("=" * 60) - - # Test 1: Keyword Normalization - print("\n[1/4] Testing Keyword Normalization...") - try: - tmpdir = tempfile.mkdtemp() - db_path = Path(tmpdir) / "test1.db" - - store = DirIndexStore(db_path) - store.initialize() - - file_id = store.add_file( - name="test.py", - full_path=Path(f"{tmpdir}/test.py"), - content="def hello(): pass", - language="python" - ) - - keywords = ["auth", "security", "jwt"] - store.add_semantic_metadata( - file_id=file_id, - summary="Test", - keywords=keywords, - purpose="Testing", - llm_tool="gemini" - ) - - # Check normalized tables - conn = store._get_connection() - count = conn.execute( - "SELECT COUNT(*) as c FROM file_keywords WHERE file_id=?", - (file_id,) - ).fetchone()["c"] - - store.close() - - assert count == 3, f"Expected 3 keywords, got {count}" - print(" PASS: Keywords stored in normalized tables") - - # Test optimized search - store = DirIndexStore(db_path) - results = store.search_semantic_keywords("auth", use_normalized=True) - store.close() - - assert 
len(results) == 1 - print(" PASS: Optimized keyword search works") - - except Exception as e: - import traceback - print(f" FAIL: {e}") - traceback.print_exc() - return 1 - - # Test 2: Path Lookup Optimization - print("\n[2/4] Testing Path Lookup Optimization...") - try: - tmpdir = tempfile.mkdtemp() - db_path = Path(tmpdir) / "test2.db" - - store = RegistryStore(db_path) - store.initialize() # Create schema - - # Register a project first - project = store.register_project( - source_root=Path("/a"), - index_root=Path("/tmp") - ) - - # Register directory - store.register_dir( - project_id=project.id, - source_path=Path("/a/b/c"), - index_path=Path("/tmp/index.db"), - depth=2, - files_count=0 - ) - - deep_path = Path("/a/b/c/d/e/f/g/h/i/j/file.py") - - start = time.perf_counter() - result = store.find_nearest_index(deep_path) - elapsed = time.perf_counter() - start - - store.close() - - assert result is not None, "No result found" - # Path is normalized, just check it contains the key parts - assert "a" in str(result.source_path) and "b" in str(result.source_path) and "c" in str(result.source_path) - assert elapsed < 0.05, f"Too slow: {elapsed*1000:.2f}ms" - - print(f" PASS: Found nearest index in {elapsed*1000:.2f}ms") - - except Exception as e: - import traceback - print(f" FAIL: {e}") - traceback.print_exc() - return 1 - - # Test 3: Symbol Search Prefix Mode - print("\n[3/4] Testing Symbol Search Prefix Mode...") - try: - tmpdir = tempfile.mkdtemp() - db_path = Path(tmpdir) / "test3.db" - - store = DirIndexStore(db_path) - store.initialize() - - from codexlens.entities import Symbol - file_id = store.add_file( - name="test.py", - full_path=Path(f"{tmpdir}/test.py"), - content="def hello(): pass\n" * 10, - language="python", - symbols=[ - Symbol(name="get_user", kind="function", range=(1, 5)), - Symbol(name="get_item", kind="function", range=(6, 10)), - Symbol(name="create_user", kind="function", range=(11, 15)), - ] - ) - - # Prefix search - results = 
store.search_symbols("get", prefix_mode=True) - store.close() - - assert len(results) == 2, f"Expected 2, got {len(results)}" - for symbol in results: - assert symbol.name.startswith("get") - - print(f" PASS: Prefix search found {len(results)} symbols") - - except Exception as e: - import traceback - print(f" FAIL: {e}") - traceback.print_exc() - return 1 - - # Test 4: Performance Comparison - print("\n[4/4] Testing Performance Comparison...") - try: - tmpdir = tempfile.mkdtemp() - db_path = Path(tmpdir) / "test4.db" - - store = DirIndexStore(db_path) - store.initialize() - - # Create 50 files with keywords - for i in range(50): - file_id = store.add_file( - name=f"file_{i}.py", - full_path=Path(f"{tmpdir}/file_{i}.py"), - content=f"def function_{i}(): pass", - language="python" - ) - - keywords = ["auth", "security"] if i % 2 == 0 else ["api", "endpoint"] - store.add_semantic_metadata( - file_id=file_id, - summary=f"File {i}", - keywords=keywords, - purpose="Testing", - llm_tool="gemini" - ) - - # Benchmark normalized - start = time.perf_counter() - for _ in range(5): - results_norm = store.search_semantic_keywords("auth", use_normalized=True) - norm_time = time.perf_counter() - start - - # Benchmark fallback - start = time.perf_counter() - for _ in range(5): - results_fallback = store.search_semantic_keywords("auth", use_normalized=False) - fallback_time = time.perf_counter() - start - - store.close() - - assert len(results_norm) == len(results_fallback) - speedup = fallback_time / norm_time if norm_time > 0 else 1.0 - - print(f" Normalized: {norm_time*1000:.2f}ms (5 iterations)") - print(f" Fallback: {fallback_time*1000:.2f}ms (5 iterations)") - print(f" Speedup: {speedup:.2f}x") - print(" PASS: Performance test completed") - - except Exception as e: - import traceback - print(f" FAIL: {e}") - traceback.print_exc() - return 1 - - print("\n" + "=" * 60) - print("ALL VALIDATION TESTS PASSED") - print("=" * 60) - return 0 - - -if __name__ == "__main__": - 
exit(main()) diff --git a/codex-lens/tests/test_ann_index.py b/codex-lens/tests/test_ann_index.py deleted file mode 100644 index 964f7a1a..00000000 --- a/codex-lens/tests/test_ann_index.py +++ /dev/null @@ -1,760 +0,0 @@ -"""Tests for ANN (Approximate Nearest Neighbor) index using HNSW.""" - -import tempfile -from pathlib import Path -from unittest.mock import patch - -import pytest - -# Skip all tests if semantic dependencies not available -pytest.importorskip("numpy") - - -def _hnswlib_available() -> bool: - """Check if hnswlib is available.""" - try: - import hnswlib - return True - except ImportError: - return False - - -class TestANNIndex: - """Test suite for ANNIndex class.""" - - @pytest.fixture - def temp_db(self): - """Create a temporary database file.""" - with tempfile.TemporaryDirectory() as tmpdir: - yield Path(tmpdir) / "_index.db" - - @pytest.fixture - def sample_vectors(self): - """Generate sample vectors for testing.""" - import numpy as np - np.random.seed(42) - # 100 vectors of dimension 384 (matches fast model) - return np.random.randn(100, 384).astype(np.float32) - - @pytest.fixture - def sample_ids(self): - """Generate sample IDs.""" - return list(range(1, 101)) - - def test_import_check(self): - """Test that HNSWLIB_AVAILABLE flag is set correctly.""" - try: - from codexlens.semantic.ann_index import HNSWLIB_AVAILABLE - # Should be True if hnswlib is installed, False otherwise - assert isinstance(HNSWLIB_AVAILABLE, bool) - except ImportError: - pytest.skip("ann_index module not available") - - @pytest.mark.skipif( - not _hnswlib_available(), - reason="hnswlib not installed" - ) - def test_create_index(self, temp_db): - """Test creating a new ANN index.""" - from codexlens.semantic.ann_index import ANNIndex - - index = ANNIndex(temp_db, dim=384) - assert index.dim == 384 - assert index.count() == 0 - assert not index.is_loaded - - @pytest.mark.skipif( - not _hnswlib_available(), - reason="hnswlib not installed" - ) - def test_add_vectors(self, 
temp_db, sample_vectors, sample_ids): - """Test adding vectors to the index.""" - from codexlens.semantic.ann_index import ANNIndex - - index = ANNIndex(temp_db, dim=384) - index.add_vectors(sample_ids, sample_vectors) - - assert index.count() == 100 - assert index.is_loaded - - @pytest.mark.skipif( - not _hnswlib_available(), - reason="hnswlib not installed" - ) - def test_search(self, temp_db, sample_vectors, sample_ids): - """Test searching for similar vectors.""" - from codexlens.semantic.ann_index import ANNIndex - - index = ANNIndex(temp_db, dim=384) - index.add_vectors(sample_ids, sample_vectors) - - # Search for the first vector - should find itself - query = sample_vectors[0] - ids, distances = index.search(query, top_k=5) - - assert len(ids) == 5 - assert len(distances) == 5 - # First result should be the query vector itself (or very close) - assert ids[0] == 1 # ID of first vector - assert distances[0] < 0.01 # Very small distance (almost identical) - - @pytest.mark.skipif( - not _hnswlib_available(), - reason="hnswlib not installed" - ) - def test_search_clamps_top_k_to_available_vectors(self, temp_db, sample_vectors, sample_ids): - """Search should clamp top_k to the loaded vector count.""" - from codexlens.semantic.ann_index import ANNIndex - - index = ANNIndex(temp_db, dim=384) - index.add_vectors(sample_ids[:3], sample_vectors[:3]) - - ids, distances = index.search(sample_vectors[0], top_k=10) - - assert len(ids) == 3 - assert len(distances) == 3 - assert ids[0] == 1 - - @pytest.mark.skipif( - not _hnswlib_available(), - reason="hnswlib not installed" - ) - def test_save_and_load(self, temp_db, sample_vectors, sample_ids): - """Test saving and loading index from disk.""" - from codexlens.semantic.ann_index import ANNIndex - - # Create and save index - index1 = ANNIndex(temp_db, dim=384) - index1.add_vectors(sample_ids, sample_vectors) - index1.save() - - # Check that file was created (new naming: {db_stem}_vectors.hnsw) - hnsw_path = temp_db.parent 
/ f"{temp_db.stem}_vectors.hnsw" - assert hnsw_path.exists() - - # Load in new instance - index2 = ANNIndex(temp_db, dim=384) - loaded = index2.load() - - assert loaded is True - assert index2.count() == 100 - assert index2.is_loaded - - # Verify search still works - query = sample_vectors[0] - ids, distances = index2.search(query, top_k=5) - assert ids[0] == 1 - - @pytest.mark.skipif( - not _hnswlib_available(), - reason="hnswlib not installed" - ) - def test_load_nonexistent(self, temp_db): - """Test loading when index file doesn't exist.""" - from codexlens.semantic.ann_index import ANNIndex - - index = ANNIndex(temp_db, dim=384) - loaded = index.load() - - assert loaded is False - assert not index.is_loaded - - @pytest.mark.skipif( - not _hnswlib_available(), - reason="hnswlib not installed" - ) - def test_remove_vectors(self, temp_db, sample_vectors, sample_ids): - """Test removing vectors from the index.""" - from codexlens.semantic.ann_index import ANNIndex - - index = ANNIndex(temp_db, dim=384) - index.add_vectors(sample_ids, sample_vectors) - - # Remove first 10 vectors - index.remove_vectors(list(range(1, 11))) - - # Search for removed vector - should not be in results - query = sample_vectors[0] - ids, distances = index.search(query, top_k=5) - - # ID 1 should not be in results (soft deleted) - assert 1 not in ids - - @pytest.mark.skipif( - not _hnswlib_available(), - reason="hnswlib not installed" - ) - def test_incremental_add(self, temp_db): - """Test adding vectors incrementally.""" - import numpy as np - from codexlens.semantic.ann_index import ANNIndex - - index = ANNIndex(temp_db, dim=384) - - # Add first batch - vectors1 = np.random.randn(50, 384).astype(np.float32) - index.add_vectors(list(range(1, 51)), vectors1) - assert index.count() == 50 - - # Add second batch - vectors2 = np.random.randn(50, 384).astype(np.float32) - index.add_vectors(list(range(51, 101)), vectors2) - assert index.count() == 100 - - @pytest.mark.skipif( - not 
_hnswlib_available(), - reason="hnswlib not installed" - ) - def test_search_empty_index(self, temp_db): - """Test searching an empty index.""" - import numpy as np - from codexlens.semantic.ann_index import ANNIndex - - index = ANNIndex(temp_db, dim=384) - query = np.random.randn(384).astype(np.float32) - - ids, distances = index.search(query, top_k=5) - - assert ids == [] - assert distances == [] - - @pytest.mark.skipif( - not _hnswlib_available(), - reason="hnswlib not installed" - ) - def test_invalid_dimension(self, temp_db, sample_vectors, sample_ids): - """Test adding vectors with wrong dimension.""" - import numpy as np - from codexlens.semantic.ann_index import ANNIndex - - index = ANNIndex(temp_db, dim=384) - - # Try to add vectors with wrong dimension - wrong_vectors = np.random.randn(10, 768).astype(np.float32) - with pytest.raises(ValueError, match="dimension"): - index.add_vectors(list(range(1, 11)), wrong_vectors) - - @pytest.mark.skipif( - not _hnswlib_available(), - reason="hnswlib not installed" - ) - def test_auto_resize(self, temp_db): - """Test that index automatically resizes when capacity is exceeded.""" - import numpy as np - from codexlens.semantic.ann_index import ANNIndex - - index = ANNIndex(temp_db, dim=384) - # Override initial capacity to test resize - index._max_elements = 100 - - # Add more vectors than initial capacity - vectors = np.random.randn(150, 384).astype(np.float32) - index.add_vectors(list(range(1, 151)), vectors) - - assert index.count() == 150 - assert index._max_elements >= 150 - - -class TestVectorStoreWithANN: - """Test VectorStore integration with ANN index.""" - - @pytest.fixture - def temp_db(self): - """Create a temporary database file.""" - with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as tmpdir: - yield Path(tmpdir) / "_index.db" - - @pytest.fixture - def sample_chunks(self): - """Create sample semantic chunks with embeddings.""" - import numpy as np - from codexlens.entities import SemanticChunk 
- - np.random.seed(42) - chunks = [] - for i in range(10): - chunk = SemanticChunk( - content=f"def function_{i}(): pass", - metadata={"symbol_name": f"function_{i}", "symbol_kind": "function"}, - ) - chunk.embedding = np.random.randn(384).astype(np.float32).tolist() - chunks.append(chunk) - return chunks - - def test_vector_store_with_ann(self, temp_db, sample_chunks): - """Test VectorStore using ANN index for search.""" - from codexlens.semantic.vector_store import VectorStore, HNSWLIB_AVAILABLE - - store = VectorStore(temp_db) - - # Add chunks - ids = store.add_chunks(sample_chunks, "test.py") - assert len(ids) == 10 - - # Check ANN status - if HNSWLIB_AVAILABLE: - assert store.ann_available or store.ann_count >= 0 - - # Search - query_embedding = sample_chunks[0].embedding - results = store.search_similar(query_embedding, top_k=5) - - assert len(results) <= 5 - if results: - # First result should have high similarity - assert results[0].score > 0.9 - - def test_vector_store_rebuild_ann(self, temp_db, sample_chunks): - """Test rebuilding ANN index from SQLite data.""" - from codexlens.semantic.vector_store import VectorStore, HNSWLIB_AVAILABLE - - if not HNSWLIB_AVAILABLE: - pytest.skip("hnswlib not installed") - - store = VectorStore(temp_db) - - # Add chunks - store.add_chunks(sample_chunks, "test.py") - - # Rebuild ANN index - count = store.rebuild_ann_index() - assert count == 10 - - # Verify search works - query_embedding = sample_chunks[0].embedding - results = store.search_similar(query_embedding, top_k=5) - assert len(results) > 0 - - def test_vector_store_delete_updates_ann(self, temp_db, sample_chunks): - """Test that deleting chunks updates ANN index.""" - from codexlens.semantic.vector_store import VectorStore, HNSWLIB_AVAILABLE - - if not HNSWLIB_AVAILABLE: - pytest.skip("hnswlib not installed") - - store = VectorStore(temp_db) - - # Add chunks for two files - store.add_chunks(sample_chunks[:5], "file1.py") - store.add_chunks(sample_chunks[5:], 
"file2.py") - - initial_count = store.count_chunks() - assert initial_count == 10 - - # Delete one file's chunks - deleted = store.delete_file_chunks("file1.py") - assert deleted == 5 - - # Verify count - assert store.count_chunks() == 5 - - def test_vector_store_batch_add(self, temp_db, sample_chunks): - """Test batch adding chunks from multiple files.""" - from codexlens.semantic.vector_store import VectorStore - - store = VectorStore(temp_db) - - # Prepare chunks with paths - chunks_with_paths = [ - (chunk, f"file{i % 3}.py") - for i, chunk in enumerate(sample_chunks) - ] - - # Batch add - ids = store.add_chunks_batch(chunks_with_paths) - assert len(ids) == 10 - - # Verify - assert store.count_chunks() == 10 - - def test_vector_store_fallback_search(self, temp_db, sample_chunks): - """Test that search falls back to brute-force when ANN unavailable.""" - from codexlens.semantic.vector_store import VectorStore - - store = VectorStore(temp_db) - store.add_chunks(sample_chunks, "test.py") - - # Force disable ANN - store._ann_index = None - - # Search should still work (brute-force fallback) - query_embedding = sample_chunks[0].embedding - results = store.search_similar(query_embedding, top_k=5) - - assert len(results) > 0 - assert results[0].score > 0.9 - - -class TestSearchAccuracy: - """Test search accuracy comparing ANN vs brute-force.""" - - @pytest.fixture - def temp_db(self): - """Create a temporary database file.""" - with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as tmpdir: - yield Path(tmpdir) / "_index.db" - - @pytest.mark.skipif( - not _hnswlib_available(), - reason="hnswlib not installed" - ) - def test_ann_vs_brute_force_recall(self, temp_db): - """Test that ANN search has high recall compared to brute-force.""" - import numpy as np - from codexlens.entities import SemanticChunk - from codexlens.semantic.vector_store import VectorStore - - np.random.seed(42) - - # Create larger dataset - chunks = [] - for i in range(100): - chunk = 
SemanticChunk( - content=f"code block {i}", - metadata={"chunk_id": i}, - ) - chunk.embedding = np.random.randn(384).astype(np.float32).tolist() - chunks.append(chunk) - - store = VectorStore(temp_db) - store.add_chunks(chunks, "test.py") - - # Get brute-force results - store._ann_index = None # Force brute-force - store._invalidate_cache() # Clear cache to force refresh - query = chunks[0].embedding - bf_results = store.search_similar(query, top_k=10) - # Use chunk_id from metadata for comparison (more reliable than path+score) - bf_chunk_ids = {r.metadata.get("chunk_id") for r in bf_results} - - # Rebuild ANN and get ANN results - store.rebuild_ann_index() - ann_results = store.search_similar(query, top_k=10) - ann_chunk_ids = {r.metadata.get("chunk_id") for r in ann_results} - - # Calculate recall (how many brute-force results are in ANN results) - # ANN should find at least 80% of the same results - overlap = len(bf_chunk_ids & ann_chunk_ids) - recall = overlap / len(bf_chunk_ids) if bf_chunk_ids else 1.0 - - assert recall >= 0.8, f"ANN recall too low: {recall} (overlap: {overlap}, bf: {bf_chunk_ids}, ann: {ann_chunk_ids})" - - - -class TestBinaryANNIndex: - """Test suite for BinaryANNIndex class (Hamming distance-based search).""" - - @pytest.fixture - def temp_db(self): - """Create a temporary database file.""" - with tempfile.TemporaryDirectory() as tmpdir: - yield Path(tmpdir) / "_index.db" - - @pytest.fixture - def sample_binary_vectors(self): - """Generate sample binary vectors for testing.""" - import numpy as np - np.random.seed(42) - # 100 binary vectors of dimension 256 (packed as 32 bytes each) - binary_unpacked = (np.random.rand(100, 256) > 0.5).astype(np.uint8) - packed = [np.packbits(v).tobytes() for v in binary_unpacked] - return packed, binary_unpacked - - @pytest.fixture - def sample_ids(self): - """Generate sample IDs.""" - return list(range(1, 101)) - - def test_create_binary_index(self, temp_db): - """Test creating a new Binary ANN index.""" 
- from codexlens.semantic.ann_index import BinaryANNIndex - - index = BinaryANNIndex(temp_db, dim=256) - assert index.dim == 256 - assert index.packed_dim == 32 - assert index.count() == 0 - assert not index.is_loaded - - def test_invalid_dimension(self, temp_db): - """Test that invalid dimensions are rejected.""" - from codexlens.semantic.ann_index import BinaryANNIndex - - # Dimension must be divisible by 8 - with pytest.raises(ValueError, match="divisible by 8"): - BinaryANNIndex(temp_db, dim=255) - - with pytest.raises(ValueError, match="positive"): - BinaryANNIndex(temp_db, dim=0) - - def test_add_packed_vectors(self, temp_db, sample_binary_vectors, sample_ids): - """Test adding packed binary vectors to the index.""" - from codexlens.semantic.ann_index import BinaryANNIndex - - packed, _ = sample_binary_vectors - index = BinaryANNIndex(temp_db, dim=256) - index.add_vectors(sample_ids, packed) - - assert index.count() == 100 - assert index.is_loaded - - def test_add_numpy_vectors(self, temp_db, sample_binary_vectors, sample_ids): - """Test adding unpacked numpy binary vectors.""" - from codexlens.semantic.ann_index import BinaryANNIndex - import numpy as np - - _, unpacked = sample_binary_vectors - index = BinaryANNIndex(temp_db, dim=256) - index.add_vectors_numpy(sample_ids, unpacked) - - assert index.count() == 100 - - def test_search_packed(self, temp_db, sample_binary_vectors, sample_ids): - """Test searching with packed binary query.""" - from codexlens.semantic.ann_index import BinaryANNIndex - - packed, _ = sample_binary_vectors - index = BinaryANNIndex(temp_db, dim=256) - index.add_vectors(sample_ids, packed) - - # Search for the first vector - should find itself with distance 0 - query = packed[0] - ids, distances = index.search(query, top_k=5) - - assert len(ids) == 5 - assert len(distances) == 5 - # First result should be the query vector itself - assert ids[0] == 1 - assert distances[0] == 0 # Hamming distance of 0 (identical) - - def 
test_search_numpy(self, temp_db, sample_binary_vectors, sample_ids): - """Test searching with unpacked numpy query.""" - from codexlens.semantic.ann_index import BinaryANNIndex - - packed, unpacked = sample_binary_vectors - index = BinaryANNIndex(temp_db, dim=256) - index.add_vectors(sample_ids, packed) - - # Search for the first vector using numpy interface - query = unpacked[0] - ids, distances = index.search_numpy(query, top_k=5) - - assert len(ids) == 5 - assert ids[0] == 1 - assert distances[0] == 0 - - def test_search_batch(self, temp_db, sample_binary_vectors, sample_ids): - """Test batch search with multiple queries.""" - from codexlens.semantic.ann_index import BinaryANNIndex - - packed, _ = sample_binary_vectors - index = BinaryANNIndex(temp_db, dim=256) - index.add_vectors(sample_ids, packed) - - # Search for first 3 vectors - queries = packed[:3] - results = index.search_batch(queries, top_k=5) - - assert len(results) == 3 - # Each result should find itself first - for i, (ids, dists) in enumerate(results): - assert ids[0] == i + 1 - assert dists[0] == 0 - - def test_hamming_distance_ordering(self, temp_db): - """Test that results are ordered by Hamming distance.""" - from codexlens.semantic.ann_index import BinaryANNIndex - import numpy as np - - index = BinaryANNIndex(temp_db, dim=256) - - # Create vectors with known Hamming distances from a query - query = np.zeros(256, dtype=np.uint8) # All zeros - v1 = np.zeros(256, dtype=np.uint8) # Distance 0 - v2 = np.zeros(256, dtype=np.uint8); v2[:10] = 1 # Distance 10 - v3 = np.zeros(256, dtype=np.uint8); v3[:50] = 1 # Distance 50 - v4 = np.ones(256, dtype=np.uint8) # Distance 256 - - index.add_vectors_numpy([1, 2, 3, 4], np.array([v1, v2, v3, v4])) - - query_packed = np.packbits(query).tobytes() - ids, distances = index.search(query_packed, top_k=4) - - assert ids == [1, 2, 3, 4] - assert distances == [0, 10, 50, 256] - - def test_save_and_load(self, temp_db, sample_binary_vectors, sample_ids): - """Test 
saving and loading binary index from disk.""" - from codexlens.semantic.ann_index import BinaryANNIndex - - packed, _ = sample_binary_vectors - - # Create and save index - index1 = BinaryANNIndex(temp_db, dim=256) - index1.add_vectors(sample_ids, packed) - index1.save() - - # Check that file was created - binary_path = temp_db.parent / f"{temp_db.stem}_binary_vectors.bin" - assert binary_path.exists() - - # Load in new instance - index2 = BinaryANNIndex(temp_db, dim=256) - loaded = index2.load() - - assert loaded is True - assert index2.count() == 100 - assert index2.is_loaded - - # Verify search still works - query = packed[0] - ids, distances = index2.search(query, top_k=5) - assert ids[0] == 1 - assert distances[0] == 0 - - def test_load_nonexistent(self, temp_db): - """Test loading when index file doesn't exist.""" - from codexlens.semantic.ann_index import BinaryANNIndex - - index = BinaryANNIndex(temp_db, dim=256) - loaded = index.load() - - assert loaded is False - assert not index.is_loaded - - def test_remove_vectors(self, temp_db, sample_binary_vectors, sample_ids): - """Test removing vectors from the index.""" - from codexlens.semantic.ann_index import BinaryANNIndex - - packed, _ = sample_binary_vectors - index = BinaryANNIndex(temp_db, dim=256) - index.add_vectors(sample_ids, packed) - - # Remove first 10 vectors - index.remove_vectors(list(range(1, 11))) - - assert index.count() == 90 - - # Removed vectors should not be findable - query = packed[0] - ids, _ = index.search(query, top_k=100) - for removed_id in range(1, 11): - assert removed_id not in ids - - def test_get_vector(self, temp_db, sample_binary_vectors, sample_ids): - """Test retrieving a specific vector by ID.""" - from codexlens.semantic.ann_index import BinaryANNIndex - - packed, _ = sample_binary_vectors - index = BinaryANNIndex(temp_db, dim=256) - index.add_vectors(sample_ids, packed) - - # Get existing vector - vec = index.get_vector(1) - assert vec == packed[0] - - # Get non-existing 
vector - vec = index.get_vector(9999) - assert vec is None - - def test_clear(self, temp_db, sample_binary_vectors, sample_ids): - """Test clearing all vectors from the index.""" - from codexlens.semantic.ann_index import BinaryANNIndex - - packed, _ = sample_binary_vectors - index = BinaryANNIndex(temp_db, dim=256) - index.add_vectors(sample_ids, packed) - assert index.count() == 100 - - index.clear() - assert index.count() == 0 - assert not index.is_loaded - - def test_search_empty_index(self, temp_db): - """Test searching an empty index.""" - from codexlens.semantic.ann_index import BinaryANNIndex - import numpy as np - - index = BinaryANNIndex(temp_db, dim=256) - query = np.packbits(np.zeros(256, dtype=np.uint8)).tobytes() - - ids, distances = index.search(query, top_k=5) - - assert ids == [] - assert distances == [] - - def test_update_existing_vector(self, temp_db): - """Test updating an existing vector with new data.""" - from codexlens.semantic.ann_index import BinaryANNIndex - import numpy as np - - index = BinaryANNIndex(temp_db, dim=256) - - # Add initial vector - v1 = np.zeros(256, dtype=np.uint8) - index.add_vectors_numpy([1], v1.reshape(1, -1)) - - # Update with different vector - v2 = np.ones(256, dtype=np.uint8) - index.add_vectors_numpy([1], v2.reshape(1, -1)) - - # Count should still be 1 - assert index.count() == 1 - - # Retrieved vector should be the updated one - stored = index.get_vector(1) - expected = np.packbits(v2).tobytes() - assert stored == expected - - -class TestCreateAnnIndexFactory: - """Test suite for create_ann_index factory function.""" - - @pytest.fixture - def temp_db(self): - """Create a temporary database file.""" - with tempfile.TemporaryDirectory() as tmpdir: - yield Path(tmpdir) / "_index.db" - - @pytest.mark.skipif( - not _hnswlib_available(), - reason="hnswlib not installed" - ) - def test_create_hnsw_index(self, temp_db): - """Test creating HNSW index via factory.""" - from codexlens.semantic.ann_index import 
create_ann_index, ANNIndex - - index = create_ann_index(temp_db, index_type="hnsw", dim=384) - assert isinstance(index, ANNIndex) - assert index.dim == 384 - - def test_create_binary_index(self, temp_db): - """Test creating binary index via factory.""" - from codexlens.semantic.ann_index import create_ann_index, BinaryANNIndex - - index = create_ann_index(temp_db, index_type="binary", dim=256) - assert isinstance(index, BinaryANNIndex) - assert index.dim == 256 - - def test_create_binary_index_default_dim(self, temp_db): - """Test that binary index defaults to 256 dim when dense default is used.""" - from codexlens.semantic.ann_index import create_ann_index, BinaryANNIndex - - # When dim=2048 (dense default) is passed with binary type, - # it should auto-adjust to 256 - index = create_ann_index(temp_db, index_type="binary") - assert isinstance(index, BinaryANNIndex) - assert index.dim == 256 - - def test_invalid_index_type(self, temp_db): - """Test that invalid index type raises error.""" - from codexlens.semantic.ann_index import create_ann_index - - with pytest.raises(ValueError, match="Invalid index_type"): - create_ann_index(temp_db, index_type="invalid") - - def test_case_insensitive_index_type(self, temp_db): - """Test that index_type is case-insensitive.""" - from codexlens.semantic.ann_index import create_ann_index, BinaryANNIndex - - index = create_ann_index(temp_db, index_type="BINARY", dim=256) - assert isinstance(index, BinaryANNIndex) diff --git a/codex-lens/tests/test_api_reranker.py b/codex-lens/tests/test_api_reranker.py deleted file mode 100644 index 58ff3a4a..00000000 --- a/codex-lens/tests/test_api_reranker.py +++ /dev/null @@ -1,200 +0,0 @@ -"""Tests for APIReranker backend.""" - -from __future__ import annotations - -import sys -import types -from typing import Any - -import pytest - -from codexlens.semantic.reranker import get_reranker -from codexlens.semantic.reranker.api_reranker import APIReranker - - -class DummyResponse: - def __init__( - 
self, - *, - status_code: int = 200, - json_data: Any = None, - text: str = "", - headers: dict[str, str] | None = None, - ) -> None: - self.status_code = int(status_code) - self._json_data = json_data - self.text = text - self.headers = headers or {} - - def json(self) -> Any: - return self._json_data - - -class DummyClient: - def __init__(self, *, base_url: str | None = None, headers: dict[str, str] | None = None, timeout: float | None = None) -> None: - self.base_url = base_url - self.headers = headers or {} - self.timeout = timeout - self.closed = False - self.calls: list[dict[str, Any]] = [] - self._responses: list[DummyResponse] = [] - - def queue(self, response: DummyResponse) -> None: - self._responses.append(response) - - def post(self, endpoint: str, *, json: dict[str, Any] | None = None) -> DummyResponse: - self.calls.append({"endpoint": endpoint, "json": json}) - if not self._responses: - raise AssertionError("DummyClient has no queued responses") - return self._responses.pop(0) - - def close(self) -> None: - self.closed = True - - -@pytest.fixture -def httpx_clients(monkeypatch: pytest.MonkeyPatch) -> list[DummyClient]: - clients: list[DummyClient] = [] - - dummy_httpx = types.ModuleType("httpx") - - def Client(*, base_url: str | None = None, headers: dict[str, str] | None = None, timeout: float | None = None) -> DummyClient: - client = DummyClient(base_url=base_url, headers=headers, timeout=timeout) - clients.append(client) - return client - - dummy_httpx.Client = Client - monkeypatch.setitem(sys.modules, "httpx", dummy_httpx) - - return clients - - -def test_api_reranker_requires_api_key( - monkeypatch: pytest.MonkeyPatch, httpx_clients: list[DummyClient] -) -> None: - # Force empty key in-process so the reranker does not fall back to any - # workspace/global .env configuration that may exist on the machine. 
- monkeypatch.setenv("RERANKER_API_KEY", "") - monkeypatch.setenv("CODEXLENS_RERANKER_API_KEY", "") - - with pytest.raises(ValueError, match="Missing API key"): - APIReranker() - - assert httpx_clients == [] - - -def test_api_reranker_reads_api_key_from_env( - monkeypatch: pytest.MonkeyPatch, httpx_clients: list[DummyClient] -) -> None: - monkeypatch.setenv("RERANKER_API_KEY", "test-key") - - reranker = APIReranker() - assert len(httpx_clients) == 1 - assert httpx_clients[0].headers["Authorization"] == "Bearer test-key" - reranker.close() - assert httpx_clients[0].closed is True - - -def test_api_reranker_strips_v1_from_api_base_to_avoid_double_v1( - monkeypatch: pytest.MonkeyPatch, httpx_clients: list[DummyClient] -) -> None: - monkeypatch.setenv("RERANKER_API_KEY", "test-key") - - reranker = APIReranker(api_base="https://api.siliconflow.cn/v1", provider="siliconflow") - assert len(httpx_clients) == 1 - # Endpoint already includes /v1, so api_base should not. - assert httpx_clients[0].base_url == "https://api.siliconflow.cn" - reranker.close() - - -def test_api_reranker_strips_endpoint_from_api_base_to_avoid_double_endpoint( - monkeypatch: pytest.MonkeyPatch, httpx_clients: list[DummyClient] -) -> None: - monkeypatch.setenv("RERANKER_API_KEY", "test-key") - - reranker = APIReranker(api_base="https://api.siliconflow.cn/v1/rerank", provider="siliconflow") - assert len(httpx_clients) == 1 - # If api_base already includes the endpoint suffix, strip it. - assert httpx_clients[0].base_url == "https://api.siliconflow.cn" - reranker.close() - - -def test_api_reranker_scores_pairs_siliconflow( - monkeypatch: pytest.MonkeyPatch, httpx_clients: list[DummyClient] -) -> None: - monkeypatch.delenv("RERANKER_API_KEY", raising=False) - # Avoid picking up any machine-local default model from global .env. 
- monkeypatch.setenv("RERANKER_MODEL", "") - monkeypatch.setenv("CODEXLENS_RERANKER_MODEL", "") - - reranker = APIReranker(api_key="k", provider="siliconflow") - client = httpx_clients[0] - - client.queue( - DummyResponse( - json_data={ - "results": [ - {"index": 0, "relevance_score": 0.9}, - {"index": 1, "relevance_score": 0.1}, - ] - } - ) - ) - - scores = reranker.score_pairs([("q", "d1"), ("q", "d2")]) - assert scores == pytest.approx([0.9, 0.1]) - - assert client.calls[0]["endpoint"] == "/v1/rerank" - payload = client.calls[0]["json"] - assert payload["model"] == "BAAI/bge-reranker-v2-m3" - assert payload["query"] == "q" - assert payload["documents"] == ["d1", "d2"] - assert payload["top_n"] == 2 - assert payload["return_documents"] is False - - -def test_api_reranker_retries_on_5xx( - monkeypatch: pytest.MonkeyPatch, httpx_clients: list[DummyClient] -) -> None: - monkeypatch.setenv("RERANKER_API_KEY", "k") - - from codexlens.semantic.reranker import api_reranker as api_reranker_module - - monkeypatch.setattr(api_reranker_module.time, "sleep", lambda *_args, **_kwargs: None) - - reranker = APIReranker(max_retries=1) - client = httpx_clients[0] - - client.queue(DummyResponse(status_code=500, text="oops", json_data={"error": "oops"})) - client.queue( - DummyResponse( - json_data={"results": [{"index": 0, "relevance_score": 0.7}]}, - ) - ) - - scores = reranker.score_pairs([("q", "d")]) - assert scores == pytest.approx([0.7]) - assert len(client.calls) == 2 - - -def test_api_reranker_unauthorized_raises( - monkeypatch: pytest.MonkeyPatch, httpx_clients: list[DummyClient] -) -> None: - monkeypatch.setenv("RERANKER_API_KEY", "k") - - reranker = APIReranker() - client = httpx_clients[0] - client.queue(DummyResponse(status_code=401, text="unauthorized")) - - with pytest.raises(RuntimeError, match="unauthorized"): - reranker.score_pairs([("q", "d")]) - - -def test_factory_api_backend_constructs_reranker( - monkeypatch: pytest.MonkeyPatch, httpx_clients: 
list[DummyClient] -) -> None: - monkeypatch.setenv("RERANKER_API_KEY", "k") - - reranker = get_reranker(backend="api") - assert isinstance(reranker, APIReranker) - assert len(httpx_clients) == 1 diff --git a/codex-lens/tests/test_association_tree.py b/codex-lens/tests/test_association_tree.py deleted file mode 100644 index ea947d80..00000000 --- a/codex-lens/tests/test_association_tree.py +++ /dev/null @@ -1,400 +0,0 @@ -"""Unit tests for association tree building and deduplication. - -Tests the AssociationTreeBuilder and ResultDeduplicator components using -mocked LSP responses. -""" - -from __future__ import annotations - -import asyncio -from typing import Any, Dict, List -from unittest.mock import AsyncMock, MagicMock, patch - -import pytest - -from codexlens.hybrid_search.data_structures import CallHierarchyItem, Range -from codexlens.search.association_tree import ( - AssociationTreeBuilder, - CallTree, - ResultDeduplicator, - TreeNode, - UniqueNode, -) - - -class MockLspManager: - """Mock LSP manager for testing.""" - - def __init__(self): - """Initialize mock with empty responses.""" - self.call_hierarchy_items: Dict[str, List[Dict]] = {} - self.incoming_calls: Dict[str, List[Dict]] = {} - self.outgoing_calls: Dict[str, List[Dict]] = {} - - async def get_call_hierarchy_items( - self, file_path: str, line: int, character: int, wait_for_analysis: float = 0.0 - ) -> List[Dict]: - """Mock get_call_hierarchy_items.""" - key = f"{file_path}:{line}:{character}" - return self.call_hierarchy_items.get(key, []) - - async def get_incoming_calls(self, item: Dict[str, Any]) -> List[Dict]: - """Mock get_incoming_calls.""" - name = item.get("name", "") - return self.incoming_calls.get(name, []) - - async def get_outgoing_calls(self, item: Dict[str, Any]) -> List[Dict]: - """Mock get_outgoing_calls.""" - name = item.get("name", "") - return self.outgoing_calls.get(name, []) - - -def create_mock_item( - name: str, - file_path: str, - start_line: int, - end_line: int, - 
kind: str = "function", -) -> Dict[str, Any]: - """Create a mock CallHierarchyItem dict. - - Args: - name: Symbol name - file_path: File path - start_line: Start line (0-based for LSP) - end_line: End line (0-based for LSP) - kind: Symbol kind - - Returns: - LSP CallHierarchyItem dict - """ - return { - "name": name, - "kind": kind, - "uri": f"file:///{file_path}", - "range": { - "start": {"line": start_line, "character": 0}, - "end": {"line": end_line, "character": 0}, - }, - "detail": f"def {name}(...)", - } - - -@pytest.mark.asyncio -async def test_simple_tree_building(): - """Test building a simple tree with one root and one callee.""" - mock_lsp = MockLspManager() - - # Root function - root_item = create_mock_item("main", "test.py", 10, 15) - - # Callee function - callee_item = create_mock_item("helper", "test.py", 20, 25) - - # Setup mock responses - mock_lsp.call_hierarchy_items["test.py:11:1"] = [root_item] - mock_lsp.outgoing_calls["main"] = [{"to": callee_item}] - mock_lsp.incoming_calls["main"] = [] - mock_lsp.outgoing_calls["helper"] = [] - mock_lsp.incoming_calls["helper"] = [] - - # Build tree - builder = AssociationTreeBuilder(mock_lsp) - tree = await builder.build_tree( - seed_file_path="test.py", - seed_line=11, - seed_character=1, - max_depth=2, - expand_callers=False, - expand_callees=True, - ) - - # Assertions - assert len(tree.roots) == 1 - assert tree.roots[0].item.name == "main" - assert len(tree.roots[0].children) == 1 - assert tree.roots[0].children[0].item.name == "helper" - assert len(tree.all_nodes) == 2 - - -@pytest.mark.asyncio -async def test_tree_with_cycle_detection(): - """Test that cycles are properly detected and marked.""" - mock_lsp = MockLspManager() - - # Create circular reference: A -> B -> A - item_a = create_mock_item("func_a", "test.py", 10, 15) - item_b = create_mock_item("func_b", "test.py", 20, 25) - - # Setup mock responses - mock_lsp.call_hierarchy_items["test.py:11:1"] = [item_a] - mock_lsp.outgoing_calls["func_a"] 
= [{"to": item_b}] - mock_lsp.outgoing_calls["func_b"] = [{"to": item_a}] # Cycle - mock_lsp.incoming_calls["func_a"] = [] - mock_lsp.incoming_calls["func_b"] = [] - - # Build tree - builder = AssociationTreeBuilder(mock_lsp) - tree = await builder.build_tree( - seed_file_path="test.py", - seed_line=11, - seed_character=1, - max_depth=5, - expand_callers=False, - expand_callees=True, - ) - - # Should have 2 unique nodes (func_a and func_b) - assert len(tree.all_nodes) == 2 - - # func_b should have a cycle child pointing back to func_a - func_b_node = None - for node in tree.node_list: - if node.item.name == "func_b": - func_b_node = node - break - - assert func_b_node is not None - assert len(func_b_node.children) == 1 - assert func_b_node.children[0].is_cycle - assert func_b_node.children[0].item.name == "func_a" - - -@pytest.mark.asyncio -async def test_max_depth_limit(): - """Test that expansion stops at max_depth.""" - mock_lsp = MockLspManager() - - # Chain: A -> B -> C -> D - items = { - "A": create_mock_item("func_a", "test.py", 10, 15), - "B": create_mock_item("func_b", "test.py", 20, 25), - "C": create_mock_item("func_c", "test.py", 30, 35), - "D": create_mock_item("func_d", "test.py", 40, 45), - } - - mock_lsp.call_hierarchy_items["test.py:11:1"] = [items["A"]] - mock_lsp.outgoing_calls["func_a"] = [{"to": items["B"]}] - mock_lsp.outgoing_calls["func_b"] = [{"to": items["C"]}] - mock_lsp.outgoing_calls["func_c"] = [{"to": items["D"]}] - mock_lsp.outgoing_calls["func_d"] = [] - - for name in ["func_a", "func_b", "func_c", "func_d"]: - mock_lsp.incoming_calls[name] = [] - - # Build tree with max_depth=2 - builder = AssociationTreeBuilder(mock_lsp) - tree = await builder.build_tree( - seed_file_path="test.py", - seed_line=11, - max_depth=2, - expand_callers=False, - expand_callees=True, - ) - - # Should only have nodes A, B, C (depths 0, 1, 2) - # D should not be included (would be depth 3) - assert len(tree.all_nodes) == 3 - node_names = {node.item.name for 
node in tree.node_list} - assert "func_a" in node_names - assert "func_b" in node_names - assert "func_c" in node_names - assert "func_d" not in node_names - - -@pytest.mark.asyncio -async def test_empty_tree(): - """Test building tree when no call hierarchy items found.""" - mock_lsp = MockLspManager() - - # No items configured - builder = AssociationTreeBuilder(mock_lsp) - tree = await builder.build_tree( - seed_file_path="test.py", - seed_line=11, - max_depth=2, - ) - - # Should have empty tree - assert len(tree.roots) == 0 - assert len(tree.all_nodes) == 0 - - -def test_deduplication_basic(): - """Test basic deduplication of tree nodes.""" - # Create test tree with duplicate nodes - tree = CallTree() - - # Same function appearing at different depths via different paths - # This simulates the real scenario where a function appears multiple times - # in a call tree (e.g., reached from different callers) - item_a1 = CallHierarchyItem( - name="func_a", - kind="function", - file_path="test.py", - range=Range(10, 0, 15, 0), - ) - item_a2 = CallHierarchyItem( - name="func_a", - kind="function", - file_path="test.py", - range=Range(10, 0, 15, 0), # Same range - ) - - node1 = TreeNode(item=item_a1, depth=0, path_from_root=["node1"]) - node2 = TreeNode(item=item_a2, depth=2, path_from_root=["root", "mid", "node2"]) - - # Manually add to node_list to simulate same symbol from different paths - tree.node_list.append(node1) - tree.node_list.append(node2) - - # Different function - item_b = CallHierarchyItem( - name="func_b", - kind="function", - file_path="test.py", - range=Range(20, 0, 25, 0), - ) - node3 = TreeNode(item=item_b, depth=1, path_from_root=["root", "node3"]) - tree.node_list.append(node3) - - # Deduplicate - deduplicator = ResultDeduplicator() - unique_nodes = deduplicator.deduplicate(tree) - - # Should have 2 unique nodes (func_a merged, func_b separate) - assert len(unique_nodes) == 2 - - # func_a should have occurrences=2 and min_depth=0 - func_a_node = 
next(n for n in unique_nodes if n.name == "func_a") - assert func_a_node.occurrences == 2 - assert func_a_node.min_depth == 0 - - # func_b should have occurrences=1 and min_depth=1 - func_b_node = next(n for n in unique_nodes if n.name == "func_b") - assert func_b_node.occurrences == 1 - assert func_b_node.min_depth == 1 - - -def test_deduplication_scoring(): - """Test that scoring prioritizes depth and frequency correctly.""" - tree = CallTree() - - # Create nodes with different characteristics - # Node at depth 0 (root) - item1 = CallHierarchyItem( - name="root_func", - kind="function", - file_path="test.py", - range=Range(10, 0, 15, 0), - ) - node1 = TreeNode(item=item1, depth=0) - tree.add_node(node1) - - # Node at depth 5 (deep) - item2 = CallHierarchyItem( - name="deep_func", - kind="function", - file_path="test.py", - range=Range(20, 0, 25, 0), - ) - node2 = TreeNode(item=item2, depth=5) - tree.add_node(node2) - - # Deduplicate and score - deduplicator = ResultDeduplicator() - unique_nodes = deduplicator.deduplicate(tree) - - # Root node should score higher than deep node - root_node = next(n for n in unique_nodes if n.name == "root_func") - deep_node = next(n for n in unique_nodes if n.name == "deep_func") - - assert root_node.score > deep_node.score - - -def test_deduplication_max_results(): - """Test that max_results limit works correctly.""" - tree = CallTree() - - # Create 5 unique nodes - for i in range(5): - item = CallHierarchyItem( - name=f"func_{i}", - kind="function", - file_path="test.py", - range=Range(i * 10, 0, i * 10 + 5, 0), - ) - node = TreeNode(item=item, depth=i) - tree.add_node(node) - - # Deduplicate with max_results=3 - deduplicator = ResultDeduplicator() - unique_nodes = deduplicator.deduplicate(tree, max_results=3) - - # Should only return 3 nodes - assert len(unique_nodes) == 3 - - -def test_filter_by_kind(): - """Test filtering unique nodes by symbol kind.""" - # Create unique nodes with different kinds - nodes = [ - UniqueNode( - 
file_path="test.py", - name="func1", - kind="function", - range=Range(10, 0, 15, 0), - ), - UniqueNode( - file_path="test.py", - name="cls1", - kind="class", - range=Range(20, 0, 30, 0), - ), - UniqueNode( - file_path="test.py", - name="var1", - kind="variable", - range=Range(40, 0, 40, 10), - ), - ] - - deduplicator = ResultDeduplicator() - - # Filter for functions only - filtered = deduplicator.filter_by_kind(nodes, ["function"]) - assert len(filtered) == 1 - assert filtered[0].name == "func1" - - # Filter for functions and classes - filtered = deduplicator.filter_by_kind(nodes, ["function", "class"]) - assert len(filtered) == 2 - - -def test_to_dict_list(): - """Test conversion of unique nodes to dict list.""" - nodes = [ - UniqueNode( - file_path="test.py", - name="func1", - kind="function", - range=Range(10, 0, 15, 0), - min_depth=0, - occurrences=2, - score=0.85, - ), - ] - - deduplicator = ResultDeduplicator() - dict_list = deduplicator.to_dict_list(nodes) - - assert len(dict_list) == 1 - assert dict_list[0]["name"] == "func1" - assert dict_list[0]["kind"] == "function" - assert dict_list[0]["min_depth"] == 0 - assert dict_list[0]["occurrences"] == 2 - assert dict_list[0]["score"] == 0.85 - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) diff --git a/codex-lens/tests/test_astgrep_binding.py b/codex-lens/tests/test_astgrep_binding.py deleted file mode 100644 index 7a154845..00000000 --- a/codex-lens/tests/test_astgrep_binding.py +++ /dev/null @@ -1,191 +0,0 @@ -"""Tests for ast-grep binding module. - -Verifies basic import and functionality of AstGrepBinding. 
-Run with: python -m pytest tests/test_astgrep_binding.py -v -""" - -from __future__ import annotations - -import pytest -from pathlib import Path - - -class TestAstGrepBindingAvailability: - """Test availability checks.""" - - def test_is_astgrep_available_function(self): - """Test is_astgrep_available function returns boolean.""" - from codexlens.parsers.astgrep_binding import is_astgrep_available - result = is_astgrep_available() - assert isinstance(result, bool) - - def test_get_supported_languages(self): - """Test get_supported_languages returns expected languages.""" - from codexlens.parsers.astgrep_binding import get_supported_languages - languages = get_supported_languages() - assert isinstance(languages, list) - assert "python" in languages - assert "javascript" in languages - assert "typescript" in languages - - -class TestAstGrepBindingInit: - """Test AstGrepBinding initialization.""" - - def test_init_python(self): - """Test initialization with Python language.""" - from codexlens.parsers.astgrep_binding import AstGrepBinding - binding = AstGrepBinding("python") - assert binding.language_id == "python" - - def test_init_typescript_with_tsx(self): - """Test TSX detection from file extension.""" - from codexlens.parsers.astgrep_binding import AstGrepBinding - binding = AstGrepBinding("typescript", Path("component.tsx")) - assert binding.language_id == "typescript" - - def test_is_available_returns_boolean(self): - """Test is_available returns boolean.""" - from codexlens.parsers.astgrep_binding import AstGrepBinding - binding = AstGrepBinding("python") - result = binding.is_available() - assert isinstance(result, bool) - - -def _is_astgrep_installed(): - """Check if ast-grep-py is installed.""" - try: - import ast_grep_py # noqa: F401 - return True - except ImportError: - return False - - -@pytest.mark.skipif( - not _is_astgrep_installed(), - reason="ast-grep-py not installed" -) -class TestAstGrepBindingWithAstGrep: - """Tests that require ast-grep-py to 
be installed.""" - - def test_parse_simple_python(self): - """Test parsing simple Python code.""" - from codexlens.parsers.astgrep_binding import AstGrepBinding - binding = AstGrepBinding("python") - - if not binding.is_available(): - pytest.skip("ast-grep not available") - - source = "x = 1" - result = binding.parse(source) - assert result is True - - def test_find_inheritance(self): - """Test finding class inheritance.""" - from codexlens.parsers.astgrep_binding import AstGrepBinding - binding = AstGrepBinding("python") - - if not binding.is_available(): - pytest.skip("ast-grep not available") - - source = """ -class MyClass(BaseClass): - pass -""" - binding.parse(source) - results = binding.find_inheritance() - assert len(results) >= 0 # May or may not find depending on pattern match - - def test_find_calls(self): - """Test finding function calls.""" - from codexlens.parsers.astgrep_binding import AstGrepBinding - binding = AstGrepBinding("python") - - if not binding.is_available(): - pytest.skip("ast-grep not available") - - source = """ -def foo(): - bar() - baz.qux() -""" - binding.parse(source) - results = binding.find_calls() - assert isinstance(results, list) - - def test_find_imports(self): - """Test finding import statements.""" - from codexlens.parsers.astgrep_binding import AstGrepBinding - binding = AstGrepBinding("python") - - if not binding.is_available(): - pytest.skip("ast-grep not available") - - source = """ -import os -from typing import List -""" - binding.parse(source) - results = binding.find_imports() - assert isinstance(results, list) - - -def test_basic_import(): - """Test that the module can be imported.""" - try: - from codexlens.parsers.astgrep_binding import ( - AstGrepBinding, - is_astgrep_available, - get_supported_languages, - ASTGREP_AVAILABLE, - ) - assert True - except ImportError as e: - pytest.fail(f"Failed to import astgrep_binding: {e}") - - -def test_availability_flag(): - """Test ASTGREP_AVAILABLE flag is defined.""" - 
from codexlens.parsers.astgrep_binding import ASTGREP_AVAILABLE - assert isinstance(ASTGREP_AVAILABLE, bool) - - -if __name__ == "__main__": - # Run basic verification - print("Testing astgrep_binding module...") - - from codexlens.parsers.astgrep_binding import ( - AstGrepBinding, - is_astgrep_available, - get_supported_languages, - ) - - print(f"ast-grep available: {is_astgrep_available()}") - print(f"Supported languages: {get_supported_languages()}") - - binding = AstGrepBinding("python") - print(f"Python binding available: {binding.is_available()}") - - if binding.is_available(): - test_code = """ -import os -from typing import List - -class MyClass(BaseClass): - def method(self): - self.helper() - external_func() - -def helper(): - pass -""" - binding.parse(test_code) - print(f"Inheritance found: {binding.find_inheritance()}") - print(f"Calls found: {binding.find_calls()}") - print(f"Imports found: {binding.find_imports()}") - else: - print("Note: ast-grep-py not installed. To install:") - print(" pip install ast-grep-py") - print(" Note: May have compatibility issues with Python 3.13") - - print("Basic verification complete!") diff --git a/codex-lens/tests/test_binary_searcher.py b/codex-lens/tests/test_binary_searcher.py deleted file mode 100644 index eb751bb6..00000000 --- a/codex-lens/tests/test_binary_searcher.py +++ /dev/null @@ -1,161 +0,0 @@ -"""Unit tests for BinarySearcher - binary vector search using Hamming distance. 
- -Tests cover: -- load: mmap file loading, DB fallback, no data scenario -- search: basic search, top_k limit, empty index -""" - -from __future__ import annotations - -import json -import tempfile -from pathlib import Path -from unittest.mock import MagicMock, patch, mock_open - -import numpy as np -import pytest - -from codexlens.search.binary_searcher import BinarySearcher - - -# ============================================================================= -# Test Fixtures -# ============================================================================= - - -@pytest.fixture -def temp_paths(): - """Create temporary directory structure.""" - tmpdir = tempfile.TemporaryDirectory(ignore_cleanup_errors=True) - root = Path(tmpdir.name) - yield root - try: - tmpdir.cleanup() - except (PermissionError, OSError): - pass - - -@pytest.fixture -def binary_mmap_setup(temp_paths): - """Create a mock memory-mapped binary vectors file with metadata.""" - num_vectors = 10 - dim_bytes = 32 # 256 bits = 32 bytes - - # Create binary matrix - rng = np.random.default_rng(42) - binary_matrix = rng.integers(0, 256, size=(num_vectors, dim_bytes), dtype=np.uint8) - chunk_ids = list(range(100, 100 + num_vectors)) - - # Write mmap file - mmap_path = temp_paths / "_binary_vectors.mmap" - binary_matrix.tofile(str(mmap_path)) - - # Write metadata - meta_path = mmap_path.with_suffix(".meta.json") - meta = { - "shape": [num_vectors, dim_bytes], - "chunk_ids": chunk_ids, - } - with open(meta_path, "w") as f: - json.dump(meta, f) - - return temp_paths, binary_matrix, chunk_ids - - -# ============================================================================= -# Tests: load -# ============================================================================= - - -class TestBinarySearcherLoad: - """Tests for BinarySearcher.load().""" - - def test_load_mmap(self, binary_mmap_setup): - """Memory-mapped file loading should succeed and mark is_memmap.""" - index_root, binary_matrix, chunk_ids = 
binary_mmap_setup - searcher = BinarySearcher(index_root) - - result = searcher.load() - - assert result is True - assert searcher._loaded is True - assert searcher.is_memmap is True - assert searcher.vector_count == len(chunk_ids) - - def test_load_db_fallback(self, temp_paths): - """Should fall back to DB loading when no mmap file exists.""" - searcher = BinarySearcher(temp_paths) - - # Mock the DB fallback - with patch.object(searcher, "_load_from_db", return_value=True) as mock_db: - result = searcher.load() - - assert result is True - mock_db.assert_called_once() - - def test_load_no_data(self, temp_paths): - """Should return False when neither mmap nor DB data available.""" - searcher = BinarySearcher(temp_paths) - - with patch.object(searcher, "_load_from_db", return_value=False): - result = searcher.load() - - assert result is False - assert searcher._loaded is False - - -# ============================================================================= -# Tests: search -# ============================================================================= - - -class TestBinarySearcherSearch: - """Tests for BinarySearcher.search().""" - - def test_search_basic(self, binary_mmap_setup): - """Basic search should return (chunk_id, distance) tuples.""" - index_root, binary_matrix, chunk_ids = binary_mmap_setup - searcher = BinarySearcher(index_root) - searcher.load() - - # Create a query vector (256 dimensions, will be binarized) - rng = np.random.default_rng(99) - query_vector = rng.standard_normal(256).astype(np.float32) - - results = searcher.search(query_vector, top_k=5) - - assert len(results) == 5 - # Results should be (chunk_id, hamming_distance) tuples - for chunk_id, distance in results: - assert isinstance(chunk_id, int) - assert isinstance(distance, int) - assert chunk_id in chunk_ids - - def test_search_top_k(self, binary_mmap_setup): - """Search should respect top_k limit.""" - index_root, binary_matrix, chunk_ids = binary_mmap_setup - searcher = 
BinarySearcher(index_root) - searcher.load() - - query_vector = np.random.default_rng(42).standard_normal(256).astype(np.float32) - - results_3 = searcher.search(query_vector, top_k=3) - results_7 = searcher.search(query_vector, top_k=7) - - assert len(results_3) == 3 - assert len(results_7) == 7 - # Results should be sorted by distance (ascending) - distances_3 = [d for _, d in results_3] - assert distances_3 == sorted(distances_3) - - def test_search_empty_index(self, temp_paths): - """Search on empty/unloaded index should return empty list.""" - searcher = BinarySearcher(temp_paths) - # Do not load - index is empty - - query_vector = np.zeros(256, dtype=np.float32) - - with patch.object(searcher, "load", return_value=False): - results = searcher.search(query_vector, top_k=5) - - assert results == [] diff --git a/codex-lens/tests/test_cascade_strategies.py b/codex-lens/tests/test_cascade_strategies.py deleted file mode 100644 index 2a4713b4..00000000 --- a/codex-lens/tests/test_cascade_strategies.py +++ /dev/null @@ -1,392 +0,0 @@ -"""Integration tests for chain_search.py cascade strategies. 
- -Tests cover: -- binary_cascade_search: Full pipeline and numpy-unavailable fallback -- binary_rerank_cascade_search: Pipeline and fallback -- dense_rerank_cascade_search: Pipeline and fallback -- cascade_search: Router dispatching to correct strategy methods -""" - -from __future__ import annotations - -import tempfile -from pathlib import Path -from typing import List -from unittest.mock import MagicMock, Mock, patch - -import pytest - -from codexlens.config import Config -from codexlens.entities import SearchResult -from codexlens.search.chain_search import ( - ChainSearchEngine, - ChainSearchResult, - SearchOptions, - SearchStats, -) -from codexlens.storage.path_mapper import PathMapper -from codexlens.storage.registry import RegistryStore - - -# ============================================================================= -# Test Fixtures -# ============================================================================= - - -@pytest.fixture -def temp_paths(): - """Create temporary directory structure.""" - tmpdir = tempfile.TemporaryDirectory(ignore_cleanup_errors=True) - root = Path(tmpdir.name) - yield root - try: - tmpdir.cleanup() - except (PermissionError, OSError): - pass - - -@pytest.fixture -def mock_registry(temp_paths: Path): - """Create mock registry store.""" - registry = RegistryStore(db_path=temp_paths / "registry.db") - registry.initialize() - return registry - - -@pytest.fixture -def mock_mapper(temp_paths: Path): - """Create path mapper.""" - return PathMapper(index_root=temp_paths / "indexes") - - -@pytest.fixture -def mock_config(): - """Create mock config for cascade search.""" - config = MagicMock(spec=Config) - config.cascade_coarse_k = 100 - config.cascade_fine_k = 10 - config.cascade_strategy = "binary" - config.enable_staged_rerank = False - config.staged_clustering_strategy = "auto" - config.staged_clustering_min_size = 3 - config.graph_expansion_depth = 2 - return config - - -@pytest.fixture -def sample_search_results() -> 
List[SearchResult]: - """Create sample search results for testing.""" - return [ - SearchResult(path="a.py", score=0.9, excerpt="def auth():"), - SearchResult(path="b.py", score=0.8, excerpt="class User:"), - SearchResult(path="c.py", score=0.7, excerpt="def login():"), - ] - - -# ============================================================================= -# Tests: binary_cascade_search -# ============================================================================= - - -class TestBinaryCascadeSearch: - """Tests for binary_cascade_search().""" - - def test_binary_cascade_full_pipeline( - self, mock_registry, mock_mapper, mock_config, temp_paths - ): - """binary_cascade_search should execute full binary+dense pipeline.""" - engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config) - - with patch.object(engine, "_find_start_index") as mock_find: - mock_find.return_value = temp_paths / "index" / "_index.db" - - with patch.object(engine, "_collect_index_paths") as mock_collect: - mock_collect.return_value = [temp_paths / "index" / "_index.db"] - - # Mock the embedding backend imports - with patch("codexlens.search.chain_search.NUMPY_AVAILABLE", True): - with patch.dict("sys.modules", { - "codexlens.indexing.embedding": MagicMock(), - "codexlens.semantic.ann_index": MagicMock(), - }): - # Mock _get_or_create_binary_index - with patch.object( - engine, "_get_or_create_binary_index" - ) as mock_bin: - mock_index = MagicMock() - mock_index.count.return_value = 10 - mock_index.search.return_value = ([1, 2], [10, 20]) - mock_bin.return_value = mock_index - - # The search should fall back to standard on import issues - with patch.object(engine, "search") as mock_search: - mock_search.return_value = ChainSearchResult( - query="test", - results=[SearchResult(path="a.py", score=0.9, excerpt="a")], - symbols=[], - stats=SearchStats(), - ) - - result = engine.binary_cascade_search( - "test query", temp_paths / "src", - k=10, coarse_k=100, - ) - - assert 
isinstance(result, ChainSearchResult) - - def test_binary_cascade_numpy_unavailable( - self, mock_registry, mock_mapper, mock_config, temp_paths - ): - """binary_cascade_search should fall back to standard search when numpy unavailable.""" - engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config) - - with patch("codexlens.search.chain_search.NUMPY_AVAILABLE", False): - with patch.object(engine, "search") as mock_search: - mock_search.return_value = ChainSearchResult( - query="test", - results=[], - symbols=[], - stats=SearchStats(), - ) - - result = engine.binary_cascade_search( - "query", temp_paths / "src", - ) - - mock_search.assert_called_once() - assert isinstance(result, ChainSearchResult) - - -# ============================================================================= -# Tests: binary_rerank_cascade_search -# ============================================================================= - - -class TestBinaryRerankCascadeSearch: - """Tests for binary_rerank_cascade_search().""" - - def test_binary_rerank_cascade_pipeline( - self, mock_registry, mock_mapper, mock_config, temp_paths - ): - """binary_rerank_cascade_search should execute binary+cross-encoder pipeline.""" - engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config) - - with patch("codexlens.search.chain_search.NUMPY_AVAILABLE", True): - with patch.object(engine, "_find_start_index") as mock_find: - mock_find.return_value = temp_paths / "index" / "_index.db" - - with patch.object(engine, "_collect_index_paths") as mock_collect: - mock_collect.return_value = [temp_paths / "index" / "_index.db"] - - # Mock BinaryEmbeddingBackend import - with patch.dict("sys.modules", { - "codexlens.indexing.embedding": MagicMock(), - }): - with patch.object(engine, "search") as mock_search: - mock_search.return_value = ChainSearchResult( - query="test", - results=[SearchResult(path="a.py", score=0.9, excerpt="a")], - symbols=[], - stats=SearchStats(), - ) - - result = 
engine.binary_rerank_cascade_search( - "test query", temp_paths / "src", - k=10, coarse_k=100, - ) - - assert isinstance(result, ChainSearchResult) - - def test_binary_rerank_fallback( - self, mock_registry, mock_mapper, mock_config, temp_paths - ): - """binary_rerank_cascade_search should fall back when numpy unavailable.""" - engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config) - - with patch("codexlens.search.chain_search.NUMPY_AVAILABLE", False): - with patch.object(engine, "search") as mock_search: - mock_search.return_value = ChainSearchResult( - query="test", - results=[], - symbols=[], - stats=SearchStats(), - ) - - result = engine.binary_rerank_cascade_search( - "query", temp_paths / "src", - ) - - mock_search.assert_called_once() - - -# ============================================================================= -# Tests: dense_rerank_cascade_search -# ============================================================================= - - -class TestDenseRerankCascadeSearch: - """Tests for dense_rerank_cascade_search().""" - - def test_dense_rerank_cascade_pipeline( - self, mock_registry, mock_mapper, mock_config, temp_paths - ): - """dense_rerank_cascade_search should execute dense+cross-encoder pipeline.""" - engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config) - - with patch("codexlens.search.chain_search.NUMPY_AVAILABLE", True): - with patch.object(engine, "_find_start_index") as mock_find: - mock_find.return_value = temp_paths / "index" / "_index.db" - - with patch.object(engine, "_collect_index_paths") as mock_collect: - mock_collect.return_value = [temp_paths / "index" / "_index.db"] - - with patch.object(engine, "search") as mock_search: - mock_search.return_value = ChainSearchResult( - query="test", - results=[SearchResult(path="a.py", score=0.9, excerpt="a")], - symbols=[], - stats=SearchStats(), - ) - - result = engine.dense_rerank_cascade_search( - "test query", temp_paths / "src", - k=10, coarse_k=100, - 
) - - assert isinstance(result, ChainSearchResult) - - def test_dense_rerank_fallback( - self, mock_registry, mock_mapper, mock_config, temp_paths - ): - """dense_rerank_cascade_search should fall back when numpy unavailable.""" - engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config) - - with patch("codexlens.search.chain_search.NUMPY_AVAILABLE", False): - with patch.object(engine, "search") as mock_search: - mock_search.return_value = ChainSearchResult( - query="test", - results=[], - symbols=[], - stats=SearchStats(), - ) - - result = engine.dense_rerank_cascade_search( - "query", temp_paths / "src", - ) - - mock_search.assert_called_once() - - -# ============================================================================= -# Tests: cascade_search (unified router) -# ============================================================================= - - -class TestCascadeRouter: - """Tests for cascade_search() strategy routing.""" - - def test_cascade_router_binary( - self, mock_registry, mock_mapper, mock_config, temp_paths - ): - """cascade_search with strategy='binary' should route to binary_cascade_search.""" - engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config) - - with patch.object(engine, "binary_cascade_search") as mock_binary: - mock_binary.return_value = ChainSearchResult( - query="test", results=[], symbols=[], stats=SearchStats() - ) - - engine.cascade_search( - "query", temp_paths / "src", strategy="binary" - ) - - mock_binary.assert_called_once() - - def test_cascade_router_binary_rerank( - self, mock_registry, mock_mapper, mock_config, temp_paths - ): - """cascade_search with strategy='binary_rerank' should route correctly.""" - engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config) - - with patch.object(engine, "binary_rerank_cascade_search") as mock_rerank: - mock_rerank.return_value = ChainSearchResult( - query="test", results=[], symbols=[], stats=SearchStats() - ) - - 
engine.cascade_search( - "query", temp_paths / "src", strategy="binary_rerank" - ) - - mock_rerank.assert_called_once() - - def test_cascade_router_dense_rerank( - self, mock_registry, mock_mapper, mock_config, temp_paths - ): - """cascade_search with strategy='dense_rerank' should route correctly.""" - engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config) - - with patch.object(engine, "dense_rerank_cascade_search") as mock_dense: - mock_dense.return_value = ChainSearchResult( - query="test", results=[], symbols=[], stats=SearchStats() - ) - - engine.cascade_search( - "query", temp_paths / "src", strategy="dense_rerank" - ) - - mock_dense.assert_called_once() - - def test_cascade_router_staged( - self, mock_registry, mock_mapper, mock_config, temp_paths - ): - """cascade_search with strategy='staged' should route to staged_cascade_search.""" - engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config) - - with patch.object(engine, "staged_cascade_search") as mock_staged: - mock_staged.return_value = ChainSearchResult( - query="test", results=[], symbols=[], stats=SearchStats() - ) - - engine.cascade_search( - "query", temp_paths / "src", strategy="staged" - ) - - mock_staged.assert_called_once() - - def test_cascade_router_config_default( - self, mock_registry, mock_mapper, temp_paths - ): - """cascade_search with no strategy param should use config cascade_strategy.""" - config = MagicMock(spec=Config) - config.cascade_strategy = "binary_rerank" - config.cascade_coarse_k = 100 - config.cascade_fine_k = 10 - - engine = ChainSearchEngine(mock_registry, mock_mapper, config=config) - - with patch.object(engine, "binary_rerank_cascade_search") as mock_rerank: - mock_rerank.return_value = ChainSearchResult( - query="test", results=[], symbols=[], stats=SearchStats() - ) - - # No strategy param -> reads from config - engine.cascade_search("query", temp_paths / "src") - - mock_rerank.assert_called_once() - - def 
test_cascade_router_invalid_fallback( - self, mock_registry, mock_mapper, mock_config, temp_paths - ): - """cascade_search with invalid strategy should default to 'binary'.""" - engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config) - - with patch.object(engine, "binary_cascade_search") as mock_binary: - mock_binary.return_value = ChainSearchResult( - query="test", results=[], symbols=[], stats=SearchStats() - ) - - engine.cascade_search( - "query", temp_paths / "src", strategy="nonexistent" - ) - - mock_binary.assert_called_once() diff --git a/codex-lens/tests/test_chain_search.py b/codex-lens/tests/test_chain_search.py deleted file mode 100644 index 3e498e43..00000000 --- a/codex-lens/tests/test_chain_search.py +++ /dev/null @@ -1,1634 +0,0 @@ -import logging -import os -import sqlite3 -import tempfile -from pathlib import Path -from unittest.mock import MagicMock - -import pytest - -from codexlens.config import ( - BINARY_VECTORS_MMAP_NAME, - Config, - VECTORS_HNSW_NAME, - VECTORS_META_DB_NAME, -) -from codexlens.entities import SearchResult, Symbol -import codexlens.search.chain_search as chain_search_module -from codexlens.search.chain_search import ( - ChainSearchEngine, - ChainSearchResult, - SearchOptions, - SearchStats, -) -from codexlens.storage.global_index import GlobalSymbolIndex -from codexlens.storage.path_mapper import PathMapper -from codexlens.storage.registry import RegistryStore - - -@pytest.fixture() -def temp_paths(): - tmpdir = tempfile.TemporaryDirectory(ignore_cleanup_errors=True) - root = Path(tmpdir.name) - yield root - try: - tmpdir.cleanup() - except (PermissionError, OSError): - pass - - -def test_symbol_filtering_handles_path_failures(monkeypatch: pytest.MonkeyPatch, caplog, temp_paths: Path) -> None: - project_root = temp_paths / "project" - (project_root / "src").mkdir(parents=True, exist_ok=True) - - index_root = temp_paths / "indexes" - mapper = PathMapper(index_root=index_root) - index_db_path = 
mapper.source_to_index_db(project_root) - index_db_path.parent.mkdir(parents=True, exist_ok=True) - index_db_path.write_text("", encoding="utf-8") # existence is enough for _find_start_index - - registry = RegistryStore(db_path=temp_paths / "registry.db") - registry.initialize() - project_info = registry.register_project(project_root, mapper.source_to_index_dir(project_root)) - - global_db_path = project_info.index_root / GlobalSymbolIndex.DEFAULT_DB_NAME - global_index = GlobalSymbolIndex(global_db_path, project_id=project_info.id) - global_index.initialize() - - valid_file = project_root / "src" / "auth.py" - valid_sym = Symbol(name="AuthManager", kind="class", range=(1, 2), file=str(valid_file)) - bad_null = Symbol(name="BadNull", kind="class", range=(1, 2), file="bad\0path.py") - bad_relative = Symbol(name="BadRelative", kind="class", range=(1, 2), file="relative/path.py") - - candidates = [valid_sym, bad_null, bad_relative] - - if os.name == "nt": - root_drive, _ = os.path.splitdrive(str(project_root.resolve())) - other_drive = "C:" if root_drive.lower() != "c:" else "D:" - candidates.append( - Symbol(name="CrossDrive", kind="class", range=(1, 2), file=f"{other_drive}\\other\\file.py") - ) - - def fake_search(self, name: str, kind=None, limit: int = 20, prefix_mode: bool = False): - return candidates - - monkeypatch.setattr(GlobalSymbolIndex, "search", fake_search) - - config = Config(data_dir=temp_paths / "data", global_symbol_index_enabled=True) - engine = ChainSearchEngine(registry, mapper, config=config) - engine._search_symbols_parallel = MagicMock(side_effect=AssertionError("should not traverse chain")) - - caplog.set_level(logging.DEBUG, logger="codexlens.search.chain_search") - symbols = engine.search_symbols( - "Auth", - project_root, - options=SearchOptions(depth=5, total_limit=10), - ) - - assert [s.name for s in symbols] == ["AuthManager"] - assert "BadNull" in caplog.text - assert "BadRelative" in caplog.text - if os.name == "nt": - assert 
"CrossDrive" in caplog.text - - -def test_cascade_search_strategy_routing(temp_paths: Path) -> None: - """Test cascade_search() routes to correct strategy implementation.""" - from unittest.mock import patch - from codexlens.search.chain_search import ChainSearchResult, SearchStats - - registry = RegistryStore(db_path=temp_paths / "registry.db") - registry.initialize() - mapper = PathMapper(index_root=temp_paths / "indexes") - config = Config(data_dir=temp_paths / "data") - - engine = ChainSearchEngine(registry, mapper, config=config) - source_path = temp_paths / "src" - - # Test strategy='staged' routing - with patch.object(engine, "staged_cascade_search") as mock_staged: - mock_staged.return_value = ChainSearchResult( - query="query", results=[], symbols=[], stats=SearchStats() - ) - engine.cascade_search("query", source_path, strategy="staged") - mock_staged.assert_called_once() - - # Test strategy='binary' routing - with patch.object(engine, "binary_cascade_search") as mock_binary: - mock_binary.return_value = ChainSearchResult( - query="query", results=[], symbols=[], stats=SearchStats() - ) - engine.cascade_search("query", source_path, strategy="binary") - mock_binary.assert_called_once() - - # Test strategy='binary_rerank' routing - with patch.object(engine, "binary_rerank_cascade_search") as mock_br: - mock_br.return_value = ChainSearchResult( - query="query", results=[], symbols=[], stats=SearchStats() - ) - engine.cascade_search("query", source_path, strategy="binary_rerank") - mock_br.assert_called_once() - - # Test strategy='dense_rerank' routing - with patch.object(engine, "dense_rerank_cascade_search") as mock_dr: - mock_dr.return_value = ChainSearchResult( - query="query", results=[], symbols=[], stats=SearchStats() - ) - engine.cascade_search("query", source_path, strategy="dense_rerank") - mock_dr.assert_called_once() - - # Test default routing (no strategy specified) - defaults to binary - with patch.object(engine, "binary_cascade_search") as 
mock_default: - mock_default.return_value = ChainSearchResult( - query="query", results=[], symbols=[], stats=SearchStats() - ) - engine.cascade_search("query", source_path) - mock_default.assert_called_once() - - -def test_cascade_search_invalid_strategy(temp_paths: Path) -> None: - """Test cascade_search() defaults to 'binary' for invalid strategy.""" - from unittest.mock import patch - from codexlens.search.chain_search import ChainSearchResult, SearchStats - - registry = RegistryStore(db_path=temp_paths / "registry.db") - registry.initialize() - mapper = PathMapper(index_root=temp_paths / "indexes") - config = Config(data_dir=temp_paths / "data") - - engine = ChainSearchEngine(registry, mapper, config=config) - source_path = temp_paths / "src" - - # Invalid strategy should default to binary - with patch.object(engine, "binary_cascade_search") as mock_binary: - mock_binary.return_value = ChainSearchResult( - query="query", results=[], symbols=[], stats=SearchStats() - ) - engine.cascade_search("query", source_path, strategy="invalid_strategy") - mock_binary.assert_called_once() - - -def test_vector_warmup_uses_embedding_config(monkeypatch: pytest.MonkeyPatch, temp_paths: Path) -> None: - calls: list[dict[str, object]] = [] - - def fake_get_embedder(**kwargs: object) -> object: - calls.append(dict(kwargs)) - return object() - - import codexlens.semantic.factory as factory - - monkeypatch.setattr(factory, "get_embedder", fake_get_embedder) - - registry = RegistryStore(db_path=temp_paths / "registry.db") - registry.initialize() - mapper = PathMapper(index_root=temp_paths / "indexes") - config = Config( - data_dir=temp_paths / "data", - embedding_backend="fastembed", - embedding_model="fast", - embedding_use_gpu=False, - ) - - engine = ChainSearchEngine(registry, mapper, config=config) - monkeypatch.setattr(engine, "_get_executor", lambda _workers: MagicMock()) - - engine._search_parallel([], "query", SearchOptions(enable_vector=True)) - - assert calls == [ - { - 
"backend": "fastembed", - "profile": "fast", - "use_gpu": False, - } - ] - - -def test_search_single_index_passes_config_to_hybrid_engine( - monkeypatch: pytest.MonkeyPatch, temp_paths: Path -) -> None: - registry = RegistryStore(db_path=temp_paths / "registry.db") - registry.initialize() - mapper = PathMapper(index_root=temp_paths / "indexes") - config = Config(data_dir=temp_paths / "data", embedding_backend="fastembed", embedding_model="code") - - engine = ChainSearchEngine(registry, mapper, config=config) - index_path = temp_paths / "indexes" / "project" / "_index.db" - index_path.parent.mkdir(parents=True, exist_ok=True) - index_path.write_bytes(b"\x00" * 128) - - captured: dict[str, object] = {} - - class FakeHybridSearchEngine: - def __init__(self, *, weights=None, config=None): - captured["weights"] = weights - captured["config"] = config - - def search(self, *_args, **_kwargs): - return [SearchResult(path="src/app.py", score=0.9, excerpt="hit")] - - monkeypatch.setattr(chain_search_module, "HybridSearchEngine", FakeHybridSearchEngine) - - results = engine._search_single_index( - index_path, - "auth flow", - limit=5, - hybrid_mode=True, - enable_vector=True, - hybrid_weights={"vector": 1.0}, - ) - - assert captured["config"] is config - assert captured["weights"] == {"vector": 1.0} - assert len(results) == 1 - assert results[0].path == "src/app.py" - - -def test_search_parallel_reuses_shared_hybrid_engine( - monkeypatch: pytest.MonkeyPatch, - temp_paths: Path, -) -> None: - from concurrent.futures import Future - - registry = RegistryStore(db_path=temp_paths / "registry.db") - registry.initialize() - mapper = PathMapper(index_root=temp_paths / "indexes") - config = Config(data_dir=temp_paths / "data") - - engine = ChainSearchEngine(registry, mapper, config=config) - index_root = temp_paths / "indexes" / "project" - index_a = index_root / "src" / "_index.db" - index_b = index_root / "tests" / "_index.db" - index_a.parent.mkdir(parents=True, exist_ok=True) - 
index_b.parent.mkdir(parents=True, exist_ok=True) - index_a.write_bytes(b"\x00" * 128) - index_b.write_bytes(b"\x00" * 128) - - created_engines: list[object] = [] - search_calls: list[tuple[object, Path]] = [] - - class FakeHybridSearchEngine: - def __init__(self, *, weights=None, config=None): - self.weights = weights - self.config = config - created_engines.append(self) - - def search(self, index_path, *_args, **_kwargs): - search_calls.append((self, index_path)) - return [SearchResult(path=str(index_path), score=0.9, excerpt="hit")] - - class ImmediateExecutor: - def submit(self, fn, *args): - future: Future = Future() - try: - future.set_result(fn(*args)) - except Exception as exc: - future.set_exception(exc) - return future - - monkeypatch.setattr(chain_search_module, "HybridSearchEngine", FakeHybridSearchEngine) - monkeypatch.setattr(engine, "_get_executor", lambda _workers: ImmediateExecutor()) - - results, stats = engine._search_parallel( - [index_a, index_b], - "auth flow", - SearchOptions( - hybrid_mode=True, - enable_vector=True, - limit_per_dir=5, - hybrid_weights={"vector": 1.0}, - ), - ) - - assert stats.errors == [] - assert len(created_engines) == 1 - assert [path for _, path in search_calls] == [index_a, index_b] - assert all(shared is created_engines[0] for shared, _ in search_calls) - assert len(results) == 2 - - -def test_search_injects_feature_query_anchors_into_merge( - monkeypatch: pytest.MonkeyPatch, - temp_paths: Path, -) -> None: - registry = RegistryStore(db_path=temp_paths / "registry.db") - registry.initialize() - mapper = PathMapper(index_root=temp_paths / "indexes") - config = Config(data_dir=temp_paths / "data") - engine = ChainSearchEngine(registry, mapper, config=config) - - source_path = temp_paths / "project" - start_index = temp_paths / "indexes" / "project" / "_index.db" - start_index.parent.mkdir(parents=True, exist_ok=True) - start_index.write_text("", encoding="utf-8") - - feature_path = str(source_path / "src" / "tools" / 
"smart-search.ts") - platform_path = str(source_path / "src" / "utils" / "path-resolver.ts") - anchor_result = SearchResult( - path=feature_path, - score=8.0, - excerpt="smart search anchor", - metadata={"feature_query_hint": "smart search"}, - ) - - monkeypatch.setattr(engine, "_find_start_index", lambda _source_path: start_index) - monkeypatch.setattr( - engine, - "_collect_index_paths", - lambda _start_index, _options: [start_index], - ) - monkeypatch.setattr( - engine, - "_search_parallel", - lambda *_args, **_kwargs: ( - [ - SearchResult( - path=platform_path, - score=0.9, - excerpt="platform hit", - ) - ], - SearchStats(), - ), - ) - monkeypatch.setattr(engine, "_search_symbols_parallel", lambda *_args, **_kwargs: []) - collected_queries: list[str] = [] - monkeypatch.setattr( - engine, - "_collect_query_feature_anchor_results", - lambda query, *_args, **_kwargs: ( - collected_queries.append(query), - [anchor_result], - )[1], - ) - - result = engine.search( - "parse CodexLens JSON output strip ANSI smart_search", - source_path, - options=SearchOptions( - total_limit=5, - hybrid_mode=True, - enable_fuzzy=False, - enable_vector=True, - ), - ) - - assert collected_queries == ["parse CodexLens JSON output strip ANSI smart_search"] - result_by_path = {item.path: item for item in result.results} - assert feature_path in result_by_path - assert platform_path in result_by_path - assert result_by_path[feature_path].metadata["feature_query_anchor"] is True - assert result_by_path[feature_path].metadata["feature_query_hint"] == "smart search" - - -def test_group_index_paths_by_dense_root(temp_paths: Path) -> None: - registry = RegistryStore(db_path=temp_paths / "registry.db") - registry.initialize() - mapper = PathMapper(index_root=temp_paths / "indexes") - engine = ChainSearchEngine(registry, mapper, config=Config(data_dir=temp_paths / "data")) - - dense_root_a = temp_paths / "indexes" / "project-a" - dense_root_b = temp_paths / "indexes" / "project-b" - orphan_root = 
temp_paths / "indexes" / "orphan" / "pkg" - - dense_root_a.mkdir(parents=True, exist_ok=True) - dense_root_b.mkdir(parents=True, exist_ok=True) - orphan_root.mkdir(parents=True, exist_ok=True) - (dense_root_a / VECTORS_HNSW_NAME).write_bytes(b"a") - (dense_root_b / VECTORS_HNSW_NAME).write_bytes(b"b") - - index_a = dense_root_a / "src" / "_index.db" - index_b = dense_root_b / "tests" / "_index.db" - orphan_index = orphan_root / "_index.db" - index_a.parent.mkdir(parents=True, exist_ok=True) - index_b.parent.mkdir(parents=True, exist_ok=True) - index_a.write_text("", encoding="utf-8") - index_b.write_text("", encoding="utf-8") - orphan_index.write_text("", encoding="utf-8") - - roots, ungrouped = engine._group_index_paths_by_dense_root( - [index_a, orphan_index, index_b] - ) - - assert roots == [dense_root_a, dense_root_b] - assert ungrouped == [orphan_index] - assert engine._find_nearest_dense_hnsw_root(index_a.parent) == dense_root_a - assert engine._find_nearest_dense_hnsw_root(orphan_index.parent) is None - - -def test_stage1_binary_search_merges_multiple_centralized_roots( - monkeypatch: pytest.MonkeyPatch, - temp_paths: Path, -) -> None: - import numpy as np - - registry = RegistryStore(db_path=temp_paths / "registry.db") - registry.initialize() - mapper = PathMapper(index_root=temp_paths / "indexes") - config = Config(data_dir=temp_paths / "data", embedding_use_gpu=False) - engine = ChainSearchEngine(registry, mapper, config=config) - - root_a = temp_paths / "indexes" / "project-a" - root_b = temp_paths / "indexes" / "project-b" - for root in (root_a, root_b): - root.mkdir(parents=True, exist_ok=True) - (root / BINARY_VECTORS_MMAP_NAME).write_bytes(b"binary") - (root / VECTORS_META_DB_NAME).write_bytes(b"meta") - - index_a = root_a / "src" / "_index.db" - index_b = root_b / "src" / "_index.db" - index_a.parent.mkdir(parents=True, exist_ok=True) - index_b.parent.mkdir(parents=True, exist_ok=True) - index_a.write_text("", encoding="utf-8") - 
index_b.write_text("", encoding="utf-8") - - class FakeBinarySearcher: - def __init__(self, root: Path) -> None: - self.root = root - self.backend = "fastembed" - self.model = None - self.model_profile = "code" - - def search(self, _query_dense, top_k: int): - return [(1, 8)] if self.root == root_a else [(2, 16)] - - class FakeEmbedder: - def embed_to_numpy(self, _queries): - return np.ones((1, 4), dtype=np.float32) - - class FakeVectorMetadataStore: - def __init__(self, path: Path) -> None: - self.path = Path(path) - - def get_chunks_by_ids(self, chunk_ids): - return [ - { - "id": chunk_id, - "file_path": str(self.path.parent / f"file{chunk_id}.py"), - "content": f"chunk {chunk_id}", - "metadata": "{\"start_line\": 1, \"end_line\": 2}", - "category": "code", - } - for chunk_id in chunk_ids - ] - - import codexlens.semantic.embedder as embedder_module - from codexlens.search.chain_search import SearchStats - - monkeypatch.setattr( - engine, - "_get_centralized_binary_searcher", - lambda root: FakeBinarySearcher(root), - ) - monkeypatch.setattr(embedder_module, "get_embedder", lambda **_kwargs: FakeEmbedder()) - monkeypatch.setattr(chain_search_module, "VectorMetadataStore", FakeVectorMetadataStore) - - coarse_results, stage2_root = engine._stage1_binary_search( - "binary query", - [index_a, index_b], - coarse_k=5, - stats=SearchStats(), - index_root=index_a.parent, - ) - - assert stage2_root is None - assert len(coarse_results) == 2 - assert {Path(result.path).name for result in coarse_results} == {"file1.py", "file2.py"} - - -def test_stage1_binary_search_keeps_duplicate_chunk_ids_isolated_per_root( - monkeypatch: pytest.MonkeyPatch, - temp_paths: Path, -) -> None: - import numpy as np - - registry = RegistryStore(db_path=temp_paths / "registry.db") - registry.initialize() - mapper = PathMapper(index_root=temp_paths / "indexes") - config = Config(data_dir=temp_paths / "data", embedding_use_gpu=False) - engine = ChainSearchEngine(registry, mapper, config=config) - 
- root_a = temp_paths / "indexes" / "project-a" - root_b = temp_paths / "indexes" / "project-b" - for root in (root_a, root_b): - root.mkdir(parents=True, exist_ok=True) - (root / BINARY_VECTORS_MMAP_NAME).write_bytes(b"binary") - (root / VECTORS_META_DB_NAME).write_bytes(b"meta") - - index_a = root_a / "src" / "_index.db" - index_b = root_b / "src" / "_index.db" - index_a.parent.mkdir(parents=True, exist_ok=True) - index_b.parent.mkdir(parents=True, exist_ok=True) - index_a.write_text("", encoding="utf-8") - index_b.write_text("", encoding="utf-8") - - class FakeBinarySearcher: - def __init__(self, root: Path) -> None: - self.root = root - self.backend = "fastembed" - self.model = None - self.model_profile = "code" - - def search(self, _query_dense, top_k: int): - return [(1, 8)] if self.root == root_a else [(1, 16)] - - class FakeEmbedder: - def embed_to_numpy(self, _queries): - return np.ones((1, 4), dtype=np.float32) - - class FakeVectorMetadataStore: - def __init__(self, path: Path) -> None: - self.path = Path(path) - - def get_chunks_by_ids(self, chunk_ids): - return [ - { - "id": chunk_id, - "file_path": str(self.path.parent / f"{self.path.parent.name}-file{chunk_id}.py"), - "content": f"chunk {self.path.parent.name}-{chunk_id}", - "metadata": "{\"start_line\": 1, \"end_line\": 2}", - "category": "code", - } - for chunk_id in chunk_ids - ] - - import codexlens.semantic.embedder as embedder_module - from codexlens.search.chain_search import SearchStats - - monkeypatch.setattr( - engine, - "_get_centralized_binary_searcher", - lambda root: FakeBinarySearcher(root), - ) - monkeypatch.setattr(embedder_module, "get_embedder", lambda **_kwargs: FakeEmbedder()) - monkeypatch.setattr(chain_search_module, "VectorMetadataStore", FakeVectorMetadataStore) - - coarse_results, stage2_root = engine._stage1_binary_search( - "binary query", - [index_a, index_b], - coarse_k=5, - stats=SearchStats(), - index_root=index_a.parent, - ) - - assert stage2_root is None - 
scores_by_name = {Path(result.path).name: result.score for result in coarse_results} - assert scores_by_name["project-a-file1.py"] == pytest.approx(1.0 - (8.0 / 256.0)) - assert scores_by_name["project-b-file1.py"] == pytest.approx(1.0 - (16.0 / 256.0)) - - - -def test_collect_index_paths_includes_nested_registered_project_roots( - temp_paths: Path, -) -> None: - registry = RegistryStore(db_path=temp_paths / "registry.db") - registry.initialize() - mapper = PathMapper(index_root=temp_paths / "indexes") - engine = ChainSearchEngine(registry, mapper, config=Config(data_dir=temp_paths / "data")) - - workspace_root = temp_paths / "workspace" - child_root = workspace_root / "packages" / "child" - ignored_root = workspace_root / "dist" / "generated" - - workspace_index = mapper.source_to_index_db(workspace_root) - child_index = mapper.source_to_index_db(child_root) - ignored_index = mapper.source_to_index_db(ignored_root) - - for index_path in (workspace_index, child_index, ignored_index): - index_path.parent.mkdir(parents=True, exist_ok=True) - index_path.write_text("", encoding="utf-8") - - workspace_project = registry.register_project( - workspace_root, - mapper.source_to_index_dir(workspace_root), - ) - child_project = registry.register_project( - child_root, - mapper.source_to_index_dir(child_root), - ) - ignored_project = registry.register_project( - ignored_root, - mapper.source_to_index_dir(ignored_root), - ) - - registry.register_dir( - workspace_project.id, - workspace_root, - workspace_index, - depth=0, - ) - registry.register_dir( - child_project.id, - child_root, - child_index, - depth=0, - ) - registry.register_dir( - ignored_project.id, - ignored_root, - ignored_index, - depth=0, - ) - - collected = engine._collect_index_paths(workspace_index, depth=-1) - - assert collected == [workspace_index, child_index] - - -def test_collect_index_paths_respects_depth_for_nested_registered_project_roots( - temp_paths: Path, -) -> None: - registry = 
RegistryStore(db_path=temp_paths / "registry.db") - registry.initialize() - mapper = PathMapper(index_root=temp_paths / "indexes") - engine = ChainSearchEngine(registry, mapper, config=Config(data_dir=temp_paths / "data")) - - workspace_root = temp_paths / "workspace" - direct_child_root = workspace_root / "apps" - deep_child_root = workspace_root / "packages" / "deep" / "child" - - workspace_index = mapper.source_to_index_db(workspace_root) - direct_child_index = mapper.source_to_index_db(direct_child_root) - deep_child_index = mapper.source_to_index_db(deep_child_root) - - for index_path in (workspace_index, direct_child_index, deep_child_index): - index_path.parent.mkdir(parents=True, exist_ok=True) - index_path.write_text("", encoding="utf-8") - - workspace_project = registry.register_project( - workspace_root, - mapper.source_to_index_dir(workspace_root), - ) - direct_child_project = registry.register_project( - direct_child_root, - mapper.source_to_index_dir(direct_child_root), - ) - deep_child_project = registry.register_project( - deep_child_root, - mapper.source_to_index_dir(deep_child_root), - ) - - registry.register_dir(workspace_project.id, workspace_root, workspace_index, depth=0) - registry.register_dir( - direct_child_project.id, - direct_child_root, - direct_child_index, - depth=0, - ) - registry.register_dir( - deep_child_project.id, - deep_child_root, - deep_child_index, - depth=0, - ) - - collected = engine._collect_index_paths(workspace_index, depth=1) - - assert collected == [workspace_index, direct_child_index] - - -def test_binary_rerank_cascade_search_merges_multiple_centralized_roots( - monkeypatch: pytest.MonkeyPatch, - temp_paths: Path, -) -> None: - import numpy as np - - registry = RegistryStore(db_path=temp_paths / "registry.db") - registry.initialize() - mapper = PathMapper(index_root=temp_paths / "indexes") - config = Config(data_dir=temp_paths / "data", embedding_use_gpu=False) - engine = ChainSearchEngine(registry, mapper, 
config=config) - - root_a = temp_paths / "indexes" / "project-a" - root_b = temp_paths / "indexes" / "project-b" - for root in (root_a, root_b): - root.mkdir(parents=True, exist_ok=True) - (root / BINARY_VECTORS_MMAP_NAME).write_bytes(b"binary") - (root / VECTORS_META_DB_NAME).write_bytes(b"meta") - - index_a = root_a / "src" / "_index.db" - index_b = root_b / "src" / "_index.db" - index_a.parent.mkdir(parents=True, exist_ok=True) - index_b.parent.mkdir(parents=True, exist_ok=True) - index_a.write_text("", encoding="utf-8") - index_b.write_text("", encoding="utf-8") - - class FakeBinarySearcher: - def __init__(self, root: Path) -> None: - self.root = root - self.backend = "fastembed" - self.model = None - self.model_profile = "code" - - def search(self, _query_dense, top_k: int): - return [(1, 8)] if self.root == root_a else [(2, 16)] - - class FakeEmbedder: - def embed_to_numpy(self, _queries): - return np.ones((1, 4), dtype=np.float32) - - class FakeVectorMetadataStore: - def __init__(self, path: Path) -> None: - self.path = Path(path) - - def get_chunks_by_ids(self, chunk_ids): - return [ - { - "chunk_id": chunk_id, - "file_path": str(self.path.parent / f"file{chunk_id}.py"), - "content": f"chunk {chunk_id}", - "metadata": "{}", - "category": "code", - } - for chunk_id in chunk_ids - ] - - import codexlens.semantic.embedder as embedder_module - - monkeypatch.setattr(engine, "_find_start_index", lambda _source_path: index_a) - monkeypatch.setattr(engine, "_collect_index_paths", lambda _start_index, _depth: [index_a, index_b]) - monkeypatch.setattr( - engine, - "_get_centralized_binary_searcher", - lambda root: FakeBinarySearcher(root), - ) - monkeypatch.setattr(embedder_module, "get_embedder", lambda **_kwargs: FakeEmbedder()) - monkeypatch.setattr(chain_search_module, "VectorMetadataStore", FakeVectorMetadataStore) - monkeypatch.setattr(engine, "_cross_encoder_rerank", lambda _query, results, top_k: results[:top_k]) - monkeypatch.setattr(engine, "search", lambda 
*_args, **_kwargs: (_ for _ in ()).throw(AssertionError("unexpected fallback"))) - - result = engine.binary_rerank_cascade_search( - "binary query", - index_a.parent, - k=5, - coarse_k=5, - ) - - assert len(result.results) == 2 - assert {Path(item.path).name for item in result.results} == {"file1.py", "file2.py"} - - -def test_dense_rerank_cascade_search_overfetches_and_applies_path_penalties( - monkeypatch: pytest.MonkeyPatch, - temp_paths: Path, -) -> None: - import numpy as np - import codexlens.semantic.ann_index as ann_index_module - - registry = RegistryStore(db_path=temp_paths / "registry.db") - registry.initialize() - mapper = PathMapper(index_root=temp_paths / "indexes") - config = Config( - data_dir=temp_paths / "data", - embedding_use_gpu=False, - reranker_top_k=3, - test_file_penalty=0.35, - generated_file_penalty=0.35, - ) - engine = ChainSearchEngine(registry, mapper, config=config) - - dense_root = temp_paths / "indexes" / "project" - dense_root.mkdir(parents=True, exist_ok=True) - (dense_root / VECTORS_HNSW_NAME).write_bytes(b"hnsw") - - meta_db_path = dense_root / VECTORS_META_DB_NAME - conn = sqlite3.connect(meta_db_path) - conn.execute( - """ - CREATE TABLE chunk_metadata ( - chunk_id INTEGER PRIMARY KEY, - file_path TEXT NOT NULL, - content TEXT NOT NULL, - start_line INTEGER, - end_line INTEGER - ) - """ - ) - conn.executemany( - """ - INSERT INTO chunk_metadata (chunk_id, file_path, content, start_line, end_line) - VALUES (?, ?, ?, ?, ?) 
- """, - [ - ( - 1, - "project/tests/test_auth.py", - "def test_auth_flow():\n pass", - 1, - 2, - ), - ( - 2, - "project/src/auth.py", - "def auth_flow():\n return True", - 1, - 2, - ), - ( - 3, - "project/dist/bundle.js", - "function authFlow(){return true;}", - 1, - 1, - ), - ], - ) - conn.commit() - conn.close() - - index_path = dense_root / "src" / "_index.db" - index_path.parent.mkdir(parents=True, exist_ok=True) - index_path.write_text("", encoding="utf-8") - - class FakeANNIndex: - def __init__(self, root: Path, dim: int) -> None: - self.root = root - self.dim = dim - - @classmethod - def create_central(cls, *, index_root: Path, dim: int): - return cls(index_root, dim) - - def load(self) -> bool: - return True - - def count(self) -> int: - return 3 - - def search(self, _query_dense, top_k: int): - ids = [1, 2, 3][:top_k] - distances = [0.01, 0.02, 0.03][:top_k] - return ids, distances - - rerank_calls: list[int] = [] - - def fake_cross_encoder(_query: str, results: list[SearchResult], top_k: int): - rerank_calls.append(top_k) - return results[:top_k] - - monkeypatch.setattr(engine, "_find_start_index", lambda _source_path: index_path) - monkeypatch.setattr(engine, "_collect_index_paths", lambda _start_index, _depth: [index_path]) - monkeypatch.setattr(engine, "_embed_dense_query", lambda *_args, **_kwargs: np.ones(4, dtype=np.float32)) - monkeypatch.setattr(engine, "_cross_encoder_rerank", fake_cross_encoder) - monkeypatch.setattr( - engine, - "search", - lambda *_args, **_kwargs: (_ for _ in ()).throw(AssertionError("unexpected fallback")), - ) - monkeypatch.setattr(ann_index_module, "ANNIndex", FakeANNIndex) - - result = engine.dense_rerank_cascade_search( - "auth", - index_path.parent, - k=1, - coarse_k=3, - ) - - assert rerank_calls == [3] - assert len(result.results) == 1 - assert result.results[0].path.endswith("src\\auth.py") or result.results[0].path.endswith("src/auth.py") - assert result.results[0].metadata == {} - - -def 
test_collect_query_feature_anchor_results_uses_explicit_file_hints( - monkeypatch: pytest.MonkeyPatch, - temp_paths: Path, -) -> None: - registry = RegistryStore(db_path=temp_paths / "registry.db") - registry.initialize() - mapper = PathMapper(index_root=temp_paths / "indexes") - config = Config(data_dir=temp_paths / "data", embedding_use_gpu=False) - engine = ChainSearchEngine(registry, mapper, config=config) - - recorded_queries: list[str] = [] - - def fake_search(query: str, _source_path: Path, options: SearchOptions | None = None): - recorded_queries.append(query) - return ChainSearchResult( - query=query, - results=[ - SearchResult( - path="/repo/src/tools/smart-search.ts", - score=8.7, - excerpt="smart search path anchor", - ), - SearchResult( - path="/repo/src/tools/codex-lens-lsp.ts", - score=7.4, - excerpt="platform term overlap", - ), - ], - symbols=[], - stats=SearchStats(), - ) - - monkeypatch.setattr(engine, "search", fake_search) - - anchors = engine._collect_query_feature_anchor_results( - "parse CodexLens JSON output strip ANSI smart_search", - temp_paths, - SearchOptions(), - limit=4, - ) - - assert recorded_queries == ["smart search"] - assert [Path(result.path).name for result in anchors] == ["smart-search.ts"] - assert anchors[0].metadata["feature_query_anchor"] is True - assert anchors[0].metadata["feature_query_hint_tokens"] == ["smart", "search"] - - -def test_collect_query_feature_anchor_results_falls_back_to_full_lexical_query( - monkeypatch: pytest.MonkeyPatch, - temp_paths: Path, -) -> None: - registry = RegistryStore(db_path=temp_paths / "registry.db") - registry.initialize() - mapper = PathMapper(index_root=temp_paths / "indexes") - config = Config(data_dir=temp_paths / "data", embedding_use_gpu=False) - engine = ChainSearchEngine(registry, mapper, config=config) - - recorded_calls: list[tuple[str, bool]] = [] - full_query = "EMBEDDING_BACKEND and RERANKER_BACKEND environment variables" - - def fake_search(query: str, _source_path: 
Path, options: SearchOptions | None = None): - recorded_calls.append((query, bool(options.inject_feature_anchors) if options else True)) - if query == full_query: - return ChainSearchResult( - query=query, - results=[ - SearchResult( - path="/repo/src/codexlens/env_config.py", - score=8.5, - excerpt="ENV vars", - ), - SearchResult( - path="/repo/src/codexlens/config.py", - score=8.1, - excerpt="backend config", - ), - ], - symbols=[], - stats=SearchStats(), - ) - - return ChainSearchResult( - query=query, - results=[ - SearchResult( - path="/repo/src/codexlens/env_config.py", - score=7.0, - excerpt="hint candidate", - ) - ], - symbols=[], - stats=SearchStats(), - ) - - monkeypatch.setattr(engine, "search", fake_search) - - anchors = engine._collect_query_feature_anchor_results( - full_query, - temp_paths, - SearchOptions(), - limit=2, - ) - - assert recorded_calls == [ - ("embedding backend", False), - ("reranker backend", False), - (full_query, False), - ] - assert [Path(result.path).name for result in anchors] == ["env_config.py", "config.py"] - assert anchors[0].metadata["feature_query_seed_kind"] == "lexical_query" - assert anchors[0].metadata["feature_query_hint"] == full_query - - -def test_stage3_cluster_prune_preserves_feature_query_anchors(temp_paths: Path) -> None: - registry = RegistryStore(db_path=temp_paths / "registry.db") - registry.initialize() - mapper = PathMapper(index_root=temp_paths / "indexes") - config = Config(data_dir=temp_paths / "data", embedding_use_gpu=False) - config.staged_clustering_strategy = "score" - engine = ChainSearchEngine(registry, mapper, config=config) - - anchor = SearchResult( - path="/repo/src/tools/smart-search.ts", - score=0.02, - excerpt="parse JSON output and strip ANSI", - metadata={ - "feature_query_anchor": True, - "feature_query_hint": "smart search", - "feature_query_hint_tokens": ["smart", "search"], - }, - ) - others = [ - SearchResult( - path=f"/repo/src/feature-{index}.ts", - score=0.9 - (0.05 * index), - 
excerpt="generic feature implementation", - ) - for index in range(6) - ] - - clustered = engine._stage3_cluster_prune( - [anchor, *others], - target_count=4, - query="parse CodexLens JSON output strip ANSI smart_search", - ) - - assert len(clustered) == 4 - assert any(Path(result.path).name == "smart-search.ts" for result in clustered) - - -def test_dense_rerank_cascade_search_interleaves_mixed_embedding_groups( - monkeypatch: pytest.MonkeyPatch, - temp_paths: Path, -) -> None: - import numpy as np - import codexlens.semantic.ann_index as ann_index_module - - registry = RegistryStore(db_path=temp_paths / "registry.db") - registry.initialize() - mapper = PathMapper(index_root=temp_paths / "indexes") - config = Config(data_dir=temp_paths / "data", embedding_use_gpu=False) - engine = ChainSearchEngine(registry, mapper, config=config) - - root_a = temp_paths / "indexes" / "project-a" - root_b = temp_paths / "indexes" / "project-b" - for root in (root_a, root_b): - root.mkdir(parents=True, exist_ok=True) - (root / VECTORS_HNSW_NAME).write_bytes(b"hnsw") - - for meta_db_path, rows in ( - ( - root_a / VECTORS_META_DB_NAME, - [ - (1, str(root_a / "src" / "a.py"), "def a():\n return 1", 1, 2), - (3, str(root_a / "src" / "a2.py"), "def a2():\n return 2", 1, 2), - ], - ), - ( - root_b / VECTORS_META_DB_NAME, - [ - (2, str(root_b / "src" / "b.py"), "def b():\n return 3", 1, 2), - ], - ), - ): - conn = sqlite3.connect(meta_db_path) - conn.execute( - """ - CREATE TABLE chunk_metadata ( - chunk_id INTEGER PRIMARY KEY, - file_path TEXT NOT NULL, - content TEXT NOT NULL, - start_line INTEGER, - end_line INTEGER - ) - """ - ) - conn.executemany( - """ - INSERT INTO chunk_metadata (chunk_id, file_path, content, start_line, end_line) - VALUES (?, ?, ?, ?, ?) 
- """, - rows, - ) - conn.commit() - conn.close() - - index_a = root_a / "src" / "_index.db" - index_b = root_b / "src" / "_index.db" - index_a.parent.mkdir(parents=True, exist_ok=True) - index_b.parent.mkdir(parents=True, exist_ok=True) - index_a.write_text("", encoding="utf-8") - index_b.write_text("", encoding="utf-8") - - class FakeANNIndex: - def __init__(self, index_path: Path, dim: int) -> None: - source = Path(index_path) - self.root = source if source.name != "_index.db" else source.parent - self.dim = dim - - @classmethod - def create_central(cls, *, index_root: Path, dim: int): - return cls(index_root, dim) - - def load(self) -> bool: - return True - - def count(self) -> int: - return 2 if self.root == root_a else 1 - - def search(self, _query_dense, top_k: int): - if self.root == root_a: - return [1, 3][:top_k], [0.01, 0.011][:top_k] - return [2][:top_k], [0.02][:top_k] - - monkeypatch.setattr(engine, "_find_start_index", lambda _source_path: index_a) - monkeypatch.setattr(engine, "_collect_index_paths", lambda _start_index, _depth: [index_a, index_b]) - monkeypatch.setattr( - engine, - "_resolve_dense_embedding_settings", - lambda *, index_root: ( - ("fastembed", "code", False) - if Path(index_root) == root_a - else ("litellm", "qwen3-embedding-sf", False) - ), - ) - monkeypatch.setattr( - engine, - "_embed_dense_query", - lambda _query, *, index_root=None, query_cache=None: ( - np.ones(4, dtype=np.float32) - if Path(index_root) == root_a - else np.ones(8, dtype=np.float32) - ), - ) - monkeypatch.setattr(engine, "_cross_encoder_rerank", lambda _query, results, top_k: results[:top_k]) - monkeypatch.setattr( - engine, - "search", - lambda *_args, **_kwargs: (_ for _ in ()).throw(AssertionError("unexpected fallback")), - ) - monkeypatch.setattr(ann_index_module, "ANNIndex", FakeANNIndex) - - result = engine.dense_rerank_cascade_search( - "route query", - index_a.parent, - k=2, - coarse_k=2, - ) - - assert [Path(item.path).name for item in result.results] 
== ["a.py", "b.py"] - - -def test_dense_rerank_cascade_search_reuses_cached_dense_indexes( - monkeypatch: pytest.MonkeyPatch, - temp_paths: Path, -) -> None: - import numpy as np - import codexlens.semantic.ann_index as ann_index_module - - registry = RegistryStore(db_path=temp_paths / "registry.db") - registry.initialize() - mapper = PathMapper(index_root=temp_paths / "indexes") - config = Config(data_dir=temp_paths / "data", embedding_use_gpu=False) - engine = ChainSearchEngine(registry, mapper, config=config) - - dense_root = temp_paths / "indexes" / "project" - dense_root.mkdir(parents=True, exist_ok=True) - (dense_root / VECTORS_HNSW_NAME).write_bytes(b"hnsw") - - meta_db_path = dense_root / VECTORS_META_DB_NAME - conn = sqlite3.connect(meta_db_path) - conn.execute( - """ - CREATE TABLE chunk_metadata ( - chunk_id INTEGER PRIMARY KEY, - file_path TEXT NOT NULL, - content TEXT NOT NULL, - start_line INTEGER, - end_line INTEGER - ) - """ - ) - conn.execute( - "INSERT INTO chunk_metadata (chunk_id, file_path, content, start_line, end_line) VALUES (?, ?, ?, ?, ?)", - (1, str((temp_paths / "src" / "impl.py").resolve()), "def impl():\n return 1", 1, 2), - ) - conn.commit() - conn.close() - - index_path = dense_root / "src" / "_index.db" - index_path.parent.mkdir(parents=True, exist_ok=True) - index_path.write_text("", encoding="utf-8") - - create_calls: list[tuple[Path, int]] = [] - - class FakeANNIndex: - def __init__(self, root: Path, dim: int) -> None: - self.root = root - self.dim = dim - - @classmethod - def create_central(cls, *, index_root: Path, dim: int): - create_calls.append((Path(index_root), int(dim))) - return cls(index_root, dim) - - def load(self) -> bool: - return True - - def count(self) -> int: - return 1 - - def search(self, _query_dense, top_k: int): - return [1][:top_k], [0.01][:top_k] - - monkeypatch.setattr(engine, "_find_start_index", lambda _source_path: index_path) - monkeypatch.setattr(engine, "_collect_index_paths", lambda _start_index, 
_depth: [index_path]) - monkeypatch.setattr(engine, "_embed_dense_query", lambda *_args, **_kwargs: np.ones(4, dtype=np.float32)) - monkeypatch.setattr(engine, "_cross_encoder_rerank", lambda _query, results, top_k: results[:top_k]) - monkeypatch.setattr( - engine, - "search", - lambda *_args, **_kwargs: (_ for _ in ()).throw(AssertionError("unexpected fallback")), - ) - monkeypatch.setattr(ann_index_module, "ANNIndex", FakeANNIndex) - - first = engine.dense_rerank_cascade_search("route query", index_path.parent, k=1, coarse_k=1) - second = engine.dense_rerank_cascade_search("route query", index_path.parent, k=1, coarse_k=1) - - assert len(first.results) == 1 - assert len(second.results) == 1 - assert create_calls == [(dense_root, 4)] - - -def test_dense_rerank_cascade_search_short_circuits_lexical_priority_queries( - monkeypatch: pytest.MonkeyPatch, - temp_paths: Path, -) -> None: - registry = RegistryStore(db_path=temp_paths / "registry.db") - registry.initialize() - mapper = PathMapper(index_root=temp_paths / "indexes") - config = Config(data_dir=temp_paths / "data") - engine = ChainSearchEngine(registry, mapper, config=config) - - expected = ChainSearchResult( - query="embedding backend fastembed local litellm api config", - results=[SearchResult(path="src/config.py", score=0.9, excerpt="embedding_backend = ...")], - symbols=[], - stats=SearchStats(dirs_searched=3, files_matched=1, time_ms=12.5), - ) - search_calls: list[tuple[str, Path, SearchOptions | None]] = [] - - def fake_search(query: str, source_path: Path, options: SearchOptions | None = None): - search_calls.append((query, source_path, options)) - return expected - - monkeypatch.setattr(engine, "search", fake_search) - monkeypatch.setattr( - engine, - "_find_start_index", - lambda *_args, **_kwargs: (_ for _ in ()).throw(AssertionError("dense path should not run")), - ) - monkeypatch.setattr( - engine, - "_embed_dense_query", - lambda *_args, **_kwargs: (_ for _ in ()).throw(AssertionError("dense 
query should not run")), - ) - monkeypatch.setattr( - engine, - "_cross_encoder_rerank", - lambda *_args, **_kwargs: (_ for _ in ()).throw(AssertionError("rerank should not run")), - ) - - options = SearchOptions( - depth=2, - max_workers=3, - limit_per_dir=4, - total_limit=7, - include_symbols=True, - files_only=False, - code_only=True, - exclude_extensions=["md"], - inject_feature_anchors=False, - ) - - result = engine.dense_rerank_cascade_search( - "embedding backend fastembed local litellm api config", - temp_paths / "workspace", - k=5, - coarse_k=50, - options=options, - ) - - assert result is not expected - assert result.results == expected.results - assert result.related_results == expected.related_results - assert result.symbols == [] - assert result.stats == expected.stats - assert len(search_calls) == 1 - called_query, called_source_path, lexical_options = search_calls[0] - assert called_query == "embedding backend fastembed local litellm api config" - assert called_source_path == temp_paths / "workspace" - assert lexical_options is not None - assert lexical_options.depth == 2 - assert lexical_options.max_workers == 3 - assert lexical_options.limit_per_dir == 10 - assert lexical_options.total_limit == 20 - assert lexical_options.include_symbols is False - assert lexical_options.enable_vector is False - assert lexical_options.hybrid_mode is False - assert lexical_options.enable_cascade is False - assert lexical_options.code_only is True - assert lexical_options.exclude_extensions == ["md"] - assert lexical_options.inject_feature_anchors is False - - -def test_cross_encoder_rerank_reuses_cached_reranker_instance( - monkeypatch: pytest.MonkeyPatch, - temp_paths: Path, -) -> None: - registry = RegistryStore(db_path=temp_paths / "registry.db") - registry.initialize() - mapper = PathMapper(index_root=temp_paths / "indexes") - config = Config( - data_dir=temp_paths / "data", - enable_cross_encoder_rerank=True, - reranker_backend="onnx", - reranker_use_gpu=False, 
- ) - engine = ChainSearchEngine(registry, mapper, config=config) - - calls: dict[str, object] = {"check": [], "get": []} - - class DummyReranker: - def score_pairs(self, pairs, batch_size=32): - _ = batch_size - return [1.0 for _ in pairs] - - def fake_check_reranker_available(backend: str): - calls["check"].append(backend) - return True, None - - def fake_get_reranker(*, backend: str, model_name=None, device=None, **kwargs): - calls["get"].append( - { - "backend": backend, - "model_name": model_name, - "device": device, - "kwargs": kwargs, - } - ) - return DummyReranker() - - monkeypatch.setattr( - "codexlens.semantic.reranker.check_reranker_available", - fake_check_reranker_available, - ) - monkeypatch.setattr( - "codexlens.semantic.reranker.get_reranker", - fake_get_reranker, - ) - - results = [ - SearchResult(path=str((temp_paths / f"file_{idx}.py").resolve()), score=1.0 / (idx + 1), excerpt=f"def fn_{idx}(): pass") - for idx in range(3) - ] - - first = engine._cross_encoder_rerank("find function", results, top_k=2) - second = engine._cross_encoder_rerank("find function", results, top_k=2) - - assert len(first) == len(second) == len(results) - assert calls["check"] == ["onnx"] - assert len(calls["get"]) == 1 - get_call = calls["get"][0] - assert isinstance(get_call, dict) - assert get_call["backend"] == "onnx" - assert get_call["kwargs"]["use_gpu"] is False - - -def test_collect_binary_coarse_candidates_interleaves_mixed_dense_fallback_groups( - monkeypatch: pytest.MonkeyPatch, - temp_paths: Path, -) -> None: - import numpy as np - import codexlens.semantic.ann_index as ann_index_module - - registry = RegistryStore(db_path=temp_paths / "registry.db") - registry.initialize() - mapper = PathMapper(index_root=temp_paths / "indexes") - config = Config(data_dir=temp_paths / "data", embedding_use_gpu=False) - engine = ChainSearchEngine(registry, mapper, config=config) - - root_a = temp_paths / "indexes" / "project-a" - root_b = temp_paths / "indexes" / "project-b" - 
for root in (root_a, root_b): - root.mkdir(parents=True, exist_ok=True) - (root / VECTORS_HNSW_NAME).write_bytes(b"hnsw") - - index_a = root_a / "src" / "_index.db" - index_b = root_b / "src" / "_index.db" - index_a.parent.mkdir(parents=True, exist_ok=True) - index_b.parent.mkdir(parents=True, exist_ok=True) - index_a.write_text("", encoding="utf-8") - index_b.write_text("", encoding="utf-8") - - class FakeANNIndex: - def __init__(self, index_path: Path, dim: int) -> None: - source = Path(index_path) - self.root = source if source.name != "_index.db" else source.parent - self.dim = dim - - @classmethod - def create_central(cls, *, index_root: Path, dim: int): - return cls(index_root, dim) - - def load(self) -> bool: - return True - - def count(self) -> int: - return 2 if self.root == root_a else 1 - - def search(self, _query_dense, top_k: int): - if self.root == root_a: - return [1, 3][:top_k], [0.01, 0.011][:top_k] - return [2][:top_k], [0.02][:top_k] - - monkeypatch.setattr( - engine, - "_resolve_dense_embedding_settings", - lambda *, index_root: ( - ("fastembed", "code", False) - if Path(index_root) == root_a - else ("litellm", "qwen3-embedding-sf", False) - ), - ) - monkeypatch.setattr( - engine, - "_embed_dense_query", - lambda _query, *, index_root=None, query_cache=None: ( - np.ones(4, dtype=np.float32) - if Path(index_root) == root_a - else np.ones(8, dtype=np.float32) - ), - ) - monkeypatch.setattr(ann_index_module, "ANNIndex", FakeANNIndex) - - coarse_candidates, used_centralized, using_dense_fallback, stage2_index_root = ( - engine._collect_binary_coarse_candidates( - "route query", - [index_a, index_b], - coarse_k=2, - stats=SearchStats(), - index_root=index_a.parent, - allow_dense_fallback=True, - ) - ) - - assert used_centralized is False - assert using_dense_fallback is True - assert stage2_index_root is None - assert coarse_candidates == [ - (1, 0.01, root_a), - (2, 0.02, root_b), - ] - - -def 
test_cross_encoder_rerank_deduplicates_duplicate_paths_before_reranking( - monkeypatch: pytest.MonkeyPatch, - temp_paths: Path, -) -> None: - registry = RegistryStore(db_path=temp_paths / "registry.db") - registry.initialize() - mapper = PathMapper(index_root=temp_paths / "indexes") - config = Config(data_dir=temp_paths / "data", embedding_use_gpu=False) - engine = ChainSearchEngine(registry, mapper, config=config) - - captured: dict[str, object] = {} - - monkeypatch.setattr( - "codexlens.semantic.reranker.check_reranker_available", - lambda _backend: (True, None), - ) - monkeypatch.setattr( - "codexlens.semantic.reranker.get_reranker", - lambda **_kwargs: object(), - ) - - def fake_cross_encoder_rerank( - *, - query: str, - results: list[SearchResult], - reranker, - top_k: int = 50, - batch_size: int = 32, - chunk_type_weights=None, - test_file_penalty: float = 0.0, - ) -> list[SearchResult]: - captured["query"] = query - captured["paths"] = [item.path for item in results] - captured["scores"] = [float(item.score) for item in results] - captured["top_k"] = top_k - captured["batch_size"] = batch_size - captured["chunk_type_weights"] = chunk_type_weights - captured["test_file_penalty"] = test_file_penalty - _ = reranker - return results[:top_k] - - monkeypatch.setattr( - "codexlens.search.ranking.cross_encoder_rerank", - fake_cross_encoder_rerank, - ) - - reranked = engine._cross_encoder_rerank( - "semantic auth query", - [ - SearchResult(path="/repo/src/router.py", score=0.91, excerpt="chunk 1"), - SearchResult(path="/repo/src/router.py", score=0.42, excerpt="chunk 2"), - SearchResult(path="/repo/src/config.py", score=0.73, excerpt="chunk 3"), - ], - top_k=5, - ) - - assert captured["query"] == "semantic auth query" - assert captured["paths"] == ["/repo/src/router.py", "/repo/src/config.py"] - assert captured["scores"] == pytest.approx([0.91, 0.73]) - assert captured["top_k"] == 5 - assert len(reranked) == 2 - - -def 
test_binary_cascade_search_merges_multiple_centralized_roots( - monkeypatch: pytest.MonkeyPatch, - temp_paths: Path, -) -> None: - import sqlite3 - import numpy as np - - registry = RegistryStore(db_path=temp_paths / "registry.db") - registry.initialize() - mapper = PathMapper(index_root=temp_paths / "indexes") - config = Config(data_dir=temp_paths / "data", embedding_use_gpu=False) - engine = ChainSearchEngine(registry, mapper, config=config) - - root_a = temp_paths / "indexes" / "project-a" - root_b = temp_paths / "indexes" / "project-b" - source_db_a = root_a / "source-a.db" - source_db_b = root_b / "source-b.db" - - for root, source_db, chunk_id in ((root_a, source_db_a, 1), (root_b, source_db_b, 2)): - root.mkdir(parents=True, exist_ok=True) - (root / BINARY_VECTORS_MMAP_NAME).write_bytes(b"binary") - (root / VECTORS_META_DB_NAME).write_bytes(b"meta") - conn = sqlite3.connect(source_db) - conn.execute("CREATE TABLE semantic_chunks (id INTEGER PRIMARY KEY, embedding_dense BLOB)") - conn.execute( - "INSERT INTO semantic_chunks (id, embedding_dense) VALUES (?, ?)", - (chunk_id, np.ones(4, dtype=np.float32).tobytes()), - ) - conn.commit() - conn.close() - - index_a = root_a / "src" / "_index.db" - index_b = root_b / "src" / "_index.db" - index_a.parent.mkdir(parents=True, exist_ok=True) - index_b.parent.mkdir(parents=True, exist_ok=True) - index_a.write_text("", encoding="utf-8") - index_b.write_text("", encoding="utf-8") - - class FakeBinarySearcher: - def __init__(self, root: Path) -> None: - self.root = root - self.backend = "fastembed" - self.model = None - self.model_profile = "code" - - def search(self, _query_dense, top_k: int): - return [(1, 8)] if self.root == root_a else [(2, 16)] - - class FakeEmbedder: - def embed_to_numpy(self, _queries): - return np.ones((1, 4), dtype=np.float32) - - class FakeVectorMetadataStore: - def __init__(self, path: Path) -> None: - self.path = Path(path) - - def get_chunks_by_ids(self, chunk_ids): - source_db = source_db_a 
if self.path.parent == root_a else source_db_b - return [ - { - "chunk_id": chunk_id, - "file_path": str(self.path.parent / f"file{chunk_id}.py"), - "content": f"chunk {chunk_id}", - "source_index_db": str(source_db), - } - for chunk_id in chunk_ids - ] - - import codexlens.semantic.embedder as embedder_module - - monkeypatch.setattr(engine, "_find_start_index", lambda _source_path: index_a) - monkeypatch.setattr(engine, "_collect_index_paths", lambda _start_index, _depth: [index_a, index_b]) - monkeypatch.setattr( - engine, - "_get_centralized_binary_searcher", - lambda root: FakeBinarySearcher(root), - ) - monkeypatch.setattr(embedder_module, "get_embedder", lambda **_kwargs: FakeEmbedder()) - monkeypatch.setattr(chain_search_module, "VectorMetadataStore", FakeVectorMetadataStore) - monkeypatch.setattr( - engine, - "_embed_dense_query", - lambda _query, *, index_root=None, query_cache=None: np.ones(4, dtype=np.float32), - ) - monkeypatch.setattr(engine, "search", lambda *_args, **_kwargs: (_ for _ in ()).throw(AssertionError("unexpected fallback"))) - - result = engine.binary_cascade_search( - "binary query", - index_a.parent, - k=5, - coarse_k=5, - ) - - assert len(result.results) == 2 - assert {Path(item.path).name for item in result.results} == {"file1.py", "file2.py"} diff --git a/codex-lens/tests/test_cli_help.py b/codex-lens/tests/test_cli_help.py deleted file mode 100644 index dd51f64f..00000000 --- a/codex-lens/tests/test_cli_help.py +++ /dev/null @@ -1,61 +0,0 @@ -"""Smoke tests for CodexLens CLI help output. - -These tests ensure that help text generation does not crash at import time -or during Click/Typer option parsing. 
-""" - -from __future__ import annotations - -import os -import subprocess -import sys -from pathlib import Path - -from typer.testing import CliRunner - - -def _subprocess_env() -> dict[str, str]: - env = os.environ.copy() - codex_lens_root = Path(__file__).resolve().parents[1] - src_dir = codex_lens_root / "src" - existing = env.get("PYTHONPATH", "") - env["PYTHONPATH"] = str(src_dir) + (os.pathsep + existing if existing else "") - return env - - -def test_python_module_help_does_not_crash() -> None: - proc = subprocess.run( - [sys.executable, "-m", "codexlens", "--help"], - capture_output=True, - text=True, - encoding="utf-8", - errors="replace", - env=_subprocess_env(), - ) - assert proc.returncode == 0, proc.stderr - assert "Traceback" not in (proc.stderr or "") - - -def test_typer_app_help_does_not_crash() -> None: - from codexlens.cli.commands import app - - runner = CliRunner() - result = runner.invoke(app, ["--help"]) - assert result.exit_code == 0, result.output - - -def test_extract_embedding_error_uses_details() -> None: - from codexlens.cli.commands import _extract_embedding_error - - embed_result = { - "success": False, - "result": { - "details": [ - {"index_path": "/tmp/a/_index.db", "success": False, "error": "Backend timeout"}, - {"index_path": "/tmp/b/_index.db", "success": False, "error": "Rate limit"}, - ] - }, - } - msg = _extract_embedding_error(embed_result) - assert "Unknown error" not in msg - assert "Backend timeout" in msg diff --git a/codex-lens/tests/test_cli_hybrid_search.py b/codex-lens/tests/test_cli_hybrid_search.py deleted file mode 100644 index 04e5dc78..00000000 --- a/codex-lens/tests/test_cli_hybrid_search.py +++ /dev/null @@ -1,122 +0,0 @@ -"""Tests for CLI hybrid search integration (T6).""" - -import pytest -from typer.testing import CliRunner -from codexlens.cli.commands import app - - -class TestCLIHybridSearch: - """Test CLI integration for hybrid search modes.""" - - @pytest.fixture - def runner(self): - """Create CLI test 
runner.""" - return CliRunner() - - def test_search_mode_parameter_validation(self, runner): - """Test --mode parameter accepts valid modes and rejects invalid ones.""" - # Valid modes should pass validation (even if no index exists) - valid_modes = ["exact", "fuzzy", "hybrid", "vector"] - for mode in valid_modes: - result = runner.invoke(app, ["search", "test", "--mode", mode]) - # Should fail due to no index, not due to invalid mode - # Note: CLI now shows deprecation warning for --mode, use --method instead - assert "Invalid" not in result.output or "deprecated" in result.output.lower() - - # Invalid mode should fail - result = runner.invoke(app, ["search", "test", "--mode", "invalid"]) - assert result.exit_code == 1 - # CLI now shows "Invalid deprecated mode:" instead of "Invalid mode" - assert "Invalid" in result.output and "mode" in result.output.lower() - - def test_weights_parameter_parsing(self, runner): - """Test --weights parameter parses and validates correctly.""" - # Valid weights (3 values summing to ~1.0) - result = runner.invoke( - app, ["search", "test", "--mode", "hybrid", "--weights", "0.5,0.3,0.2"] - ) - # Should not show weight warning - assert "Invalid weights" not in result.output - - # Invalid weights (wrong number of values) - result = runner.invoke( - app, ["search", "test", "--mode", "hybrid", "--weights", "0.5,0.5"] - ) - assert "Invalid weights format" in result.output - - # Invalid weights (non-numeric) - result = runner.invoke( - app, ["search", "test", "--mode", "hybrid", "--weights", "a,b,c"] - ) - assert "Invalid weights format" in result.output - - def test_weights_normalization(self, runner): - """Test weights are normalized when they don't sum to 1.0.""" - # Weights summing to 2.0 should trigger normalization warning - result = runner.invoke( - app, ["search", "test", "--mode", "hybrid", "--weights", "0.8,0.6,0.6"] - ) - # Should show normalization warning - if "Normalizing" in result.output or "Warning" in result.output: - # 
Expected behavior - pass - - def test_search_help_shows_modes(self, runner): - """Test search --help displays all available methods.""" - result = runner.invoke(app, ["search", "--help"]) - assert result.exit_code == 0 - # CLI now uses --method with: dense_rerank, fts, hybrid, cascade - assert "dense_rerank" in result.output or "fts" in result.output - assert "method" in result.output.lower() - - def test_migrate_command_exists(self, runner): - """Test migrate command is registered and accessible.""" - result = runner.invoke(app, ["migrate", "--help"]) - assert result.exit_code == 0 - assert "Dual-FTS upgrade" in result.output - assert "schema version 4" in result.output - - def test_status_command_shows_backends(self, runner): - """Test status command displays search backend availability.""" - result = runner.invoke(app, ["status"]) - # Should show backend status (even if no indexes) - assert "Search Backends" in result.output or result.exit_code == 0 - - -class TestSearchModeMapping: - """Test mode parameter maps correctly to SearchOptions.""" - - @pytest.fixture - def runner(self): - """Create CLI test runner.""" - return CliRunner() - - def test_exact_mode_disables_fuzzy(self, runner): - """Test --mode exact disables fuzzy search.""" - # This would require mocking, but we can verify the parameter is accepted - result = runner.invoke(app, ["search", "test", "--mode", "exact"]) - # Should not show mode validation error - assert "Invalid mode" not in result.output - - def test_fuzzy_mode_enables_only_fuzzy(self, runner): - """Test --mode fuzzy enables fuzzy search only.""" - result = runner.invoke(app, ["search", "test", "--mode", "fuzzy"]) - assert "Invalid mode" not in result.output - - def test_hybrid_mode_enables_both(self, runner): - """Test --mode hybrid enables both exact and fuzzy.""" - result = runner.invoke(app, ["search", "test", "--mode", "hybrid"]) - assert "Invalid mode" not in result.output - - def test_vector_mode_accepted(self, runner): - """Test 
--mode vector is accepted (future feature).""" - result = runner.invoke(app, ["search", "test", "--mode", "vector"]) - assert "Invalid mode" not in result.output - - -def test_cli_imports_successfully(): - """Test CLI modules import without errors.""" - from codexlens.cli import commands, output - - assert hasattr(commands, "app") - assert hasattr(output, "render_search_results") diff --git a/codex-lens/tests/test_cli_output.py b/codex-lens/tests/test_cli_output.py deleted file mode 100644 index 58d631c3..00000000 --- a/codex-lens/tests/test_cli_output.py +++ /dev/null @@ -1,280 +0,0 @@ -"""Tests for CodexLens CLI output functions.""" - -import json -from dataclasses import dataclass -from io import StringIO -from pathlib import Path -from unittest.mock import patch - -import pytest -from rich.console import Console - -from codexlens.cli.output import ( - _to_jsonable, - print_json, - render_file_inspect, - render_search_results, - render_status, - render_symbols, -) -from codexlens.entities import SearchResult, Symbol - - -class TestToJsonable: - """Tests for _to_jsonable helper function.""" - - def test_none_value(self): - """Test converting None.""" - assert _to_jsonable(None) is None - - def test_primitive_values(self): - """Test converting primitive values.""" - assert _to_jsonable("string") == "string" - assert _to_jsonable(42) == 42 - assert _to_jsonable(3.14) == 3.14 - assert _to_jsonable(True) is True - - def test_path_conversion(self): - """Test converting Path to string.""" - path = Path("/test/file.py") - result = _to_jsonable(path) - assert result == str(path) - - def test_dict_conversion(self): - """Test converting dict with nested values.""" - data = {"key": "value", "path": Path("/test.py"), "nested": {"a": 1}} - result = _to_jsonable(data) - assert result["key"] == "value" - # Path conversion uses str(), which may differ by OS - assert result["path"] == str(Path("/test.py")) - assert result["nested"]["a"] == 1 - - def test_list_conversion(self): - 
"""Test converting list with various items.""" - data = ["string", 42, Path("/test.py")] - result = _to_jsonable(data) - assert result == ["string", 42, str(Path("/test.py"))] - - def test_tuple_conversion(self): - """Test converting tuple.""" - data = ("a", "b", Path("/test.py")) - result = _to_jsonable(data) - assert result == ["a", "b", str(Path("/test.py"))] - - def test_set_conversion(self): - """Test converting set.""" - data = {1, 2, 3} - result = _to_jsonable(data) - assert set(result) == {1, 2, 3} - - def test_pydantic_model_conversion(self): - """Test converting Pydantic model.""" - symbol = Symbol(name="test", kind="function", range=(1, 5)) - result = _to_jsonable(symbol) - assert result["name"] == "test" - assert result["kind"] == "function" - assert result["range"] == (1, 5) - - def test_dataclass_conversion(self): - """Test converting dataclass.""" - @dataclass - class TestData: - name: str - value: int - - data = TestData(name="test", value=42) - result = _to_jsonable(data) - assert result["name"] == "test" - assert result["value"] == 42 - - -class TestPrintJson: - """Tests for print_json function.""" - - def test_print_success_json(self, capsys): - """Test printing success JSON.""" - with patch("codexlens.cli.output.console") as mock_console: - captured_output = [] - mock_console.print_json = lambda x: captured_output.append(x) - - print_json(success=True, result={"key": "value"}) - - output = json.loads(captured_output[0]) - assert output["success"] is True - assert output["result"]["key"] == "value" - - def test_print_error_json(self, capsys): - """Test printing error JSON.""" - with patch("codexlens.cli.output.console") as mock_console: - captured_output = [] - mock_console.print_json = lambda x: captured_output.append(x) - - print_json(success=False, error="Something went wrong") - - output = json.loads(captured_output[0]) - assert output["success"] is False - assert output["error"] == "Something went wrong" - - def 
test_print_error_default_message(self, capsys): - """Test printing error with default message.""" - with patch("codexlens.cli.output.console") as mock_console: - captured_output = [] - mock_console.print_json = lambda x: captured_output.append(x) - - print_json(success=False) - - output = json.loads(captured_output[0]) - assert output["error"] == "Unknown error" - - -class TestRenderSearchResults: - """Tests for render_search_results function.""" - - def test_render_empty_results(self): - """Test rendering empty results.""" - with patch("codexlens.cli.output.console") as mock_console: - render_search_results([]) - mock_console.print.assert_called_once() - - def test_render_results_with_data(self): - """Test rendering results with data.""" - results = [ - SearchResult(path="/test/a.py", score=0.95, excerpt="test excerpt"), - SearchResult(path="/test/b.py", score=0.85, excerpt="another excerpt"), - ] - - with patch("codexlens.cli.output.console") as mock_console: - render_search_results(results) - mock_console.print.assert_called_once() - - def test_render_results_custom_title(self): - """Test rendering results with custom title.""" - results = [SearchResult(path="/test.py", score=0.5)] - - with patch("codexlens.cli.output.console") as mock_console: - render_search_results(results, title="Custom Title") - mock_console.print.assert_called_once() - - -class TestRenderSymbols: - """Tests for render_symbols function.""" - - def test_render_empty_symbols(self): - """Test rendering empty symbols list.""" - with patch("codexlens.cli.output.console") as mock_console: - render_symbols([]) - mock_console.print.assert_called_once() - - def test_render_symbols_with_data(self): - """Test rendering symbols with data.""" - symbols = [ - Symbol(name="MyClass", kind="class", range=(1, 10)), - Symbol(name="my_func", kind="function", range=(12, 20)), - ] - - with patch("codexlens.cli.output.console") as mock_console: - render_symbols(symbols) - mock_console.print.assert_called_once() - 
- def test_render_symbols_custom_title(self): - """Test rendering symbols with custom title.""" - symbols = [Symbol(name="test", kind="function", range=(1, 1))] - - with patch("codexlens.cli.output.console") as mock_console: - render_symbols(symbols, title="Functions Found") - mock_console.print.assert_called_once() - - -class TestRenderStatus: - """Tests for render_status function.""" - - def test_render_basic_stats(self): - """Test rendering basic stats.""" - stats = {"files": 100, "symbols": 500} - - with patch("codexlens.cli.output.console") as mock_console: - render_status(stats) - mock_console.print.assert_called_once() - - def test_render_stats_with_nested_dict(self): - """Test rendering stats with nested dict.""" - stats = { - "files": 100, - "languages": {"python": 50, "javascript": 30, "go": 20}, - } - - with patch("codexlens.cli.output.console") as mock_console: - render_status(stats) - mock_console.print.assert_called_once() - - def test_render_stats_with_list(self): - """Test rendering stats with list value.""" - stats = { - "files": 100, - "recent_files": ["/a.py", "/b.py", "/c.py"], - } - - with patch("codexlens.cli.output.console") as mock_console: - render_status(stats) - mock_console.print.assert_called_once() - - -class TestRenderFileInspect: - """Tests for render_file_inspect function.""" - - def test_render_file_with_symbols(self): - """Test rendering file inspection with symbols.""" - symbols = [ - Symbol(name="hello", kind="function", range=(1, 5)), - Symbol(name="MyClass", kind="class", range=(7, 20)), - ] - - with patch("codexlens.cli.output.console") as mock_console: - render_file_inspect("/test/file.py", "python", symbols) - # Should be called twice: once for header, once for symbols table - assert mock_console.print.call_count == 2 - - def test_render_file_without_symbols(self): - """Test rendering file inspection without symbols.""" - with patch("codexlens.cli.output.console") as mock_console: - render_file_inspect("/test/file.py", 
"python", []) - assert mock_console.print.call_count == 2 - - -class TestJsonOutputIntegration: - """Integration tests for JSON output.""" - - def test_search_result_to_json(self): - """Test converting SearchResult to JSON.""" - result = SearchResult( - path="/test.py", - score=0.95, - excerpt="test code here", - metadata={"line": 10}, - ) - - jsonable = _to_jsonable(result) - # Verify it can be JSON serialized - json_str = json.dumps(jsonable) - parsed = json.loads(json_str) - - assert parsed["path"] == "/test.py" - assert parsed["score"] == 0.95 - assert parsed["excerpt"] == "test code here" - - def test_nested_results_to_json(self): - """Test converting nested structure to JSON.""" - data = { - "query": "test", - "results": [ - SearchResult(path="/a.py", score=0.9), - SearchResult(path="/b.py", score=0.8), - ], - } - - jsonable = _to_jsonable(data) - json_str = json.dumps(jsonable) - parsed = json.loads(json_str) - - assert parsed["query"] == "test" - assert len(parsed["results"]) == 2 diff --git a/codex-lens/tests/test_clustering_strategies.py b/codex-lens/tests/test_clustering_strategies.py deleted file mode 100644 index fc559f5f..00000000 --- a/codex-lens/tests/test_clustering_strategies.py +++ /dev/null @@ -1,786 +0,0 @@ -"""Unit tests for clustering strategies in the hybrid search pipeline. - -Tests cover: -1. HDBSCANStrategy - Primary HDBSCAN clustering -2. DBSCANStrategy - Fallback DBSCAN clustering -3. NoOpStrategy - No-op fallback when clustering unavailable -4. 
ClusteringStrategyFactory - Factory with fallback chain -""" - -from __future__ import annotations - -from typing import List -from unittest.mock import MagicMock, patch - -import pytest - -from codexlens.entities import SearchResult -from codexlens.search.clustering import ( - BaseClusteringStrategy, - ClusteringConfig, - ClusteringStrategyFactory, - NoOpStrategy, - check_clustering_strategy_available, - get_strategy, -) - - -# ============================================================================= -# Test Fixtures -# ============================================================================= - - -@pytest.fixture -def sample_results() -> List[SearchResult]: - """Create sample search results for testing.""" - return [ - SearchResult(path="a.py", score=0.9, excerpt="def foo(): pass"), - SearchResult(path="b.py", score=0.8, excerpt="def foo(): pass"), - SearchResult(path="c.py", score=0.7, excerpt="def bar(): pass"), - SearchResult(path="d.py", score=0.6, excerpt="def bar(): pass"), - SearchResult(path="e.py", score=0.5, excerpt="def baz(): pass"), - ] - - -@pytest.fixture -def mock_embeddings(): - """Create mock embeddings for 5 results. 
- - Creates embeddings that should form 2 clusters: - - Results 0, 1 (similar to each other) - - Results 2, 3 (similar to each other) - - Result 4 (noise/singleton) - """ - import numpy as np - - # Create embeddings in 3D for simplicity - return np.array( - [ - [1.0, 0.0, 0.0], # Result 0 - cluster A - [0.9, 0.1, 0.0], # Result 1 - cluster A - [0.0, 1.0, 0.0], # Result 2 - cluster B - [0.1, 0.9, 0.0], # Result 3 - cluster B - [0.0, 0.0, 1.0], # Result 4 - noise/singleton - ], - dtype=np.float32, - ) - - -@pytest.fixture -def default_config() -> ClusteringConfig: - """Create default clustering configuration.""" - return ClusteringConfig( - min_cluster_size=2, - min_samples=1, - metric="euclidean", - ) - - -# ============================================================================= -# Test ClusteringConfig -# ============================================================================= - - -class TestClusteringConfig: - """Tests for ClusteringConfig validation.""" - - def test_default_values(self): - """Test default configuration values.""" - config = ClusteringConfig() - assert config.min_cluster_size == 3 - assert config.min_samples == 2 - assert config.metric == "cosine" - assert config.cluster_selection_epsilon == 0.0 - assert config.allow_single_cluster is True - assert config.prediction_data is False - - def test_custom_values(self): - """Test custom configuration values.""" - config = ClusteringConfig( - min_cluster_size=5, - min_samples=3, - metric="euclidean", - cluster_selection_epsilon=0.1, - allow_single_cluster=False, - prediction_data=True, - ) - assert config.min_cluster_size == 5 - assert config.min_samples == 3 - assert config.metric == "euclidean" - - def test_invalid_min_cluster_size(self): - """Test validation rejects min_cluster_size < 2.""" - with pytest.raises(ValueError, match="min_cluster_size must be >= 2"): - ClusteringConfig(min_cluster_size=1) - - def test_invalid_min_samples(self): - """Test validation rejects min_samples < 1.""" - 
with pytest.raises(ValueError, match="min_samples must be >= 1"): - ClusteringConfig(min_samples=0) - - def test_invalid_metric(self): - """Test validation rejects invalid metric.""" - with pytest.raises(ValueError, match="metric must be one of"): - ClusteringConfig(metric="invalid") - - def test_invalid_epsilon(self): - """Test validation rejects negative epsilon.""" - with pytest.raises(ValueError, match="cluster_selection_epsilon must be >= 0"): - ClusteringConfig(cluster_selection_epsilon=-0.1) - - -# ============================================================================= -# Test NoOpStrategy -# ============================================================================= - - -class TestNoOpStrategy: - """Tests for NoOpStrategy - always available.""" - - def test_cluster_returns_singleton_clusters( - self, sample_results: List[SearchResult], mock_embeddings - ): - """Test cluster() returns each result as singleton cluster.""" - strategy = NoOpStrategy() - clusters = strategy.cluster(mock_embeddings, sample_results) - - assert len(clusters) == 5 - for i, cluster in enumerate(clusters): - assert cluster == [i] - - def test_cluster_empty_results(self): - """Test cluster() with empty results.""" - import numpy as np - - strategy = NoOpStrategy() - clusters = strategy.cluster(np.array([]), []) - - assert clusters == [] - - def test_select_representatives_returns_all_sorted( - self, sample_results: List[SearchResult] - ): - """Test select_representatives() returns all results sorted by score.""" - strategy = NoOpStrategy() - clusters = [[i] for i in range(len(sample_results))] - representatives = strategy.select_representatives(clusters, sample_results) - - assert len(representatives) == 5 - # Check sorted by score descending - scores = [r.score for r in representatives] - assert scores == sorted(scores, reverse=True) - - def test_select_representatives_empty(self): - """Test select_representatives() with empty input.""" - strategy = NoOpStrategy() - 
representatives = strategy.select_representatives([], []) - assert representatives == [] - - def test_fit_predict_convenience_method( - self, sample_results: List[SearchResult], mock_embeddings - ): - """Test fit_predict() convenience method.""" - strategy = NoOpStrategy() - representatives = strategy.fit_predict(mock_embeddings, sample_results) - - assert len(representatives) == 5 - # All results returned, sorted by score - assert representatives[0].score >= representatives[-1].score - - -# ============================================================================= -# Test HDBSCANStrategy -# ============================================================================= - - -class TestHDBSCANStrategy: - """Tests for HDBSCANStrategy - requires hdbscan package.""" - - @pytest.fixture - def hdbscan_strategy(self, default_config): - """Create HDBSCANStrategy if available.""" - try: - from codexlens.search.clustering import HDBSCANStrategy - - return HDBSCANStrategy(default_config) - except ImportError: - pytest.skip("hdbscan not installed") - - def test_cluster_returns_list_of_lists( - self, hdbscan_strategy, sample_results: List[SearchResult], mock_embeddings - ): - """Test cluster() returns List[List[int]].""" - clusters = hdbscan_strategy.cluster(mock_embeddings, sample_results) - - assert isinstance(clusters, list) - for cluster in clusters: - assert isinstance(cluster, list) - for idx in cluster: - assert isinstance(idx, int) - assert 0 <= idx < len(sample_results) - - def test_cluster_covers_all_results( - self, hdbscan_strategy, sample_results: List[SearchResult], mock_embeddings - ): - """Test all result indices appear in clusters.""" - clusters = hdbscan_strategy.cluster(mock_embeddings, sample_results) - - all_indices = set() - for cluster in clusters: - all_indices.update(cluster) - - assert all_indices == set(range(len(sample_results))) - - def test_cluster_supports_cosine_metric( - self, sample_results: List[SearchResult], mock_embeddings - ): - """Test 
HDBSCANStrategy can run with metric='cosine' (via precomputed distances).""" - try: - from codexlens.search.clustering import HDBSCANStrategy - except ImportError: - pytest.skip("hdbscan not installed") - - config = ClusteringConfig(min_cluster_size=2, min_samples=1, metric="cosine") - strategy = HDBSCANStrategy(config) - - clusters = strategy.cluster(mock_embeddings, sample_results) - - all_indices = set() - for cluster in clusters: - all_indices.update(cluster) - - assert all_indices == set(range(len(sample_results))) - - def test_cluster_empty_results(self, hdbscan_strategy): - """Test cluster() with empty results.""" - import numpy as np - - clusters = hdbscan_strategy.cluster(np.array([]).reshape(0, 3), []) - assert clusters == [] - - def test_cluster_single_result(self, hdbscan_strategy): - """Test cluster() with single result.""" - import numpy as np - - result = SearchResult(path="a.py", score=0.9, excerpt="test") - embeddings = np.array([[1.0, 0.0, 0.0]]) - clusters = hdbscan_strategy.cluster(embeddings, [result]) - - assert len(clusters) == 1 - assert clusters[0] == [0] - - def test_cluster_fewer_than_min_cluster_size(self, hdbscan_strategy): - """Test cluster() with fewer results than min_cluster_size.""" - import numpy as np - - # Strategy has min_cluster_size=2, so 1 result returns singleton - result = SearchResult(path="a.py", score=0.9, excerpt="test") - embeddings = np.array([[1.0, 0.0, 0.0]]) - clusters = hdbscan_strategy.cluster(embeddings, [result]) - - assert len(clusters) == 1 - assert clusters[0] == [0] - - def test_select_representatives_picks_highest_score( - self, hdbscan_strategy, sample_results: List[SearchResult], mock_embeddings - ): - """Test select_representatives() picks highest score per cluster.""" - clusters = hdbscan_strategy.cluster(mock_embeddings, sample_results) - representatives = hdbscan_strategy.select_representatives( - clusters, sample_results - ) - - # Each representative should be the highest-scored in its cluster - 
for rep in representatives: - # Find the cluster containing this representative - rep_idx = next( - i for i, r in enumerate(sample_results) if r.path == rep.path - ) - for cluster in clusters: - if rep_idx in cluster: - cluster_scores = [sample_results[i].score for i in cluster] - assert rep.score == max(cluster_scores) - break - - def test_select_representatives_sorted_by_score( - self, hdbscan_strategy, sample_results: List[SearchResult], mock_embeddings - ): - """Test representatives are sorted by score descending.""" - clusters = hdbscan_strategy.cluster(mock_embeddings, sample_results) - representatives = hdbscan_strategy.select_representatives( - clusters, sample_results - ) - - scores = [r.score for r in representatives] - assert scores == sorted(scores, reverse=True) - - def test_fit_predict_end_to_end( - self, hdbscan_strategy, sample_results: List[SearchResult], mock_embeddings - ): - """Test fit_predict() end-to-end clustering.""" - representatives = hdbscan_strategy.fit_predict(mock_embeddings, sample_results) - - # Should have fewer or equal representatives than input - assert len(representatives) <= len(sample_results) - # All representatives should be from original results - rep_paths = {r.path for r in representatives} - original_paths = {r.path for r in sample_results} - assert rep_paths.issubset(original_paths) - - -# ============================================================================= -# Test DBSCANStrategy -# ============================================================================= - - -class TestDBSCANStrategy: - """Tests for DBSCANStrategy - requires sklearn.""" - - @pytest.fixture - def dbscan_strategy(self, default_config): - """Create DBSCANStrategy if available.""" - try: - from codexlens.search.clustering import DBSCANStrategy - - return DBSCANStrategy(default_config) - except ImportError: - pytest.skip("sklearn not installed") - - def test_cluster_returns_list_of_lists( - self, dbscan_strategy, sample_results: 
List[SearchResult], mock_embeddings - ): - """Test cluster() returns List[List[int]].""" - clusters = dbscan_strategy.cluster(mock_embeddings, sample_results) - - assert isinstance(clusters, list) - for cluster in clusters: - assert isinstance(cluster, list) - for idx in cluster: - assert isinstance(idx, int) - assert 0 <= idx < len(sample_results) - - def test_cluster_covers_all_results( - self, dbscan_strategy, sample_results: List[SearchResult], mock_embeddings - ): - """Test all result indices appear in clusters.""" - clusters = dbscan_strategy.cluster(mock_embeddings, sample_results) - - all_indices = set() - for cluster in clusters: - all_indices.update(cluster) - - assert all_indices == set(range(len(sample_results))) - - def test_cluster_empty_results(self, dbscan_strategy): - """Test cluster() with empty results.""" - import numpy as np - - clusters = dbscan_strategy.cluster(np.array([]).reshape(0, 3), []) - assert clusters == [] - - def test_cluster_single_result(self, dbscan_strategy): - """Test cluster() with single result.""" - import numpy as np - - result = SearchResult(path="a.py", score=0.9, excerpt="test") - embeddings = np.array([[1.0, 0.0, 0.0]]) - clusters = dbscan_strategy.cluster(embeddings, [result]) - - assert len(clusters) == 1 - assert clusters[0] == [0] - - def test_cluster_with_explicit_eps(self, default_config): - """Test cluster() with explicit eps parameter.""" - try: - from codexlens.search.clustering import DBSCANStrategy - except ImportError: - pytest.skip("sklearn not installed") - - import numpy as np - - strategy = DBSCANStrategy(default_config, eps=0.5) - results = [SearchResult(path=f"{i}.py", score=0.5, excerpt="test") for i in range(3)] - embeddings = np.array([[0.0, 0.0], [0.1, 0.0], [1.0, 1.0]]) - - clusters = strategy.cluster(embeddings, results) - # With eps=0.5, first two should cluster, third should be separate - assert len(clusters) >= 2 - - def test_auto_compute_eps(self, dbscan_strategy, mock_embeddings): - """Test 
eps auto-computation from distance distribution.""" - # Should not raise - eps is computed automatically - results = [SearchResult(path=f"{i}.py", score=0.5, excerpt="test") for i in range(5)] - clusters = dbscan_strategy.cluster(mock_embeddings, results) - assert len(clusters) > 0 - - def test_select_representatives_picks_highest_score( - self, dbscan_strategy, sample_results: List[SearchResult], mock_embeddings - ): - """Test select_representatives() picks highest score per cluster.""" - clusters = dbscan_strategy.cluster(mock_embeddings, sample_results) - representatives = dbscan_strategy.select_representatives( - clusters, sample_results - ) - - # Each representative should be the highest-scored in its cluster - for rep in representatives: - rep_idx = next( - i for i, r in enumerate(sample_results) if r.path == rep.path - ) - for cluster in clusters: - if rep_idx in cluster: - cluster_scores = [sample_results[i].score for i in cluster] - assert rep.score == max(cluster_scores) - break - - def test_select_representatives_sorted_by_score( - self, dbscan_strategy, sample_results: List[SearchResult], mock_embeddings - ): - """Test representatives are sorted by score descending.""" - clusters = dbscan_strategy.cluster(mock_embeddings, sample_results) - representatives = dbscan_strategy.select_representatives( - clusters, sample_results - ) - - scores = [r.score for r in representatives] - assert scores == sorted(scores, reverse=True) - - -# ============================================================================= -# Test ClusteringStrategyFactory -# ============================================================================= - - -class TestClusteringStrategyFactory: - """Tests for ClusteringStrategyFactory.""" - - def test_check_noop_always_available(self): - """Test noop strategy is always available.""" - ok, err = check_clustering_strategy_available("noop") - assert ok is True - assert err is None - - def test_check_invalid_strategy(self): - """Test invalid 
strategy name returns error.""" - ok, err = check_clustering_strategy_available("invalid") - assert ok is False - assert "Invalid clustering strategy" in err - - def test_get_strategy_noop(self, default_config): - """Test get_strategy('noop') returns NoOpStrategy.""" - strategy = get_strategy("noop", default_config) - assert isinstance(strategy, NoOpStrategy) - - def test_get_strategy_auto_returns_something(self, default_config): - """Test get_strategy('auto') returns a strategy.""" - strategy = get_strategy("auto", default_config) - assert isinstance(strategy, BaseClusteringStrategy) - - def test_get_strategy_with_fallback_enabled(self, default_config): - """Test fallback when primary strategy unavailable.""" - # Mock hdbscan unavailable - with patch.dict("sys.modules", {"hdbscan": None}): - # Should fall back to dbscan or noop - strategy = get_strategy("hdbscan", default_config, fallback=True) - assert isinstance(strategy, BaseClusteringStrategy) - - def test_get_strategy_fallback_disabled_raises(self, default_config): - """Test ImportError when fallback disabled and strategy unavailable.""" - with patch( - "codexlens.search.clustering.factory.check_clustering_strategy_available" - ) as mock_check: - mock_check.return_value = (False, "Test error") - - with pytest.raises(ImportError, match="Test error"): - get_strategy("hdbscan", default_config, fallback=False) - - def test_get_strategy_invalid_raises(self, default_config): - """Test ValueError for invalid strategy name.""" - with pytest.raises(ValueError, match="Unknown clustering strategy"): - get_strategy("invalid", default_config) - - def test_factory_class_interface(self, default_config): - """Test ClusteringStrategyFactory class interface.""" - strategy = ClusteringStrategyFactory.get_strategy("noop", default_config) - assert isinstance(strategy, NoOpStrategy) - - ok, err = ClusteringStrategyFactory.check_available("noop") - assert ok is True - - @pytest.mark.skipif( - not 
check_clustering_strategy_available("hdbscan")[0], - reason="hdbscan not installed", - ) - def test_get_strategy_hdbscan(self, default_config): - """Test get_strategy('hdbscan') returns HDBSCANStrategy.""" - from codexlens.search.clustering import HDBSCANStrategy - - strategy = get_strategy("hdbscan", default_config) - assert isinstance(strategy, HDBSCANStrategy) - - @pytest.mark.skipif( - not check_clustering_strategy_available("dbscan")[0], - reason="sklearn not installed", - ) - def test_get_strategy_dbscan(self, default_config): - """Test get_strategy('dbscan') returns DBSCANStrategy.""" - from codexlens.search.clustering import DBSCANStrategy - - strategy = get_strategy("dbscan", default_config) - assert isinstance(strategy, DBSCANStrategy) - - @pytest.mark.skipif( - not check_clustering_strategy_available("dbscan")[0], - reason="sklearn not installed", - ) - def test_get_strategy_dbscan_with_kwargs(self, default_config): - """Test DBSCANStrategy kwargs passed through factory.""" - strategy = get_strategy("dbscan", default_config, eps=0.3, eps_percentile=20.0) - assert strategy.eps == 0.3 - assert strategy.eps_percentile == 20.0 - - -# ============================================================================= -# Integration Tests -# ============================================================================= - - -class TestClusteringIntegration: - """Integration tests for clustering strategies.""" - - def test_all_strategies_same_interface( - self, sample_results: List[SearchResult], mock_embeddings, default_config - ): - """Test all strategies have consistent interface.""" - strategies = [NoOpStrategy(default_config)] - - # Add available strategies - try: - from codexlens.search.clustering import HDBSCANStrategy - - strategies.append(HDBSCANStrategy(default_config)) - except ImportError: - pass - - try: - from codexlens.search.clustering import DBSCANStrategy - - strategies.append(DBSCANStrategy(default_config)) - except ImportError: - pass - - for 
strategy in strategies: - # All should implement cluster() - clusters = strategy.cluster(mock_embeddings, sample_results) - assert isinstance(clusters, list) - - # All should implement select_representatives() - reps = strategy.select_representatives(clusters, sample_results) - assert isinstance(reps, list) - assert all(isinstance(r, SearchResult) for r in reps) - - # All should implement fit_predict() - reps = strategy.fit_predict(mock_embeddings, sample_results) - assert isinstance(reps, list) - - def test_clustering_reduces_redundancy( - self, default_config - ): - """Test clustering reduces redundant similar results.""" - import numpy as np - - # Create results with very similar embeddings - results = [ - SearchResult(path=f"{i}.py", score=0.9 - i * 0.01, excerpt="def foo(): pass") - for i in range(10) - ] - # Very similar embeddings - should cluster together - embeddings = np.array( - [[1.0 + i * 0.01, 0.0, 0.0] for i in range(10)], dtype=np.float32 - ) - - strategy = get_strategy("auto", default_config) - representatives = strategy.fit_predict(embeddings, results) - - # Should have fewer representatives than input (clustering reduced redundancy) - # NoOp returns all, but HDBSCAN/DBSCAN should reduce - assert len(representatives) <= len(results) - - -# ============================================================================= -# Test FrequencyStrategy -# ============================================================================= - - -class TestFrequencyStrategy: - """Tests for FrequencyStrategy - frequency-based clustering.""" - - @pytest.fixture - def frequency_config(self): - """Create FrequencyConfig for testing.""" - from codexlens.search.clustering import FrequencyConfig - return FrequencyConfig(min_frequency=1, max_representatives_per_group=3) - - @pytest.fixture - def frequency_strategy(self, frequency_config): - """Create FrequencyStrategy instance.""" - from codexlens.search.clustering import FrequencyStrategy - return 
FrequencyStrategy(frequency_config) - - @pytest.fixture - def symbol_results(self) -> List[SearchResult]: - """Create sample results with symbol names for frequency testing.""" - return [ - SearchResult(path="auth.py", score=0.9, excerpt="authenticate user", symbol_name="authenticate"), - SearchResult(path="login.py", score=0.85, excerpt="authenticate login", symbol_name="authenticate"), - SearchResult(path="session.py", score=0.8, excerpt="authenticate session", symbol_name="authenticate"), - SearchResult(path="utils.py", score=0.7, excerpt="helper function", symbol_name="helper_func"), - SearchResult(path="validate.py", score=0.6, excerpt="validate input", symbol_name="validate"), - SearchResult(path="check.py", score=0.55, excerpt="validate data", symbol_name="validate"), - ] - - def test_frequency_strategy_available(self): - """Test FrequencyStrategy is always available (no deps).""" - ok, err = check_clustering_strategy_available("frequency") - assert ok is True - assert err is None - - def test_get_strategy_frequency(self): - """Test get_strategy('frequency') returns FrequencyStrategy.""" - from codexlens.search.clustering import FrequencyStrategy - strategy = get_strategy("frequency") - assert isinstance(strategy, FrequencyStrategy) - - def test_cluster_groups_by_symbol(self, frequency_strategy, symbol_results): - """Test cluster() groups results by symbol name.""" - import numpy as np - embeddings = np.random.rand(len(symbol_results), 128) - - clusters = frequency_strategy.cluster(embeddings, symbol_results) - - # Should have 3 groups: authenticate(3), validate(2), helper_func(1) - assert len(clusters) == 3 - - # First cluster should be authenticate (highest frequency) - first_cluster_symbols = [symbol_results[i].symbol_name for i in clusters[0]] - assert all(s == "authenticate" for s in first_cluster_symbols) - assert len(clusters[0]) == 3 - - def test_cluster_orders_by_frequency(self, frequency_strategy, symbol_results): - """Test clusters are ordered by 
frequency (descending).""" - import numpy as np - embeddings = np.random.rand(len(symbol_results), 128) - - clusters = frequency_strategy.cluster(embeddings, symbol_results) - - # Verify frequency ordering - frequencies = [len(c) for c in clusters] - assert frequencies == sorted(frequencies, reverse=True) - - def test_select_representatives_adds_frequency_metadata(self, frequency_strategy, symbol_results): - """Test representatives have frequency metadata.""" - import numpy as np - embeddings = np.random.rand(len(symbol_results), 128) - - clusters = frequency_strategy.cluster(embeddings, symbol_results) - reps = frequency_strategy.select_representatives(clusters, symbol_results, embeddings) - - # Check frequency metadata - for rep in reps: - assert "frequency" in rep.metadata - assert rep.metadata["frequency"] >= 1 - - def test_min_frequency_filter_mode(self, symbol_results): - """Test min_frequency with filter mode removes low-frequency results.""" - from codexlens.search.clustering import FrequencyStrategy, FrequencyConfig - import numpy as np - - config = FrequencyConfig(min_frequency=2, keep_mode="filter") - strategy = FrequencyStrategy(config) - embeddings = np.random.rand(len(symbol_results), 128) - - reps = strategy.fit_predict(embeddings, symbol_results) - - # helper_func (freq=1) should be filtered out - rep_symbols = [r.symbol_name for r in reps] - assert "helper_func" not in rep_symbols - assert "authenticate" in rep_symbols - assert "validate" in rep_symbols - - def test_min_frequency_demote_mode(self, symbol_results): - """Test min_frequency with demote mode keeps but deprioritizes low-frequency.""" - from codexlens.search.clustering import FrequencyStrategy, FrequencyConfig - import numpy as np - - config = FrequencyConfig(min_frequency=2, keep_mode="demote") - strategy = FrequencyStrategy(config) - embeddings = np.random.rand(len(symbol_results), 128) - - reps = strategy.fit_predict(embeddings, symbol_results) - - # helper_func should still be 
present but at the end - rep_symbols = [r.symbol_name for r in reps] - assert "helper_func" in rep_symbols - # Should be demoted to end - helper_idx = rep_symbols.index("helper_func") - assert helper_idx == len(rep_symbols) - 1 - - def test_group_by_file(self, symbol_results): - """Test grouping by file path instead of symbol.""" - from codexlens.search.clustering import FrequencyStrategy, FrequencyConfig - import numpy as np - - config = FrequencyConfig(group_by="file") - strategy = FrequencyStrategy(config) - embeddings = np.random.rand(len(symbol_results), 128) - - clusters = strategy.cluster(embeddings, symbol_results) - - # Each file should be its own group (all unique paths) - assert len(clusters) == 6 - - def test_max_representatives_per_group(self, symbol_results): - """Test max_representatives_per_group limits output per symbol.""" - from codexlens.search.clustering import FrequencyStrategy, FrequencyConfig - import numpy as np - - config = FrequencyConfig(max_representatives_per_group=1) - strategy = FrequencyStrategy(config) - embeddings = np.random.rand(len(symbol_results), 128) - - reps = strategy.fit_predict(embeddings, symbol_results) - - # Should have at most 1 per group = 3 groups = 3 reps - assert len(reps) == 3 - - def test_frequency_boost_score(self, symbol_results): - """Test frequency_weight boosts high-frequency results.""" - from codexlens.search.clustering import FrequencyStrategy, FrequencyConfig - import numpy as np - - config = FrequencyConfig(frequency_weight=0.5) # Strong boost - strategy = FrequencyStrategy(config) - embeddings = np.random.rand(len(symbol_results), 128) - - reps = strategy.fit_predict(embeddings, symbol_results) - - # High-frequency results should have boosted scores in metadata - for rep in reps: - if rep.metadata.get("frequency", 1) > 1: - assert rep.metadata.get("frequency_boosted_score", 0) > rep.score - - def test_empty_results(self, frequency_strategy): - """Test handling of empty results.""" - import numpy as 
np - - clusters = frequency_strategy.cluster(np.array([]).reshape(0, 128), []) - assert clusters == [] - - reps = frequency_strategy.select_representatives([], [], None) - assert reps == [] - - def test_factory_with_kwargs(self): - """Test factory passes kwargs to FrequencyConfig.""" - strategy = get_strategy("frequency", min_frequency=3, group_by="file") - assert strategy.config.min_frequency == 3 - assert strategy.config.group_by == "file" diff --git a/codex-lens/tests/test_code_extractor.py b/codex-lens/tests/test_code_extractor.py deleted file mode 100644 index bb83279f..00000000 --- a/codex-lens/tests/test_code_extractor.py +++ /dev/null @@ -1,342 +0,0 @@ -"""Tests for code extractor functionality.""" - -import tempfile -from pathlib import Path - -import pytest - -from codexlens.entities import SearchResult, Symbol -from codexlens.semantic.code_extractor import ( - CodeBlockResult, - extract_complete_code_block, - extract_symbol_with_context, - format_search_result_code, - get_code_block_summary, - enhance_search_results, -) - - -class TestExtractCompleteCodeBlock: - """Test extract_complete_code_block function.""" - - def test_returns_stored_content(self): - """Test returns content when available in result.""" - result = SearchResult( - path="/test.py", - score=0.9, - content="def hello():\n return 'world'", - start_line=1, - end_line=2, - ) - - code = extract_complete_code_block(result) - assert code == "def hello():\n return 'world'" - - def test_reads_from_file_when_no_content(self, tmp_path): - """Test reads from file when content not in result.""" - test_file = tmp_path / "test.py" - test_file.write_text("""# Header comment -def hello(): - '''Docstring''' - return 'world' - -def goodbye(): - pass -""") - - result = SearchResult( - path=str(test_file), - score=0.9, - excerpt="def hello():", - start_line=2, - end_line=4, - ) - - code = extract_complete_code_block(result) - assert "def hello():" in code - assert "return 'world'" in code - - def 
test_adds_context_lines(self, tmp_path): - """Test adding context lines.""" - test_file = tmp_path / "test.py" - test_file.write_text("""# Line 1 -# Line 2 -def hello(): - return 'world' -# Line 5 -# Line 6 -""") - - result = SearchResult( - path=str(test_file), - score=0.9, - start_line=3, - end_line=4, - ) - - code = extract_complete_code_block(result, context_lines=1) - assert "# Line 2" in code - assert "# Line 5" in code - - -class TestExtractSymbolWithContext: - """Test extract_symbol_with_context function.""" - - def test_extracts_with_decorators(self, tmp_path): - """Test extracting symbol with decorators.""" - test_file = tmp_path / "test.py" - # Line 1: @decorator - # Line 2: @another_decorator - # Line 3: def hello(): - # Line 4: return 'world' - test_file.write_text("@decorator\n@another_decorator\ndef hello():\n return 'world'\n") - - symbol = Symbol(name="hello", kind="function", range=(3, 4)) - code = extract_symbol_with_context(str(test_file), symbol) - - assert "@decorator" in code - assert "@another_decorator" in code - assert "def hello():" in code - - -class TestFormatSearchResultCode: - """Test format_search_result_code function.""" - - def test_format_with_line_numbers(self): - """Test formatting with line numbers.""" - result = SearchResult( - path="/test.py", - score=0.9, - content="def hello():\n return 'world'", - start_line=10, - end_line=11, - ) - - formatted = format_search_result_code(result, show_line_numbers=True) - assert " 10 |" in formatted - assert " 11 |" in formatted - - def test_format_truncation(self): - """Test max_lines truncation.""" - result = SearchResult( - path="/test.py", - score=0.9, - content="line1\nline2\nline3\nline4\nline5", - start_line=1, - end_line=5, - ) - - formatted = format_search_result_code(result, max_lines=2) - assert "(truncated)" in formatted - - def test_format_without_line_numbers(self): - """Test formatting without line numbers.""" - result = SearchResult( - path="/test.py", - score=0.9, - 
content="def hello():\n pass", - start_line=1, - end_line=2, - ) - - formatted = format_search_result_code(result, show_line_numbers=False) - assert "def hello():" in formatted - assert " | " not in formatted - - -class TestGetCodeBlockSummary: - """Test get_code_block_summary function.""" - - def test_summary_with_symbol(self): - """Test summary with symbol info.""" - result = SearchResult( - path="/test.py", - score=0.9, - symbol_name="hello", - symbol_kind="function", - start_line=10, - end_line=20, - ) - - summary = get_code_block_summary(result) - assert "function" in summary - assert "hello" in summary - assert "10-20" in summary - assert "test.py" in summary - - def test_summary_single_line(self): - """Test summary for single line.""" - result = SearchResult( - path="/test.py", - score=0.9, - start_line=5, - end_line=5, - ) - - summary = get_code_block_summary(result) - assert "line 5" in summary - - -class TestCodeBlockResult: - """Test CodeBlockResult class.""" - - def test_properties(self): - """Test CodeBlockResult properties.""" - result = SearchResult( - path="/path/to/test.py", - score=0.85, - content="def hello(): pass", - symbol_name="hello", - symbol_kind="function", - start_line=1, - end_line=1, - ) - - block = CodeBlockResult(result) - - assert block.score == 0.85 - assert block.path == "/path/to/test.py" - assert block.file_name == "test.py" - assert block.symbol_name == "hello" - assert block.symbol_kind == "function" - assert block.line_range == (1, 1) - assert block.full_code == "def hello(): pass" - - def test_summary(self): - """Test CodeBlockResult summary.""" - result = SearchResult( - path="/test.py", - score=0.9, - symbol_name="Calculator", - symbol_kind="class", - start_line=10, - end_line=50, - ) - - block = CodeBlockResult(result) - summary = block.summary - - assert "class" in summary - assert "Calculator" in summary - - def test_format(self): - """Test CodeBlockResult format.""" - result = SearchResult( - path="/test.py", - 
score=0.9, - content="def hello():\n return 42", - start_line=1, - end_line=2, - ) - - block = CodeBlockResult(result) - formatted = block.format(show_line_numbers=True) - - assert " 1 |" in formatted - assert "def hello():" in formatted - - -class TestEnhanceSearchResults: - """Test enhance_search_results function.""" - - def test_enhances_results(self): - """Test enhancing search results.""" - results = [ - SearchResult(path="/a.py", score=0.9, content="def a(): pass"), - SearchResult(path="/b.py", score=0.8, content="def b(): pass"), - ] - - enhanced = enhance_search_results(results) - - assert len(enhanced) == 2 - assert all(isinstance(r, CodeBlockResult) for r in enhanced) - assert enhanced[0].score == 0.9 - assert enhanced[1].score == 0.8 - - -class TestIntegration: - """Integration tests for code extraction.""" - - def test_full_workflow(self, tmp_path): - """Test complete code extraction workflow.""" - # Create test file - test_file = tmp_path / "calculator.py" - test_file.write_text('''"""Calculator module.""" - -@staticmethod -def add(a: int, b: int) -> int: - """Add two numbers. - - Args: - a: First number - b: Second number - - Returns: - Sum of a and b - """ - return a + b - -class Calculator: - """A simple calculator.""" - - def __init__(self): - self.result = 0 - - def compute(self, operation: str, value: int) -> int: - """Perform computation.""" - if operation == "add": - self.result += value - elif operation == "sub": - self.result -= value - return self.result -''') - - # Simulate search result for 'add' function - result = SearchResult( - path=str(test_file), - score=0.92, - content='''@staticmethod -def add(a: int, b: int) -> int: - """Add two numbers. 
- - Args: - a: First number - b: Second number - - Returns: - Sum of a and b - """ - return a + b''', - symbol_name="add", - symbol_kind="function", - start_line=3, - end_line=14, - ) - - block = CodeBlockResult(result) - - # Test properties - assert block.symbol_name == "add" - assert block.symbol_kind == "function" - assert block.line_range == (3, 14) - - # Test full code - assert "@staticmethod" in block.full_code - assert "def add(" in block.full_code - assert "return a + b" in block.full_code - - # Test summary - summary = block.summary - assert "function" in summary - assert "add" in summary - - # Test format - formatted = block.format(show_line_numbers=True) - assert " 3 |" in formatted or "3 |" in formatted - - print("\n--- Full Code Block ---") - print(block.full_code) - print("\n--- Formatted Output ---") - print(formatted) - print("\n--- Summary ---") - print(summary) diff --git a/codex-lens/tests/test_compare_ccw_smart_search_stage2.py b/codex-lens/tests/test_compare_ccw_smart_search_stage2.py deleted file mode 100644 index 901d1cd9..00000000 --- a/codex-lens/tests/test_compare_ccw_smart_search_stage2.py +++ /dev/null @@ -1,350 +0,0 @@ -from __future__ import annotations - -import importlib.util -import json -import sys -from pathlib import Path -from types import SimpleNamespace - - -MODULE_PATH = Path(__file__).resolve().parents[1] / "benchmarks" / "compare_ccw_smart_search_stage2.py" -MODULE_NAME = "compare_ccw_smart_search_stage2_test_module" -MODULE_SPEC = importlib.util.spec_from_file_location(MODULE_NAME, MODULE_PATH) -assert MODULE_SPEC is not None and MODULE_SPEC.loader is not None -benchmark = importlib.util.module_from_spec(MODULE_SPEC) -sys.modules[MODULE_NAME] = benchmark -MODULE_SPEC.loader.exec_module(benchmark) - - -class _FakeChainResult: - def __init__(self, paths: list[str]) -> None: - self.results = [SimpleNamespace(path=path) for path in paths] - - -class _FakeEngine: - def __init__( - self, - *, - search_paths: list[str] | None = 
None, - cascade_paths: list[str] | None = None, - ) -> None: - self.search_paths = search_paths or [] - self.cascade_paths = cascade_paths or [] - self.search_calls: list[dict[str, object]] = [] - self.cascade_calls: list[dict[str, object]] = [] - - def search(self, query: str, source_path: Path, options: object) -> _FakeChainResult: - self.search_calls.append( - { - "query": query, - "source_path": source_path, - "options": options, - } - ) - return _FakeChainResult(self.search_paths) - - def cascade_search( - self, - query: str, - source_path: Path, - *, - k: int, - coarse_k: int, - options: object, - strategy: str, - ) -> _FakeChainResult: - self.cascade_calls.append( - { - "query": query, - "source_path": source_path, - "k": k, - "coarse_k": coarse_k, - "options": options, - "strategy": strategy, - } - ) - return _FakeChainResult(self.cascade_paths) - - -def test_strategy_specs_include_baselines_before_stage2_modes() -> None: - specs = benchmark._strategy_specs( - ["realtime", "static_global_graph"], - include_dense_baseline=True, - baseline_methods=["auto", "fts", "hybrid"], - ) - - assert [spec.strategy_key for spec in specs] == [ - "auto", - "fts", - "hybrid", - "dense_rerank", - "staged:realtime", - "staged:static_global_graph", - ] - - -def test_select_effective_method_matches_cli_auto_routing() -> None: - assert benchmark._select_effective_method("find_descendant_project_roots", "auto") == "fts" - assert benchmark._select_effective_method("build dist artifact output", "auto") == "fts" - assert benchmark._select_effective_method("embedding backend fastembed local litellm api config", "auto") == "fts" - assert benchmark._select_effective_method("get_reranker factory onnx backend selection", "auto") == "fts" - assert benchmark._select_effective_method("how does the authentication flow work", "auto") == "dense_rerank" - assert benchmark._select_effective_method("how smart_search keyword routing works", "auto") == "hybrid" - - -def 
test_filter_dataset_by_query_match_uses_case_insensitive_substring() -> None: - dataset = [ - {"query": "embedding backend fastembed local litellm api config", "relevant_paths": ["a"]}, - {"query": "get_reranker factory onnx backend selection", "relevant_paths": ["b"]}, - {"query": "how does smart search route keyword queries", "relevant_paths": ["c"]}, - ] - - filtered = benchmark._filter_dataset_by_query_match(dataset, "BACKEND") - assert [item["query"] for item in filtered] == [ - "embedding backend fastembed local litellm api config", - "get_reranker factory onnx backend selection", - ] - - narrow_filtered = benchmark._filter_dataset_by_query_match(dataset, "FASTEMBED") - assert [item["query"] for item in narrow_filtered] == [ - "embedding backend fastembed local litellm api config", - ] - - unfiltered = benchmark._filter_dataset_by_query_match(dataset, None) - assert [item["query"] for item in unfiltered] == [item["query"] for item in dataset] - - -def test_apply_query_limit_runs_after_filtering() -> None: - dataset = [ - {"query": "executeHybridMode dense_rerank semantic smart_search", "relevant_paths": ["a"]}, - {"query": "embedding backend fastembed local litellm api config", "relevant_paths": ["b"]}, - {"query": "reranker backend onnx api legacy configuration", "relevant_paths": ["c"]}, - ] - - filtered = benchmark._filter_dataset_by_query_match(dataset, "backend") - limited = benchmark._apply_query_limit(filtered, 1) - - assert [item["query"] for item in limited] == [ - "embedding backend fastembed local litellm api config", - ] - - -def test_make_progress_payload_reports_partial_completion() -> None: - args = SimpleNamespace( - queries_file=Path("queries.jsonl"), - k=10, - coarse_k=100, - ) - strategy_specs = [ - benchmark.StrategySpec(strategy_key="auto", strategy="auto", stage2_mode=None), - benchmark.StrategySpec(strategy_key="dense_rerank", strategy="dense_rerank", stage2_mode=None), - ] - evaluations = [ - benchmark.QueryEvaluation( - 
query="embedding backend fastembed local litellm api config", - intent="config", - notes=None, - relevant_paths=["codex-lens/src/codexlens/config.py"], - runs={ - "auto": benchmark.StrategyRun( - strategy_key="auto", - strategy="auto", - stage2_mode=None, - effective_method="fts", - execution_method="fts", - latency_ms=123.0, - topk_paths=["config.py"], - first_hit_rank=1, - hit_at_k=True, - recall_at_k=1.0, - generated_artifact_count=0, - test_file_count=0, - error=None, - ) - }, - ) - ] - - payload = benchmark._make_progress_payload( - args=args, - source_root=Path("D:/repo"), - strategy_specs=strategy_specs, - evaluations=evaluations, - query_index=1, - total_queries=3, - run_index=2, - total_runs=6, - current_query="embedding backend fastembed local litellm api config", - current_strategy_key="complete", - ) - - assert payload["status"] == "running" - assert payload["progress"]["completed_queries"] == 1 - assert payload["progress"]["completed_runs"] == 2 - assert payload["progress"]["total_runs"] == 6 - assert payload["strategy_keys"] == ["auto", "dense_rerank"] - assert payload["evaluations"][0]["runs"]["auto"]["effective_method"] == "fts" - - -def test_write_final_outputs_updates_progress_snapshot(tmp_path: Path) -> None: - output_path = tmp_path / "results.json" - progress_path = tmp_path / "progress.json" - payload = { - "status": "completed", - "query_count": 1, - "strategies": {"auto": {"effective_methods": {"fts": 1}}}, - } - - benchmark._write_final_outputs( - output_path=output_path, - progress_output=progress_path, - payload=payload, - ) - - assert json.loads(output_path.read_text(encoding="utf-8")) == payload - assert json.loads(progress_path.read_text(encoding="utf-8")) == payload - - -def test_build_parser_defaults_reranker_gpu_to_disabled() -> None: - parser = benchmark.build_parser() - args = parser.parse_args([]) - - assert args.embedding_use_gpu is False - assert args.reranker_use_gpu is False - assert args.reranker_model == 
benchmark.DEFAULT_LOCAL_ONNX_RERANKER_MODEL - - -def test_build_strategy_runtime_clones_config(monkeypatch, tmp_path: Path) -> None: - class _FakeRegistry: - def __init__(self) -> None: - self.initialized = False - - def initialize(self) -> None: - self.initialized = True - - class _FakeMapper: - pass - - class _FakeEngine: - def __init__(self, *, registry, mapper, config) -> None: - self.registry = registry - self.mapper = mapper - self.config = config - - monkeypatch.setattr(benchmark, "RegistryStore", _FakeRegistry) - monkeypatch.setattr(benchmark, "PathMapper", _FakeMapper) - monkeypatch.setattr(benchmark, "ChainSearchEngine", _FakeEngine) - - base_config = benchmark.Config(data_dir=tmp_path, reranker_use_gpu=False) - strategy_spec = benchmark.StrategySpec(strategy_key="dense_rerank", strategy="dense_rerank", stage2_mode=None) - - runtime = benchmark._build_strategy_runtime(base_config, strategy_spec) - - assert runtime.strategy_spec == strategy_spec - assert runtime.config is not base_config - assert runtime.config.reranker_use_gpu is False - assert runtime.registry.initialized is True - assert runtime.engine.config is runtime.config - - -def test_run_strategy_routes_auto_keyword_queries_to_fts_search() -> None: - engine = _FakeEngine( - search_paths=[ - "D:/repo/src/codexlens/storage/registry.py", - "D:/repo/build/lib/codexlens/storage/registry.py", - ] - ) - config = SimpleNamespace(cascade_strategy="staged", staged_stage2_mode="realtime") - relevant = {benchmark._normalize_path_key("D:/repo/src/codexlens/storage/registry.py")} - - run = benchmark._run_strategy( - engine, - config, - strategy_spec=benchmark.StrategySpec(strategy_key="auto", strategy="auto", stage2_mode=None), - query="find_descendant_project_roots", - source_path=Path("D:/repo"), - k=5, - coarse_k=20, - relevant=relevant, - ) - - assert len(engine.search_calls) == 1 - assert len(engine.cascade_calls) == 0 - assert run.effective_method == "fts" - assert run.execution_method == "fts" - assert 
run.hit_at_k is True - assert run.generated_artifact_count == 1 - assert run.test_file_count == 0 - - -def test_run_strategy_uses_cascade_for_dense_rerank_and_restores_config() -> None: - engine = _FakeEngine(cascade_paths=["D:/repo/src/tools/smart-search.ts"]) - config = SimpleNamespace(cascade_strategy="staged", staged_stage2_mode="static_global_graph") - relevant = {benchmark._normalize_path_key("D:/repo/src/tools/smart-search.ts")} - - run = benchmark._run_strategy( - engine, - config, - strategy_spec=benchmark.StrategySpec( - strategy_key="dense_rerank", - strategy="dense_rerank", - stage2_mode=None, - ), - query="how does smart search route keyword queries", - source_path=Path("D:/repo"), - k=5, - coarse_k=20, - relevant=relevant, - ) - - assert len(engine.search_calls) == 0 - assert len(engine.cascade_calls) == 1 - assert engine.cascade_calls[0]["strategy"] == "dense_rerank" - assert run.effective_method == "dense_rerank" - assert run.execution_method == "cascade" - assert run.hit_at_k is True - assert config.cascade_strategy == "staged" - assert config.staged_stage2_mode == "static_global_graph" - - -def test_summarize_runs_tracks_effective_method_and_artifact_pressure() -> None: - summary = benchmark._summarize_runs( - [ - benchmark.StrategyRun( - strategy_key="auto", - strategy="auto", - stage2_mode=None, - effective_method="fts", - execution_method="fts", - latency_ms=10.0, - topk_paths=["a"], - first_hit_rank=1, - hit_at_k=True, - recall_at_k=1.0, - generated_artifact_count=1, - test_file_count=0, - error=None, - ), - benchmark.StrategyRun( - strategy_key="auto", - strategy="auto", - stage2_mode=None, - effective_method="hybrid", - execution_method="hybrid", - latency_ms=30.0, - topk_paths=["b"], - first_hit_rank=None, - hit_at_k=False, - recall_at_k=0.0, - generated_artifact_count=0, - test_file_count=2, - error=None, - ), - ] - ) - - assert summary["effective_methods"] == {"fts": 1, "hybrid": 1} - assert summary["runs_with_generated_artifacts"] == 1 - 
assert summary["runs_with_test_files"] == 1 - assert summary["avg_generated_artifact_count"] == 0.5 - assert summary["avg_test_file_count"] == 1.0 diff --git a/codex-lens/tests/test_config.py b/codex-lens/tests/test_config.py deleted file mode 100644 index d6acb3fa..00000000 --- a/codex-lens/tests/test_config.py +++ /dev/null @@ -1,555 +0,0 @@ -"""Tests for CodexLens configuration system.""" - -import builtins -import json -import logging -import os -import tempfile -from pathlib import Path - -import pytest - -from codexlens.config import ( - WORKSPACE_DIR_NAME, - Config, - WorkspaceConfig, - _default_global_dir, - find_workspace_root, -) -from codexlens.errors import ConfigError - - -class TestDefaultGlobalDir: - """Tests for _default_global_dir function.""" - - def test_default_location(self): - """Test default location is ~/.codexlens.""" - # Clear any environment override - env_backup = os.environ.get("CODEXLENS_DATA_DIR") - if "CODEXLENS_DATA_DIR" in os.environ: - del os.environ["CODEXLENS_DATA_DIR"] - - try: - result = _default_global_dir() - assert result == (Path.home() / ".codexlens").resolve() - finally: - if env_backup is not None: - os.environ["CODEXLENS_DATA_DIR"] = env_backup - - def test_env_override(self): - """Test CODEXLENS_DATA_DIR environment variable override.""" - with tempfile.TemporaryDirectory() as tmpdir: - os.environ["CODEXLENS_DATA_DIR"] = tmpdir - try: - result = _default_global_dir() - assert result == Path(tmpdir).resolve() - finally: - del os.environ["CODEXLENS_DATA_DIR"] - - -class TestFindWorkspaceRoot: - """Tests for find_workspace_root function.""" - - def test_finds_workspace_in_current_dir(self): - """Test finding workspace when .codexlens is in current directory.""" - with tempfile.TemporaryDirectory() as tmpdir: - base = Path(tmpdir) - (base / WORKSPACE_DIR_NAME).mkdir() - - result = find_workspace_root(base) - assert result == base.resolve() - - def test_finds_workspace_in_parent_dir(self): - """Test finding workspace in 
parent directory.""" - with tempfile.TemporaryDirectory() as tmpdir: - base = Path(tmpdir) - (base / WORKSPACE_DIR_NAME).mkdir() - subdir = base / "src" / "components" - subdir.mkdir(parents=True) - - result = find_workspace_root(subdir) - assert result == base.resolve() - - def test_returns_none_when_not_found(self): - """Test returns None when no workspace found in isolated directory.""" - with tempfile.TemporaryDirectory() as tmpdir: - # Create a deep nested directory to avoid finding user's home .codexlens - isolated = Path(tmpdir) / "a" / "b" / "c" - isolated.mkdir(parents=True) - result = find_workspace_root(isolated) - # May find user's .codexlens if it exists in parent dirs - # So we just check it doesn't find one in our temp directory - if result is not None: - assert WORKSPACE_DIR_NAME not in str(isolated) - - def test_does_not_find_file_as_workspace(self): - """Test that a file named .codexlens is not recognized as workspace.""" - with tempfile.TemporaryDirectory() as tmpdir: - base = Path(tmpdir) - # Create isolated subdirectory - subdir = base / "project" - subdir.mkdir() - (subdir / WORKSPACE_DIR_NAME).write_text("not a directory") - - result = find_workspace_root(subdir) - # Should not find the file as workspace - if result is not None: - assert result != subdir - - -class TestConfig: - """Tests for Config class.""" - - def test_default_config(self): - """Test creating config with defaults.""" - with tempfile.TemporaryDirectory() as tmpdir: - os.environ["CODEXLENS_DATA_DIR"] = tmpdir - try: - config = Config() - assert config.data_dir == Path(tmpdir).resolve() - assert config.venv_path == Path(tmpdir).resolve() / "venv" - finally: - del os.environ["CODEXLENS_DATA_DIR"] - - def test_creates_data_dir(self): - """Test that data_dir is created on init.""" - with tempfile.TemporaryDirectory() as tmpdir: - data_dir = Path(tmpdir) / "new_dir" - config = Config(data_dir=data_dir) - assert data_dir.exists() - - def 
test_post_init_permission_error_includes_path_and_cause(self, monkeypatch): - """PermissionError during __post_init__ should raise ConfigError with context.""" - with tempfile.TemporaryDirectory() as tmpdir: - data_dir = Path(tmpdir) / "blocked" - venv_path = Path(tmpdir) / "venv" - expected_data_dir = data_dir.expanduser().resolve() - - real_mkdir = Path.mkdir - - def guarded_mkdir(self, *args, **kwargs): - if self == expected_data_dir: - raise PermissionError("Permission denied") - return real_mkdir(self, *args, **kwargs) - - monkeypatch.setattr(Path, "mkdir", guarded_mkdir) - - with pytest.raises(ConfigError) as excinfo: - Config(data_dir=data_dir, venv_path=venv_path) - - message = str(excinfo.value) - assert str(expected_data_dir) in message - assert "permission" in message.lower() - assert "PermissionError" in message - assert isinstance(excinfo.value.__cause__, PermissionError) - - def test_post_init_os_error_includes_path_and_cause(self, monkeypatch): - """OSError during __post_init__ should raise ConfigError with context.""" - with tempfile.TemporaryDirectory() as tmpdir: - data_dir = Path(tmpdir) / "invalid" - venv_path = Path(tmpdir) / "venv" - expected_data_dir = data_dir.expanduser().resolve() - - real_mkdir = Path.mkdir - - def guarded_mkdir(self, *args, **kwargs): - if self == expected_data_dir: - raise OSError("Invalid path") - return real_mkdir(self, *args, **kwargs) - - monkeypatch.setattr(Path, "mkdir", guarded_mkdir) - - with pytest.raises(ConfigError) as excinfo: - Config(data_dir=data_dir, venv_path=venv_path) - - message = str(excinfo.value) - assert str(expected_data_dir) in message - assert "permission" not in message.lower() - assert "filesystem" in message.lower() - assert "OSError" in message - assert isinstance(excinfo.value.__cause__, OSError) - - def test_supported_languages(self): - """Test default supported languages.""" - with tempfile.TemporaryDirectory() as tmpdir: - os.environ["CODEXLENS_DATA_DIR"] = tmpdir - try: - config = 
Config() - assert "python" in config.supported_languages - assert "javascript" in config.supported_languages - assert "typescript" in config.supported_languages - assert "java" in config.supported_languages - assert "go" in config.supported_languages - assert "swift" in config.supported_languages - finally: - del os.environ["CODEXLENS_DATA_DIR"] - - def test_language_for_path_swift(self): - """Swift (.swift) files should be recognized as code.""" - with tempfile.TemporaryDirectory() as tmpdir: - config = Config(data_dir=Path(tmpdir)) - assert config.language_for_path("x.swift") == "swift" - assert config.language_for_path("X.SWIFT") == "swift" - assert config.category_for_path("x.swift") == "code" - - def test_cache_dir_property(self): - """Test cache_dir property.""" - with tempfile.TemporaryDirectory() as tmpdir: - config = Config(data_dir=Path(tmpdir)) - assert config.cache_dir == Path(tmpdir).resolve() / "cache" - - def test_index_dir_property(self): - """Test index_dir property.""" - with tempfile.TemporaryDirectory() as tmpdir: - config = Config(data_dir=Path(tmpdir)) - assert config.index_dir == Path(tmpdir).resolve() / "index" - - def test_db_path_property(self): - """Test db_path property.""" - with tempfile.TemporaryDirectory() as tmpdir: - config = Config(data_dir=Path(tmpdir)) - assert config.db_path == Path(tmpdir).resolve() / "index" / "codexlens.db" - - def test_ensure_runtime_dirs(self): - """Test ensure_runtime_dirs creates directories.""" - with tempfile.TemporaryDirectory() as tmpdir: - config = Config(data_dir=Path(tmpdir)) - config.ensure_runtime_dirs() - assert config.cache_dir.exists() - assert config.index_dir.exists() - - def test_ensure_runtime_dirs_permission_error_includes_path_and_cause(self, monkeypatch): - """PermissionError during ensure_runtime_dirs should raise ConfigError with context.""" - with tempfile.TemporaryDirectory() as tmpdir: - config = Config(data_dir=Path(tmpdir)) - target_dir = config.cache_dir - - real_mkdir = 
Path.mkdir - - def guarded_mkdir(self, *args, **kwargs): - if self == target_dir: - raise PermissionError("Permission denied") - return real_mkdir(self, *args, **kwargs) - - monkeypatch.setattr(Path, "mkdir", guarded_mkdir) - - with pytest.raises(ConfigError) as excinfo: - config.ensure_runtime_dirs() - - message = str(excinfo.value) - assert str(target_dir) in message - assert "permission" in message.lower() - assert "PermissionError" in message - assert isinstance(excinfo.value.__cause__, PermissionError) - - def test_ensure_runtime_dirs_os_error_includes_path_and_cause(self, monkeypatch): - """OSError during ensure_runtime_dirs should raise ConfigError with context.""" - with tempfile.TemporaryDirectory() as tmpdir: - config = Config(data_dir=Path(tmpdir)) - target_dir = config.cache_dir - - real_mkdir = Path.mkdir - - def guarded_mkdir(self, *args, **kwargs): - if self == target_dir: - raise OSError("Invalid path") - return real_mkdir(self, *args, **kwargs) - - monkeypatch.setattr(Path, "mkdir", guarded_mkdir) - - with pytest.raises(ConfigError) as excinfo: - config.ensure_runtime_dirs() - - message = str(excinfo.value) - assert str(target_dir) in message - assert "permission" not in message.lower() - assert "filesystem" in message.lower() - assert "OSError" in message - assert isinstance(excinfo.value.__cause__, OSError) - - def test_language_for_path_python(self): - """Test language detection for Python files.""" - with tempfile.TemporaryDirectory() as tmpdir: - os.environ["CODEXLENS_DATA_DIR"] = tmpdir - try: - config = Config() - assert config.language_for_path("test.py") == "python" - assert config.language_for_path("/path/to/file.py") == "python" - finally: - del os.environ["CODEXLENS_DATA_DIR"] - - def test_language_for_path_javascript(self): - """Test language detection for JavaScript files.""" - with tempfile.TemporaryDirectory() as tmpdir: - os.environ["CODEXLENS_DATA_DIR"] = tmpdir - try: - config = Config() - assert 
config.language_for_path("test.js") == "javascript" - assert config.language_for_path("component.jsx") == "javascript" - finally: - del os.environ["CODEXLENS_DATA_DIR"] - - def test_language_for_path_typescript(self): - """Test language detection for TypeScript files.""" - with tempfile.TemporaryDirectory() as tmpdir: - os.environ["CODEXLENS_DATA_DIR"] = tmpdir - try: - config = Config() - assert config.language_for_path("test.ts") == "typescript" - assert config.language_for_path("component.tsx") == "typescript" - finally: - del os.environ["CODEXLENS_DATA_DIR"] - - def test_language_for_path_unknown(self): - """Test language detection for unknown files.""" - with tempfile.TemporaryDirectory() as tmpdir: - os.environ["CODEXLENS_DATA_DIR"] = tmpdir - try: - config = Config() - assert config.language_for_path("test.xyz") is None - assert config.language_for_path("data.csv") is None - finally: - del os.environ["CODEXLENS_DATA_DIR"] - - def test_language_for_path_case_insensitive(self): - """Test language detection is case insensitive.""" - with tempfile.TemporaryDirectory() as tmpdir: - os.environ["CODEXLENS_DATA_DIR"] = tmpdir - try: - config = Config() - assert config.language_for_path("TEST.PY") == "python" - assert config.language_for_path("File.Js") == "javascript" - finally: - del os.environ["CODEXLENS_DATA_DIR"] - - def test_rules_for_language(self): - """Test getting parsing rules for a language.""" - with tempfile.TemporaryDirectory() as tmpdir: - os.environ["CODEXLENS_DATA_DIR"] = tmpdir - try: - config = Config() - rules = config.rules_for_language("python") - assert "max_chunk_chars" in rules - assert "max_chunk_lines" in rules - assert "overlap_lines" in rules - finally: - del os.environ["CODEXLENS_DATA_DIR"] - - -class TestConfigLoadSettings: - """Tests for Config.load_settings behavior and logging.""" - - def test_load_settings_logs_warning_on_malformed_json(self, caplog): - """Malformed JSON in settings file should trigger warning log.""" - with 
tempfile.TemporaryDirectory() as tmpdir: - config = Config(data_dir=Path(tmpdir)) - config.settings_path.write_text("{", encoding="utf-8") - - with caplog.at_level(logging.WARNING): - config.load_settings() - - records = [r for r in caplog.records if r.name == "codexlens.config"] - assert any("Failed to load settings from" in r.message for r in records) - assert any("JSONDecodeError" in r.message for r in records) - assert any(str(config.settings_path) in r.message for r in records) - - def test_load_settings_logs_warning_on_permission_error(self, monkeypatch, caplog): - """Permission errors opening settings file should trigger warning log.""" - with tempfile.TemporaryDirectory() as tmpdir: - config = Config(data_dir=Path(tmpdir)) - config.settings_path.write_text("{}", encoding="utf-8") - - real_open = builtins.open - - def guarded_open(path, mode="r", *args, **kwargs): - if Path(path) == config.settings_path and "r" in mode: - raise PermissionError("Permission denied") - return real_open(path, mode, *args, **kwargs) - - monkeypatch.setattr(builtins, "open", guarded_open) - - with caplog.at_level(logging.WARNING): - config.load_settings() - - records = [r for r in caplog.records if r.name == "codexlens.config"] - assert any("Failed to load settings from" in r.message for r in records) - assert any("PermissionError" in r.message for r in records) - - def test_load_settings_loads_valid_settings_without_warning(self, caplog): - """Valid settings should load without warning logs.""" - with tempfile.TemporaryDirectory() as tmpdir: - config = Config(data_dir=Path(tmpdir)) - config.settings_path.write_text( - json.dumps( - { - "embedding": { - "backend": "fastembed", - "model": "multilingual", - "use_gpu": False, - }, - "llm": { - "enabled": True, - "tool": "gemini", - "timeout_ms": 1234, - "batch_size": 7, - }, - } - ), - encoding="utf-8", - ) - - with caplog.at_level(logging.WARNING): - config.load_settings() - - records = [r for r in caplog.records if r.name == 
"codexlens.config"] - assert not records - assert config.embedding_backend == "fastembed" - assert config.embedding_model == "multilingual" - assert config.embedding_use_gpu is False - assert config.llm_enabled is True - assert config.llm_tool == "gemini" - assert config.llm_timeout_ms == 1234 - assert config.llm_batch_size == 7 - - def test_load_settings_logs_warning_on_invalid_embedding_backend(self, caplog): - """Invalid embedding backend should trigger warning log and keep default.""" - with tempfile.TemporaryDirectory() as tmpdir: - config = Config(data_dir=Path(tmpdir)) - default_backend = config.embedding_backend - config.settings_path.write_text( - json.dumps({"embedding": {"backend": "invalid-backend"}}), - encoding="utf-8", - ) - - with caplog.at_level(logging.WARNING): - config.load_settings() - - records = [r for r in caplog.records if r.name == "codexlens.config"] - assert any("Invalid embedding backend in" in r.message for r in records) - assert config.embedding_backend == default_backend - - -class TestWorkspaceConfig: - """Tests for WorkspaceConfig class.""" - - def test_create_workspace_config(self): - """Test creating a workspace config.""" - with tempfile.TemporaryDirectory() as tmpdir: - workspace = WorkspaceConfig(workspace_root=Path(tmpdir)) - assert workspace.workspace_root == Path(tmpdir).resolve() - - def test_codexlens_dir_property(self): - """Test codexlens_dir property.""" - with tempfile.TemporaryDirectory() as tmpdir: - workspace = WorkspaceConfig(workspace_root=Path(tmpdir)) - assert workspace.codexlens_dir == Path(tmpdir).resolve() / WORKSPACE_DIR_NAME - - def test_db_path_property(self): - """Test db_path property.""" - with tempfile.TemporaryDirectory() as tmpdir: - workspace = WorkspaceConfig(workspace_root=Path(tmpdir)) - expected = Path(tmpdir).resolve() / WORKSPACE_DIR_NAME / "index.db" - assert workspace.db_path == expected - - def test_cache_dir_property(self): - """Test cache_dir property.""" - with 
tempfile.TemporaryDirectory() as tmpdir: - workspace = WorkspaceConfig(workspace_root=Path(tmpdir)) - expected = Path(tmpdir).resolve() / WORKSPACE_DIR_NAME / "cache" - assert workspace.cache_dir == expected - - def test_initialize_creates_directories(self): - """Test initialize creates .codexlens directory structure.""" - with tempfile.TemporaryDirectory() as tmpdir: - workspace = WorkspaceConfig(workspace_root=Path(tmpdir)) - workspace.initialize() - - assert workspace.codexlens_dir.exists() - assert workspace.cache_dir.exists() - assert (workspace.codexlens_dir / ".gitignore").exists() - - def test_initialize_creates_gitignore(self): - """Test initialize creates .gitignore with correct content.""" - with tempfile.TemporaryDirectory() as tmpdir: - workspace = WorkspaceConfig(workspace_root=Path(tmpdir)) - workspace.initialize() - - gitignore = workspace.codexlens_dir / ".gitignore" - content = gitignore.read_text() - assert "cache/" in content - - def test_exists_false_when_not_initialized(self): - """Test exists returns False when not initialized.""" - with tempfile.TemporaryDirectory() as tmpdir: - workspace = WorkspaceConfig(workspace_root=Path(tmpdir)) - assert not workspace.exists() - - def test_exists_true_when_initialized_with_db(self): - """Test exists returns True when initialized with db.""" - with tempfile.TemporaryDirectory() as tmpdir: - workspace = WorkspaceConfig(workspace_root=Path(tmpdir)) - workspace.initialize() - # Create the db file to simulate full initialization - workspace.db_path.write_text("") - assert workspace.exists() - - def test_from_path_finds_workspace(self): - """Test from_path finds existing workspace.""" - with tempfile.TemporaryDirectory() as tmpdir: - base = Path(tmpdir) - (base / WORKSPACE_DIR_NAME).mkdir() - - workspace = WorkspaceConfig.from_path(base) - assert workspace is not None - assert workspace.workspace_root == base.resolve() - - def test_from_path_returns_none_when_not_found(self): - """Test from_path returns None 
when no workspace found in isolated directory.""" - with tempfile.TemporaryDirectory() as tmpdir: - # Create isolated directory structure to avoid user's .codexlens - isolated = Path(tmpdir) / "a" / "b" / "c" - isolated.mkdir(parents=True) - workspace = WorkspaceConfig.from_path(isolated) - # May find user's .codexlens if it exists - if workspace is not None: - assert WORKSPACE_DIR_NAME not in str(isolated) - - def test_create_at_initializes_workspace(self): - """Test create_at creates and initializes workspace.""" - with tempfile.TemporaryDirectory() as tmpdir: - workspace = WorkspaceConfig.create_at(Path(tmpdir)) - assert workspace.codexlens_dir.exists() - assert workspace.cache_dir.exists() - - -class TestConfigEdgeCases: - """Edge case tests for configuration.""" - - def test_config_with_path_object(self): - """Test Config accepts Path objects.""" - with tempfile.TemporaryDirectory() as tmpdir: - config = Config(data_dir=Path(tmpdir)) - assert isinstance(config.data_dir, Path) - - def test_config_expands_user_path(self): - """Test Config expands ~ in paths.""" - with tempfile.TemporaryDirectory() as tmpdir: - os.environ["CODEXLENS_DATA_DIR"] = tmpdir - try: - config = Config() - # Just verify it doesn't crash and returns a resolved path - assert config.data_dir.is_absolute() - finally: - del os.environ["CODEXLENS_DATA_DIR"] - - def test_workspace_config_from_subdir(self): - """Test WorkspaceConfig.from_path works from subdirectory.""" - with tempfile.TemporaryDirectory() as tmpdir: - base = Path(tmpdir) - (base / WORKSPACE_DIR_NAME).mkdir() - deep_subdir = base / "a" / "b" / "c" / "d" - deep_subdir.mkdir(parents=True) - - workspace = WorkspaceConfig.from_path(deep_subdir) - assert workspace is not None - assert workspace.workspace_root == base.resolve() diff --git a/codex-lens/tests/test_config_cascade.py b/codex-lens/tests/test_config_cascade.py deleted file mode 100644 index d1c690e0..00000000 --- a/codex-lens/tests/test_config_cascade.py +++ /dev/null @@ 
-1,155 +0,0 @@ -"""Unit tests for Config cascade settings validation. - -Tests cover: -- Default cascade_strategy value -- Valid cascade strategies accepted by load_settings -- Invalid cascade strategy fallback behavior -- Staged cascade config defaults -""" - -from __future__ import annotations - -import json -import tempfile -from pathlib import Path -from unittest.mock import patch - -import pytest - -from codexlens.config import Config - - -# ============================================================================= -# Test Fixtures -# ============================================================================= - - -@pytest.fixture -def temp_config_dir(): - """Create temporary directory for config data_dir.""" - tmpdir = tempfile.TemporaryDirectory(ignore_cleanup_errors=True) - yield Path(tmpdir.name) - try: - tmpdir.cleanup() - except (PermissionError, OSError): - pass - - -# ============================================================================= -# Tests: cascade config defaults -# ============================================================================= - - -class TestConfigCascadeDefaults: - """Tests for Config cascade-related defaults and load_settings().""" - - def test_default_cascade_strategy(self, temp_config_dir): - """Default cascade_strategy should be 'binary'.""" - config = Config(data_dir=temp_config_dir) - assert config.cascade_strategy == "binary" - - def test_valid_cascade_strategies(self, temp_config_dir): - """load_settings should accept all valid cascade strategies.""" - valid_strategies = ["binary", "binary_rerank", "dense_rerank", "staged"] - - for strategy in valid_strategies: - config = Config(data_dir=temp_config_dir) - settings = {"cascade": {"strategy": strategy}} - - settings_path = config.settings_path - settings_path.parent.mkdir(parents=True, exist_ok=True) - with open(settings_path, "w", encoding="utf-8") as f: - json.dump(settings, f) - - with patch.object(config, "_apply_env_overrides"): - config.load_settings() - 
- assert config.cascade_strategy == strategy, ( - f"Strategy '{strategy}' should be accepted" - ) - - def test_invalid_cascade_strategy_fallback(self, temp_config_dir): - """Invalid cascade strategy should keep default (not crash).""" - config = Config(data_dir=temp_config_dir) - settings = {"cascade": {"strategy": "invalid_strategy"}} - - settings_path = config.settings_path - settings_path.parent.mkdir(parents=True, exist_ok=True) - with open(settings_path, "w", encoding="utf-8") as f: - json.dump(settings, f) - - with patch.object(config, "_apply_env_overrides"): - config.load_settings() - - # Should keep the default "binary" strategy - assert config.cascade_strategy == "binary" - - def test_hybrid_cascade_strategy_alias_maps_to_binary_rerank(self, temp_config_dir): - """Hybrid is a backward-compat alias for binary_rerank.""" - config = Config(data_dir=temp_config_dir) - settings = {"cascade": {"strategy": "hybrid"}} - - settings_path = config.settings_path - settings_path.parent.mkdir(parents=True, exist_ok=True) - with open(settings_path, "w", encoding="utf-8") as f: - json.dump(settings, f) - - with patch.object(config, "_apply_env_overrides"): - config.load_settings() - - assert config.cascade_strategy == "binary_rerank" - - def test_staged_config_defaults(self, temp_config_dir): - """Staged cascade settings should have correct defaults.""" - config = Config(data_dir=temp_config_dir) - assert config.staged_coarse_k == 200 - assert config.staged_lsp_depth == 2 - assert config.staged_stage2_mode == "precomputed" - assert config.staged_clustering_strategy == "auto" - assert config.staged_clustering_min_size == 3 - assert config.enable_staged_rerank is True - assert config.cascade_coarse_k == 100 - assert config.cascade_fine_k == 10 - - def test_staged_settings_load_from_settings_json(self, temp_config_dir): - """load_settings should load staged.* settings when present.""" - config = Config(data_dir=temp_config_dir) - settings = { - "staged": { - "coarse_k": 
250, - "lsp_depth": 3, - "stage2_mode": "static_global_graph", - "realtime_lsp_timeout_s": 11.0, - "realtime_lsp_depth": 2, - "realtime_lsp_max_nodes": 42, - "realtime_lsp_max_seeds": 2, - "realtime_lsp_max_concurrent": 4, - "realtime_lsp_warmup_s": 0.5, - "realtime_lsp_resolve_symbols": True, - "clustering_strategy": "path", - "clustering_min_size": 7, - "enable_rerank": False, - } - } - - settings_path = config.settings_path - settings_path.parent.mkdir(parents=True, exist_ok=True) - with open(settings_path, "w", encoding="utf-8") as f: - json.dump(settings, f) - - with patch.object(config, "_apply_env_overrides"): - config.load_settings() - - assert config.staged_coarse_k == 250 - assert config.staged_lsp_depth == 3 - assert config.staged_stage2_mode == "static_global_graph" - assert config.staged_realtime_lsp_timeout_s == 11.0 - assert config.staged_realtime_lsp_depth == 2 - assert config.staged_realtime_lsp_max_nodes == 42 - assert config.staged_realtime_lsp_max_seeds == 2 - assert config.staged_realtime_lsp_max_concurrent == 4 - assert config.staged_realtime_lsp_warmup_s == 0.5 - assert config.staged_realtime_lsp_resolve_symbols is True - assert config.staged_clustering_strategy == "path" - assert config.staged_clustering_min_size == 7 - assert config.enable_staged_rerank is False diff --git a/codex-lens/tests/test_config_ignore_patterns.py b/codex-lens/tests/test_config_ignore_patterns.py deleted file mode 100644 index 5cc356a4..00000000 --- a/codex-lens/tests/test_config_ignore_patterns.py +++ /dev/null @@ -1,25 +0,0 @@ -from __future__ import annotations - -import json -from pathlib import Path - -from codexlens.config import Config - - -def test_load_settings_reads_ignore_patterns_and_extension_filters(tmp_path: Path) -> None: - settings_path = tmp_path / "settings.json" - settings_path.write_text( - json.dumps( - { - "ignore_patterns": ["frontend/dist", "coverage"], - "extension_filters": ["*.min.js", "*.map"], - } - ), - encoding="utf-8", - ) - - config 
= Config(data_dir=tmp_path) - config.load_settings() - - assert config.ignore_patterns == ["frontend/dist", "coverage"] - assert config.extension_filters == ["*.min.js", "*.map"] diff --git a/codex-lens/tests/test_config_search_env_overrides.py b/codex-lens/tests/test_config_search_env_overrides.py deleted file mode 100644 index f49d2880..00000000 --- a/codex-lens/tests/test_config_search_env_overrides.py +++ /dev/null @@ -1,83 +0,0 @@ -"""Unit tests for Config .env overrides for final search ranking penalties.""" - -from __future__ import annotations - -import tempfile -from pathlib import Path - -import pytest - -from codexlens.config import Config - - -@pytest.fixture -def temp_config_dir() -> Path: - """Create temporary directory for config data_dir.""" - tmpdir = tempfile.TemporaryDirectory(ignore_cleanup_errors=True) - yield Path(tmpdir.name) - try: - tmpdir.cleanup() - except (PermissionError, OSError): - pass - - -def test_search_penalty_env_overrides_apply(temp_config_dir: Path) -> None: - config = Config(data_dir=temp_config_dir) - - env_path = temp_config_dir / ".env" - env_path.write_text( - "\n".join( - [ - "TEST_FILE_PENALTY=0.25", - "GENERATED_FILE_PENALTY=0.4", - "", - ] - ), - encoding="utf-8", - ) - - config.load_settings() - - assert config.test_file_penalty == 0.25 - assert config.generated_file_penalty == 0.4 - - -def test_reranker_gpu_env_override_apply(temp_config_dir: Path) -> None: - config = Config(data_dir=temp_config_dir) - - env_path = temp_config_dir / ".env" - env_path.write_text( - "\n".join( - [ - "RERANKER_USE_GPU=false", - "", - ] - ), - encoding="utf-8", - ) - - config.load_settings() - - assert config.reranker_use_gpu is False - - -def test_search_penalty_env_overrides_invalid_ignored(temp_config_dir: Path) -> None: - config = Config(data_dir=temp_config_dir) - - env_path = temp_config_dir / ".env" - env_path.write_text( - "\n".join( - [ - "TEST_FILE_PENALTY=oops", - "GENERATED_FILE_PENALTY=nope", - "", - ] - ), - 
encoding="utf-8", - ) - - config.load_settings() - - assert config.test_file_penalty == 0.15 - assert config.generated_file_penalty == 0.35 - assert config.reranker_use_gpu is True diff --git a/codex-lens/tests/test_config_staged_env_overrides.py b/codex-lens/tests/test_config_staged_env_overrides.py deleted file mode 100644 index cfe9555a..00000000 --- a/codex-lens/tests/test_config_staged_env_overrides.py +++ /dev/null @@ -1,136 +0,0 @@ -"""Unit tests for Config .env overrides for staged/cascade settings.""" - -from __future__ import annotations - -import tempfile -from pathlib import Path - -import pytest - -from codexlens.config import Config - - -@pytest.fixture -def temp_config_dir() -> Path: - """Create temporary directory for config data_dir.""" - tmpdir = tempfile.TemporaryDirectory(ignore_cleanup_errors=True) - yield Path(tmpdir.name) - try: - tmpdir.cleanup() - except (PermissionError, OSError): - pass - - -def test_staged_env_overrides_apply(temp_config_dir: Path) -> None: - config = Config(data_dir=temp_config_dir) - - env_path = temp_config_dir / ".env" - env_path.write_text( - "\n".join( - [ - "ENABLE_CASCADE_SEARCH=true", - "CASCADE_STRATEGY=staged", - "CASCADE_COARSE_K=111", - "CASCADE_FINE_K=7", - "STAGED_STAGE2_MODE=realtime", - "STAGED_CLUSTERING_STRATEGY=path", - "STAGED_CLUSTERING_MIN_SIZE=5", - "ENABLE_STAGED_RERANK=false", - "STAGED_REALTIME_LSP_TIMEOUT_S=12.5", - "STAGED_REALTIME_LSP_DEPTH=2", - "STAGED_REALTIME_LSP_MAX_NODES=123", - "STAGED_REALTIME_LSP_MAX_SEEDS=3", - "STAGED_REALTIME_LSP_MAX_CONCURRENT=4", - "STAGED_REALTIME_LSP_WARMUP_S=0.25", - "STAGED_REALTIME_LSP_RESOLVE_SYMBOLS=yes", - "", - ] - ), - encoding="utf-8", - ) - - config.load_settings() - - assert config.enable_cascade_search is True - assert config.cascade_strategy == "staged" - assert config.cascade_coarse_k == 111 - assert config.cascade_fine_k == 7 - - assert config.staged_stage2_mode == "realtime" - assert config.staged_clustering_strategy == "path" - assert 
config.staged_clustering_min_size == 5 - assert config.enable_staged_rerank is False - assert config.staged_realtime_lsp_timeout_s == 12.5 - assert config.staged_realtime_lsp_depth == 2 - assert config.staged_realtime_lsp_max_nodes == 123 - assert config.staged_realtime_lsp_max_seeds == 3 - assert config.staged_realtime_lsp_max_concurrent == 4 - assert config.staged_realtime_lsp_warmup_s == 0.25 - assert config.staged_realtime_lsp_resolve_symbols is True - - -def test_staged_env_overrides_prefixed_wins(temp_config_dir: Path) -> None: - config = Config(data_dir=temp_config_dir) - - env_path = temp_config_dir / ".env" - env_path.write_text( - "\n".join( - [ - "STAGED_CLUSTERING_STRATEGY=score", - "CODEXLENS_STAGED_CLUSTERING_STRATEGY=path", - "STAGED_STAGE2_MODE=precomputed", - "CODEXLENS_STAGED_STAGE2_MODE=realtime", - "", - ] - ), - encoding="utf-8", - ) - - config.load_settings() - - assert config.staged_clustering_strategy == "path" - assert config.staged_stage2_mode == "realtime" - - -def test_staged_env_overrides_invalid_ignored(temp_config_dir: Path) -> None: - config = Config(data_dir=temp_config_dir) - - env_path = temp_config_dir / ".env" - env_path.write_text( - "\n".join( - [ - "STAGED_STAGE2_MODE=bogus", - "STAGED_CLUSTERING_STRATEGY=embedding_remote", - "STAGED_REALTIME_LSP_TIMEOUT_S=nope", - "CASCADE_STRATEGY=???", - "", - ] - ), - encoding="utf-8", - ) - - config.load_settings() - - assert config.cascade_strategy == "binary" - assert config.staged_stage2_mode == "precomputed" - assert config.staged_clustering_strategy == "auto" - assert config.staged_realtime_lsp_timeout_s == 30.0 - - -def test_cascade_strategy_hybrid_alias_env_override(temp_config_dir: Path) -> None: - config = Config(data_dir=temp_config_dir) - - env_path = temp_config_dir / ".env" - env_path.write_text( - "\n".join( - [ - "CASCADE_STRATEGY=hybrid", - "", - ] - ), - encoding="utf-8", - ) - - config.load_settings() - - assert config.cascade_strategy == "binary_rerank" diff --git 
a/codex-lens/tests/test_deepwiki_store.py b/codex-lens/tests/test_deepwiki_store.py deleted file mode 100644 index 1d58a8bb..00000000 --- a/codex-lens/tests/test_deepwiki_store.py +++ /dev/null @@ -1,410 +0,0 @@ -"""Unit tests for DeepWikiStore.""" - -from __future__ import annotations - -import hashlib -import tempfile -from datetime import datetime -from pathlib import Path - -import pytest - -from codexlens.storage.deepwiki_store import DeepWikiStore -from codexlens.storage.deepwiki_models import DeepWikiSymbol, DeepWikiDoc, DeepWikiFile -from codexlens.errors import StorageError - - -from codexlens.storage.deepwiki_store import DeepWikiStore - - -from codexlens.storage.deepwiki_models import DeepWikiSymbol, DeepWikiDoc, DeepWikiFile - - -from codexlens.errors import StorageError - - -import pytest - - -from codexlens.storage.deepwiki_store import DeepWikiStore -from codexlens.storage.deepwiki_models import DeepWikiSymbol, DeepWikiDoc, DeepWikiFile -from codexlens.errors import StorageError - -from pathlib import Path -import tempfile - - -from datetime import datetime - - -from codexlens.storage.deepwiki_store import DeepWikiStore -from codexlens.storage.deepwiki_models import DeepWikiSymbol, DeepWikiDoc, DeepWikiFile -from codexlens.errors import StorageError - - -import os - -@pytest.fixture -def temp_db_path(tmp_path): - """Create a temporary database file.""" - db_file = tmp_path / "deepwiki_test.db" - return str(db_file) - - return DeepWikiStore(db_path=db_file) - - - def test_initialize_creates_schema(self): - store = DeepWikiStore(db_path=db_file) - assert Path.exists(db_file) - assert store.db_path == to str(db_file) - with store: - conn = store._get_connection() - - # Check schema was created - cursor = conn.execute( - "SELECT name FROM sqlite_master WHERE type='table' AND name='deepwiki_files'" - ).fetchone() - assert cursor is not None - - cursor = conn.execute( - "SELECT name FROM sqlite_master WHERE type='table' AND name='deepwiki_docs'" - 
).fetchone() - assert cursor is not None - - cursor = conn.execute( - "SELECT name FROM sqlite_master WHERE type='table' AND name='deepwiki_symbols'" - ).fetchone() - assert cursor is not None - - # Check deepwiki_schema table - cursor = conn.execute( - "SELECT name FROM sqlite_master WHERE type='table' AND name='deepwiki_schema'" - ).fetchone() - assert cursor is not None - - # Verify version was inserted - row = cursor.execute( - "SELECT version FROM deepwiki_schema" - ).fetchone() - assert row is not None - assert row["version"] == 1 - - # Check deepwiki_files table - cursor = conn.execute( - "PRAGMA table_info(deepwiki_files)" - ).fetchall() - expected_columns = {"id", "path", "content_hash", "last_indexed", "symbols_count", "docs_generated"} - assert expected_columns == {"id", "path", "content_hash", "last_indexed", "symbols_count", "docs_generated"} - assert len(expected_columns) == 4 - - # Check deepwiki_docs table - cursor = conn.execute( - "PRAGMA table_info(deepwiki_docs)" - ).fetchall() - expected_columns = {"id", "path", "content_hash", "symbols", "generated_at", "llm_tool"} - assert len(expected_columns) == 6 - - # Check deepwiki_symbols table - cursor = conn.execute( - "PRAGMA table_info(deepwiki_symbols)" - ).fetchall() - expected_columns == { - "id", - "name", - "type", - "source_file", - "doc_file", - "anchor", - "start_line", - "end_line", - "created_at", - "updated_at", - } - assert len(expected_columns) == 12 - - # Check indexes - for idx_name in ["idx_deepwiki_files_path", "idx_deepwiki_files_hash", - "idx_deepwiki_docs_path", "idx_deepwiki_symbols_name", - "idx_deepwiki_symbols_source", "idx_deepwiki_symbols_doc"]: - assert cursor is not None - - def test_add_file(self, temp_db_path): - """Test add_file creates a file record.""" - store = DeepWikiStore(db_path=db_file) - test_file = tmp_path / "test_file.py" - content = "test file content" - store.add_file(test_file) - - # Verify file was added - retrieved_file = store.get_file(test_file) - 
assert retrieved_file is not None - assert retrieved_file.path == str(test_file) - assert retrieved_file.content_hash == content_hash - assert retrieved_file.symbols_count == 1 - assert retrieved_file.docs_generated is False - - # Verify last_indexed - assert retrieved_file.last_indexed is not None - assert isinstance(retrieved_file.last_indexed, datetime) - - - # Verify symbols_count was updated - assert retrieved_file.symbols_count == 1 - - def test_get_file_hash(self, temp_db_path): - """Test get_file_hash returns correct hash.""" - test_file = tmp_path / "test_hash.py" - content_hash = store.compute_file_hash(test_file) - - # File not in DB yet - retrieved_hash = store.get_file_hash(test_file) - assert retrieved_hash is None - - # Create the test file - test_file2 = tmp_path / "test_file2.py" - test_file2.write_text("test file 2") - store.add_file(test_file2) - - # Now get_file_hash should work - retrieved_hash2 = store.get_file_hash(test_file2) - assert retrieved_hash2 is not None - assert retrieved_hash2 == content_hash - - # Verify get_file_hash returns None for unknown file - unknown_file = tmp_path / "unknown_file.txt" - retrieved_hash = store.get_file_hash(unknown_file) - assert retrieved_hash is None - - def test_get_symbols_for_file(self, temp_db_path): - """Test get_symbols_for_file returns symbols for a source file.""" - test_file = tmp_path / "test_source.py" - content = """Test source file with multiple symbols.""" -def test(source_file: str) -> Path: - return Path(source_file) - - # Create test file with multiple symbols - store.add_file(test_file) - for i in range(3): - symbols_data.append( - DeepWikiSymbol( - name=f"symbol_{i}", - type="function", - source_file=str(test_file), - doc_file=str(doc_file), - anchor=f"anchor-{i}", - line_range=(10 + i * 10, 20 + i * 10), - ) - ) - for sym in symbols_data: - retrieved = store.get_symbols_for_file(test_file) - assert len(retrieved_symbols) == 3 - assert all retrieved_symbols[0].source_file == 
str(test_file) - assert retrieved_symbols[0].line_range == (10, 20) - assert retrieved_symbols[0].doc_file == str(doc_file) - - # Verify first symbol has correct line_range - symbol = retrieved_symbols[0] - assert isinstance(symbol.line_range, tuple) - assert symbol.line_range[0] == 10 - assert symbol.line_range[1] == 20 - - # Verify get_file returns None for unknown file - retrieved_file = store.get_file(str(tmp_path / "nonexistent.py")) - assert retrieved_file is None - - def test_update_file_hash(self, temp_db_path): - """Test update_file_hash updates the hash for a tracked file.""" - test_file = tmp_path / "test_source.py" - content = """Test source file for update_file_hash.""" -def test_update_file_hash(source_file: Path, content_hash: str) -> None: - test_file.write_text("test file content") - store.add_file(test_file) - content_hash = store.compute_file_hash(test_file) - - # Update the hash - store.update_file_hash(test_file, content_hash) - - # Verify hash was updated - retrieved_hash = store.get_file_hash(test_file) - assert retrieved_hash == content_hash - - # Verify update with unchanged hash does nothing - store.update_file_hash(test_file, content_hash) - retrieved_hash2 = store.get_file_hash(test_file) - assert retrieved_hash == content_hash - - def test_remove_file(self, temp_db_path): - """Test remove_file removes file and associated symbols.""" - test_file = tmp_path / "test_source.py" - content = """Test source file for remove_file.""" - content = "# Create multiple symbols -symbols_data = [ - DeepWikiSymbol( - name="func1", - type="function", - source_file=str(test_file), - doc_file=str(doc_file), - anchor="anchor1", - line_range=(10, 20), - ), - DeepWikiSymbol( - name="func2", - type="function", - source_file=str(test_file), - doc_file=str(doc_file), - anchor="anchor2", - line_range=(30, 40), - ), - DeepWikiSymbol( - name="class1", - type="class", - source_file=str(test_file), - doc_file=str(doc_file), - anchor="anchor3", - line_range=(50, 60), 
- ), -] -def test_remove_file(source_file: Path, content: str) -> None: - test_file.write_text("test file content") - content_hash = store.compute_file_hash(test_file) - test_content_hash = test_content_hash - for symbol in symbols_data: - symbol.content_hash = test_content_hash - assert symbol.content_hash == content_hash - - # Add file to store - store.add_file(test_file) - symbols_data.append(symbol) - - # Add symbols - for symbol in symbols_data: - store.add_symbol(symbol) - - # Verify symbols were added - retrieved_symbols = store.get_symbols_for_file(test_file) - assert len(retrieved_symbols) == 3 - - # Verify first symbol - assert retrieved_symbols[0].name == "func1" - assert retrieved_symbols[0].type == "function" - assert retrieved_symbols[0].source_file == str(test_file) - assert retrieved_symbols[0].doc_file == str(doc_file) - assert retrieved_symbols[0].anchor == "anchor1" - assert retrieved_symbols[0].line_range == (10, 20) - - # Verify second symbol - assert retrieved_symbols[1].name == "func2" - assert retrieved_symbols[1].type == "function" - assert retrieved_symbols[1].source_file == str(test_file) - assert retrieved_symbols[1].doc_file == str(doc_file) - assert retrieved_symbols[1].anchor == "anchor2" - assert retrieved_symbols[1].line_range == (30, 40) - - # Verify third symbol - assert retrieved_symbols[2].name == "class1" - assert retrieved_symbols[2].type == "class" - assert retrieved_symbols[2].source_file == str(test_file) - assert retrieved_symbols[2].doc_file == str(doc_file) - assert retrieved_symbols[2].anchor == "anchor3" - assert retrieved_symbols[2].line_range == (50, 60) - - - # Verify remove_file deleted file and symbols - assert store.remove_file(test_file) is True - - # Verify symbols were deleted - remaining_symbols = store.get_symbols_for_file(test_file) - assert len(remaining_symbols) == 0 - - # Verify file was removed from database - with store: - conn = store._get_connection() - cursor = conn.execute( - "SELECT * FROM 
deepwiki_files WHERE path=?", - (str(test_file),) - ).fetchone() - assert cursor.fetchone() is None - - def test_compute_file_hash(self, temp_db_path): - """Test compute_file_hash returns correct SHA256 hash.""" - test_file = tmp_path / "test_hash.py" - content = """Test compute_file_hash.""" -def test_compute_file_hash(): - """Create a test file with known content.""" - test_file = tmp_path / "test_content.txt" - test_file.write_text("test content for hashing") - - # Compute hash - store = DeepWikiStore(db_path=temp_db_path) - computed_hash = store.compute_file_hash(test_file) - - assert computed_hash == "a" * 64 + 1" * 64 + 1" * 64 + 1" * 64 + 1" * 64 + 2" * 64 + 3" * 64 + 4" * 64 + 5" * 64 + 6" * 64 + 7" * 64 + 8" * 64 + 9" * 64 + "a" * 64 + "b" * 64 + 1" * 64 + 2" * 64 + 3" * 64 + 4" * 64 + 5" * 64 + 6" * 64 + 7" * 64 + 8" * 64 + 9" * 64 + "\n") - expected_hash = "a" * 64 + "b" * 64 + 1" * 64 + 2" * 64 + 3" * 64 + 4" * 64 + 5" * 64 + 6" * 64 + 7" * 64 + 8" * 64 + 9" * 64 - + hashlib.sha256(test_file.read_bytes()).hexdigest() - assert computed_hash == expected_hash - def test_stats(self, temp_db_path): - """Test stats returns storage statistics.""" - test_file = tmp_path / "test_stats.py" - content = """Test stats.""" -def test_stats(): - store = DeepWikiStore(db_path=temp_db_path) - store.initialize() - - stats = store.stats() - - assert stats["files"] == 1 - assert stats["symbols"] == 0 - assert stats["docs"] == 0 - assert stats["files_needing_docs"] == 1 - assert stats["db_path"] == str(temp_db_path / "deepwiki_test.db") - - # Close store - store.close() - - - # Verify files count - assert stats["files"] == 1 - # Verify symbols count - assert stats["symbols"] == 0 - # Verify docs count - assert stats["docs"] == 0 - # Verify files_needing_docs count - assert stats["files_needing_docs"] == 1 - # Verify db_path - assert stats["db_path"] == str(temp_db_path / "deepwiki_test.db") - - -def test_deepwiki_store_error_handling(): - """Test that DeepWikiStore handles 
Storage errors properly.""" - store = DeepWikiStore(db_path=temp_db_path) - - with pytest.raises(StorageError): - store._create_schema(conn) - - with pytest.raises(StorageError): - store.add_symbol( - DeepWikiSymbol( - name="test", - type="function", - source_file="test.py", - doc_file="test.md", - anchor="test-anchor", - line_range=(1, 10), - ) - ) - - # Test error handling on missing file - os.remove(test_file) - store.add_file(test_file) - - with pytest.raises(FileNotFoundError): - store.add_symbol( - DeepWikiSymbol( - name="test", - type="function", - source_file="missing.py", - doc_file="test.md", - anchor="test-anchor", - line_range=(1, 10), - ) - ) diff --git a/codex-lens/tests/test_deepwiki_types.py b/codex-lens/tests/test_deepwiki_types.py deleted file mode 100644 index 8c5a1c8f..00000000 --- a/codex-lens/tests/test_deepwiki_types.py +++ /dev/null @@ -1,14 +0,0 @@ -"""Unit tests for DeepWiki TypeScript types matching.""" - -from __future__ import annotations - -from pathlib import Path - -from ccw.src.types.deepwiki import ( - DeepWikiSymbol, - DeepWikiDoc, - DeepWikiFile, - DeepWikiStorageStats, -) - - diff --git a/codex-lens/tests/test_dual_fts.py b/codex-lens/tests/test_dual_fts.py deleted file mode 100644 index 1c3e1cb7..00000000 --- a/codex-lens/tests/test_dual_fts.py +++ /dev/null @@ -1,612 +0,0 @@ -"""Tests for Dual-FTS schema migration and functionality (P1). - -Tests dual FTS tables (files_fts_exact, files_fts_fuzzy) creation, trigger synchronization, -and migration from schema version 2 to version 4. 
-""" - -import sqlite3 -import tempfile -from pathlib import Path - -import pytest - -from codexlens.storage.dir_index import DirIndexStore - -# Check if pytest-benchmark is available -try: - import pytest_benchmark - BENCHMARK_AVAILABLE = True -except ImportError: - BENCHMARK_AVAILABLE = False - - -class TestDualFTSSchema: - """Tests for dual FTS schema creation and structure.""" - - @pytest.fixture - def temp_db(self): - """Create temporary database for testing.""" - with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: - db_path = Path(f.name) - yield db_path - # Cleanup - if db_path.exists(): - db_path.unlink() - - @pytest.fixture - def index_store(self, temp_db): - """Create DirIndexStore with initialized database.""" - store = DirIndexStore(temp_db) - store.initialize() - yield store - store.close() - - def test_files_fts_exact_table_exists(self, index_store): - """Test files_fts_exact FTS5 table is created.""" - with index_store._get_connection() as conn: - cursor = conn.execute( - "SELECT name FROM sqlite_master WHERE type='table' AND name='files_fts_exact'" - ) - result = cursor.fetchone() - assert result is not None, "files_fts_exact table should exist" - - def test_files_fts_fuzzy_table_exists(self, index_store): - """Test files_fts_fuzzy FTS5 table is created with trigram tokenizer.""" - with index_store._get_connection() as conn: - cursor = conn.execute( - "SELECT name FROM sqlite_master WHERE type='table' AND name='files_fts_fuzzy'" - ) - result = cursor.fetchone() - assert result is not None, "files_fts_fuzzy table should exist" - - def test_fts_exact_tokenizer(self, index_store): - """Test files_fts_exact uses unicode61 tokenizer.""" - with index_store._get_connection() as conn: - # Check table creation SQL - cursor = conn.execute( - "SELECT sql FROM sqlite_master WHERE name='files_fts_exact'" - ) - result = cursor.fetchone() - assert result is not None - sql = result[0] - # Should use unicode61 tokenizer - assert "unicode61" in 
sql.lower() or "fts5" in sql.lower() - - def test_fts_fuzzy_tokenizer_fallback(self, index_store): - """Test files_fts_fuzzy uses trigram or falls back to unicode61.""" - with index_store._get_connection() as conn: - cursor = conn.execute( - "SELECT sql FROM sqlite_master WHERE name='files_fts_fuzzy'" - ) - result = cursor.fetchone() - assert result is not None - sql = result[0] - # Should use trigram or unicode61 as fallback - assert "trigram" in sql.lower() or "unicode61" in sql.lower() - - def test_dual_fts_trigger_synchronization(self, index_store, temp_db): - """Test triggers keep dual FTS tables synchronized with files table.""" - # Insert test file - test_path = "test/example.py" - test_content = "def test_function():\n pass" - - with index_store._get_connection() as conn: - # Insert into files table - name = test_path.split('/')[-1] - conn.execute( - """INSERT INTO files (name, full_path, content, language, mtime) - VALUES (?, ?, ?, ?, ?)""", - (name, test_path, test_content, "python", 1234567890.0) - ) - conn.commit() - - # Check files_fts_exact has content - cursor = conn.execute( - "SELECT full_path, content FROM files_fts_exact WHERE full_path = ?", - (test_path,) - ) - exact_result = cursor.fetchone() - assert exact_result is not None, "files_fts_exact should have content via trigger" - assert exact_result[0] == test_path - assert exact_result[1] == test_content - - # Check files_fts_fuzzy has content - cursor = conn.execute( - "SELECT full_path, content FROM files_fts_fuzzy WHERE full_path = ?", - (test_path,) - ) - fuzzy_result = cursor.fetchone() - assert fuzzy_result is not None, "files_fts_fuzzy should have content via trigger" - assert fuzzy_result[0] == test_path - assert fuzzy_result[1] == test_content - - def test_dual_fts_update_trigger(self, index_store): - """Test UPDATE triggers synchronize dual FTS tables.""" - test_path = "test/update.py" - original_content = "original content" - updated_content = "updated content" - - with 
index_store._get_connection() as conn: - # Insert - name = test_path.split('/')[-1] - conn.execute( - """INSERT INTO files (name, full_path, content, language, mtime) - VALUES (?, ?, ?, ?, ?)""", - (name, test_path, original_content, "python", 1234567890.0) - ) - conn.commit() - - # Update content - conn.execute( - "UPDATE files SET content = ? WHERE full_path = ?", - (updated_content, test_path) - ) - conn.commit() - - # Verify FTS tables have updated content - cursor = conn.execute( - "SELECT content FROM files_fts_exact WHERE full_path = ?", - (test_path,) - ) - assert cursor.fetchone()[0] == updated_content - - cursor = conn.execute( - "SELECT content FROM files_fts_fuzzy WHERE full_path = ?", - (test_path,) - ) - assert cursor.fetchone()[0] == updated_content - - def test_dual_fts_delete_trigger(self, index_store): - """Test DELETE triggers remove entries from dual FTS tables.""" - test_path = "test/delete.py" - - with index_store._get_connection() as conn: - # Insert - name = test_path.split('/')[-1] - conn.execute( - """INSERT INTO files (name, full_path, content, language, mtime) - VALUES (?, ?, ?, ?, ?)""", - (name, test_path, "content", "python", 1234567890.0) - ) - conn.commit() - - # Delete - conn.execute("DELETE FROM files WHERE full_path = ?", (test_path,)) - conn.commit() - - # Verify FTS tables are cleaned up - cursor = conn.execute( - "SELECT COUNT(*) FROM files_fts_exact WHERE full_path = ?", - (test_path,) - ) - assert cursor.fetchone()[0] == 0 - - cursor = conn.execute( - "SELECT COUNT(*) FROM files_fts_fuzzy WHERE full_path = ?", - (test_path,) - ) - assert cursor.fetchone()[0] == 0 - - -class TestDualFTSMigration: - """Tests for schema migration to dual FTS (v2 → v4).""" - - @pytest.fixture - def v2_db(self): - """Create schema version 2 database (pre-dual-FTS).""" - with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: - db_path = Path(f.name) - - # Create v2 schema manually - conn = sqlite3.connect(db_path) - try: - # Set schema 
version using PRAGMA (not schema_version table) - conn.execute("PRAGMA user_version = 2") - - conn.executescript(""" - CREATE TABLE IF NOT EXISTS files ( - path TEXT PRIMARY KEY, - content TEXT, - language TEXT, - indexed_at TEXT - ); - - CREATE VIRTUAL TABLE IF NOT EXISTS files_fts USING fts5( - path, content, language, - content='files', content_rowid='rowid' - ); - """) - conn.commit() - finally: - conn.close() - - yield db_path - - # Cleanup - if db_path.exists(): - db_path.unlink() - - def test_migration_004_creates_dual_fts(self, v2_db): - """Test migration 004 creates dual FTS tables.""" - # Run migration - store = DirIndexStore(v2_db) - store.initialize() - - try: - # Verify tables exist - with store._get_connection() as conn: - cursor = conn.execute( - """SELECT name FROM sqlite_master - WHERE type='table' AND name IN ('files_fts_exact', 'files_fts_fuzzy')""" - ) - tables = [row[0] for row in cursor.fetchall()] - assert 'files_fts_exact' in tables, "Migration should create files_fts_exact" - assert 'files_fts_fuzzy' in tables, "Migration should create files_fts_fuzzy" - finally: - store.close() - - def test_migration_004_preserves_data(self, v2_db): - """Test migration preserves existing file data.""" - # Insert test data into v2 schema (using 'path' column) - conn = sqlite3.connect(v2_db) - test_files = [ - ("test/file1.py", "content1", "python"), - ("test/file2.js", "content2", "javascript"), - ] - conn.executemany( - "INSERT INTO files (path, content, language) VALUES (?, ?, ?)", - test_files - ) - conn.commit() - conn.close() - - # Run migration - store = DirIndexStore(v2_db) - store.initialize() - - try: - # Verify data preserved (should be migrated to full_path) - with store._get_connection() as conn: - cursor = conn.execute("SELECT full_path, content, language FROM files ORDER BY full_path") - result = [tuple(row) for row in cursor.fetchall()] - assert len(result) == 2 - assert result[0] == test_files[0] - assert result[1] == test_files[1] - 
finally: - store.close() - - def test_migration_004_updates_schema_version(self, v2_db): - """Test migration updates schema_version to 4.""" - # Run migration - store = DirIndexStore(v2_db) - store.initialize() - - try: - with store._get_connection() as conn: - # Check PRAGMA user_version (not schema_version table) - cursor = conn.execute("PRAGMA user_version") - version = cursor.fetchone()[0] - assert version >= 4, "Schema version should be upgraded to 4" - finally: - store.close() - - def test_migration_idempotent(self, v2_db): - """Test migration can run multiple times safely.""" - # Run migration twice - store1 = DirIndexStore(v2_db) - store1.initialize() # First migration - store1.close() - - store2 = DirIndexStore(v2_db) - store2.initialize() # Second migration (should be idempotent) - - try: - # Should not raise errors - with store2._get_connection() as conn: - cursor = conn.execute("SELECT COUNT(*) FROM files_fts_exact") - # Should work without errors - cursor.fetchone() - finally: - store2.close() - - -class TestTrigramAvailability: - """Tests for trigram tokenizer availability and fallback.""" - - @pytest.fixture - def temp_db(self): - """Create temporary database.""" - with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: - db_path = Path(f.name) - yield db_path - if db_path.exists(): - db_path.unlink() - - def test_trigram_detection(self, temp_db): - """Test system detects trigram tokenizer availability.""" - store = DirIndexStore(temp_db) - store.initialize() - - try: - # Check SQLite version and trigram support - with store._get_connection() as conn: - cursor = conn.execute("SELECT sqlite_version()") - version = cursor.fetchone()[0] - print(f"SQLite version: {version}") - - # Try to create trigram FTS table - try: - conn.execute(""" - CREATE VIRTUAL TABLE test_trigram USING fts5( - content, - tokenize='trigram' - ) - """) - trigram_available = True - except sqlite3.OperationalError: - trigram_available = False - - # Cleanup test table - 
if trigram_available: - conn.execute("DROP TABLE IF EXISTS test_trigram") - - # Verify fuzzy table uses appropriate tokenizer - with store._get_connection() as conn: - cursor = conn.execute( - "SELECT sql FROM sqlite_master WHERE name='files_fts_fuzzy'" - ) - result = cursor.fetchone() - assert result is not None - sql = result[0] - - if trigram_available: - assert "trigram" in sql.lower(), "Should use trigram when available" - else: - # Should fallback to unicode61 - assert "unicode61" in sql.lower() or "fts5" in sql.lower() - finally: - store.close() - - -@pytest.mark.benchmark -class TestDualFTSPerformance: - """Benchmark tests for dual FTS overhead.""" - - @pytest.fixture - def populated_db(self): - """Create database with test files.""" - with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: - db_path = Path(f.name) - - store = DirIndexStore(db_path) - store.initialize() - - # Insert 100 test files - with store._get_connection() as conn: - for i in range(100): - path = f"test/file{i}.py" - name = f"file{i}.py" - conn.execute( - """INSERT INTO files (name, full_path, content, language, mtime) - VALUES (?, ?, ?, ?, ?)""", - (name, path, f"def function{i}():\n pass", "python", 1234567890.0) - ) - conn.commit() - - # Close store before yielding to avoid conflicts - store.close() - - yield db_path - - # Cleanup - if db_path.exists(): - db_path.unlink() - - @pytest.mark.skipif(not BENCHMARK_AVAILABLE, reason="pytest-benchmark not installed") - def test_insert_overhead(self, populated_db, benchmark): - """Benchmark INSERT overhead with dual FTS triggers.""" - store = DirIndexStore(populated_db) - store.initialize() - - try: - def insert_file(): - with store._get_connection() as conn: - conn.execute( - """INSERT INTO files (name, full_path, content, language, mtime) - VALUES (?, ?, ?, ?, ?)""", - ("test.py", "benchmark/test.py", "content", "python", 1234567890.0) - ) - conn.commit() - # Cleanup - conn.execute("DELETE FROM files WHERE full_path = 
'benchmark/test.py'") - conn.commit() - - # Should complete in reasonable time (<100ms) - result = benchmark(insert_file) - assert result < 0.1 # 100ms - finally: - store.close() - - def test_search_fts_exact(self, populated_db): - """Test search on files_fts_exact returns results.""" - store = DirIndexStore(populated_db) - store.initialize() - - try: - with store._get_connection() as conn: - # Search for "def" which is a complete token in all files - cursor = conn.execute( - """SELECT full_path, bm25(files_fts_exact) as score - FROM files_fts_exact - WHERE files_fts_exact MATCH 'def' - ORDER BY score - LIMIT 10""" - ) - results = cursor.fetchall() - assert len(results) > 0, "Should find matches in exact FTS" - # Verify BM25 scores (negative = better) - for full_path, score in results: - assert score < 0, "BM25 scores should be negative" - finally: - store.close() - - def test_search_fts_fuzzy(self, populated_db): - """Test search on files_fts_fuzzy returns results.""" - store = DirIndexStore(populated_db) - store.initialize() - - try: - with store._get_connection() as conn: - # Search for "def" which is a complete token in all files - cursor = conn.execute( - """SELECT full_path, bm25(files_fts_fuzzy) as score - FROM files_fts_fuzzy - WHERE files_fts_fuzzy MATCH 'def' - ORDER BY score - LIMIT 10""" - ) - results = cursor.fetchall() - assert len(results) > 0, "Should find matches in fuzzy FTS" - finally: - store.close() - - def test_fuzzy_substring_matching(self, populated_db): - """Test fuzzy search finds partial token matches with trigram.""" - store = DirIndexStore(populated_db) - store.initialize() - - try: - # Check if trigram is available - with store._get_connection() as conn: - cursor = conn.execute( - "SELECT sql FROM sqlite_master WHERE name='files_fts_fuzzy'" - ) - fts_sql = cursor.fetchone()[0] - has_trigram = 'trigram' in fts_sql.lower() - - if not has_trigram: - pytest.skip("Trigram tokenizer not available, skipping fuzzy substring test") - - # Search 
for partial token "func" should match "function0", "function1", etc. - cursor = conn.execute( - """SELECT full_path, bm25(files_fts_fuzzy) as score - FROM files_fts_fuzzy - WHERE files_fts_fuzzy MATCH 'func' - ORDER BY score - LIMIT 10""" - ) - results = cursor.fetchall() - - # With trigram, should find matches - assert len(results) > 0, "Fuzzy search with trigram should find partial token matches" - - # Verify results contain expected files with "function" in content - for path, score in results: - assert "file" in path # All test files named "test/fileN.py" - assert score < 0 # BM25 scores are negative - finally: - store.close() - - -class TestMigrationRecovery: - """Tests for migration failure recovery and edge cases.""" - - @pytest.fixture - def corrupted_v2_db(self): - """Create v2 database with incomplete migration state.""" - with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: - db_path = Path(f.name) - - conn = sqlite3.connect(db_path) - try: - # Create v2 schema with some data - conn.executescript(""" - PRAGMA user_version = 2; - - CREATE TABLE files ( - path TEXT PRIMARY KEY, - content TEXT, - language TEXT - ); - - INSERT INTO files VALUES ('test.py', 'content', 'python'); - - CREATE VIRTUAL TABLE files_fts USING fts5( - path, content, language, - content='files', content_rowid='rowid' - ); - """) - conn.commit() - finally: - conn.close() - - yield db_path - - if db_path.exists(): - db_path.unlink() - - def test_migration_preserves_data_on_failure(self, corrupted_v2_db): - """Test that data is preserved if migration encounters issues.""" - # Read original data - conn = sqlite3.connect(corrupted_v2_db) - cursor = conn.execute("SELECT path, content FROM files") - original_data = cursor.fetchall() - conn.close() - - # Attempt migration (may fail or succeed) - store = DirIndexStore(corrupted_v2_db) - try: - store.initialize() - except Exception: - # Even if migration fails, original data should be intact - pass - finally: - store.close() - - # 
Verify data still exists - conn = sqlite3.connect(corrupted_v2_db) - try: - # Check schema version to determine column name - cursor = conn.execute("PRAGMA user_version") - version = cursor.fetchone()[0] - - if version >= 4: - # Migration succeeded, use new column name - cursor = conn.execute("SELECT full_path, content FROM files WHERE full_path='test.py'") - else: - # Migration failed, use old column name - cursor = conn.execute("SELECT path, content FROM files WHERE path='test.py'") - - result = cursor.fetchone() - - # Data should still be there - assert result is not None, "Data should be preserved after migration attempt" - finally: - conn.close() - - def test_migration_idempotent_after_partial_failure(self, corrupted_v2_db): - """Test migration can be retried after partial failure.""" - store1 = DirIndexStore(corrupted_v2_db) - store2 = DirIndexStore(corrupted_v2_db) - - try: - # First attempt - try: - store1.initialize() - except Exception: - pass # May fail partially - - # Second attempt should succeed or fail gracefully - store2.initialize() # Should not crash - - # Verify database is in usable state - with store2._get_connection() as conn: - cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table'") - tables = [row[0] for row in cursor.fetchall()] - - # Should have files table (either old or new schema) - assert 'files' in tables - finally: - store1.close() - store2.close() - diff --git a/codex-lens/tests/test_embedder.py b/codex-lens/tests/test_embedder.py deleted file mode 100644 index 3d6850a1..00000000 --- a/codex-lens/tests/test_embedder.py +++ /dev/null @@ -1,85 +0,0 @@ -"""Tests for embedder cache concurrency.""" - -from __future__ import annotations - -import threading -import time - -import pytest - -import codexlens.semantic.embedder as embedder_module - - -def _patch_embedder_for_unit_tests(monkeypatch: pytest.MonkeyPatch) -> None: - """Make get_embedder() tests deterministic and fast (no model downloads).""" - - 
monkeypatch.setattr(embedder_module, "SEMANTIC_AVAILABLE", True) - monkeypatch.setattr(embedder_module, "get_optimal_providers", lambda *args, **kwargs: []) - monkeypatch.setattr(embedder_module, "is_gpu_available", lambda: False) - monkeypatch.setattr(embedder_module.Embedder, "_load_model", lambda self: None) - - -def test_embedder_instances_are_cached_and_reused(monkeypatch: pytest.MonkeyPatch) -> None: - _patch_embedder_for_unit_tests(monkeypatch) - embedder_module.clear_embedder_cache() - - first = embedder_module.get_embedder(profile="code", use_gpu=False) - second = embedder_module.get_embedder(profile="code", use_gpu=False) - - assert first is second - - -def test_concurrent_cache_access(monkeypatch: pytest.MonkeyPatch) -> None: - _patch_embedder_for_unit_tests(monkeypatch) - embedder_module.clear_embedder_cache() - - profiles = ["fast", "code", "balanced", "multilingual"] - for profile in profiles: - embedder_module.get_embedder(profile=profile, use_gpu=False) - - errors: list[BaseException] = [] - errors_lock = threading.Lock() - - def record_error(err: BaseException) -> None: - with errors_lock: - errors.append(err) - - worker_count = 20 - start_barrier = threading.Barrier(worker_count + 1) - stop_at = time.monotonic() + 1.0 - - def clear_worker() -> None: - try: - start_barrier.wait() - while time.monotonic() < stop_at: - embedder_module.clear_embedder_cache() - time.sleep(0) - except BaseException as err: - record_error(err) - - def access_worker(profile: str) -> None: - try: - start_barrier.wait() - while time.monotonic() < stop_at: - embedder_module.get_embedder(profile=profile, use_gpu=False) - except BaseException as err: - record_error(err) - - threads: list[threading.Thread] = [ - threading.Thread(target=clear_worker, name="clear-embedder-cache"), - ] - for idx in range(worker_count): - threads.append( - threading.Thread( - target=access_worker, - name=f"get-embedder-{idx}", - args=(profiles[idx % len(profiles)],), - ) - ) - - for thread in 
threads: - thread.start() - for thread in threads: - thread.join(timeout=10) - - assert not errors, f"Unexpected errors during concurrent access: {errors!r}" diff --git a/codex-lens/tests/test_embedding_backend_availability.py b/codex-lens/tests/test_embedding_backend_availability.py deleted file mode 100644 index 70fa5672..00000000 --- a/codex-lens/tests/test_embedding_backend_availability.py +++ /dev/null @@ -1,67 +0,0 @@ -"""Tests for embedding backend availability checks. - -These tests validate the logic used to decide whether embeddings generation -should run for a given backend (fastembed vs. litellm). -""" - -import pytest - - -def test_is_embedding_backend_available_invalid_backend(monkeypatch): - import codexlens.semantic as semantic - - ok, err = semantic.is_embedding_backend_available("nope") - assert ok is False - assert "Invalid embedding backend" in (err or "") - - -def test_is_embedding_backend_available_fastembed_true(monkeypatch): - import codexlens.semantic as semantic - - monkeypatch.setattr(semantic, "SEMANTIC_AVAILABLE", True) - ok, err = semantic.is_embedding_backend_available("fastembed") - assert ok is True - assert err is None - - -def test_is_embedding_backend_available_fastembed_false(monkeypatch): - import codexlens.semantic as semantic - - monkeypatch.setattr(semantic, "SEMANTIC_AVAILABLE", False) - monkeypatch.setattr(semantic, "_import_error", "fastembed missing") - ok, err = semantic.is_embedding_backend_available("fastembed") - assert ok is False - assert err == "fastembed missing" - - -def test_is_embedding_backend_available_litellm_true(monkeypatch): - import codexlens.semantic as semantic - - monkeypatch.setattr(semantic, "LITELLM_AVAILABLE", True) - ok, err = semantic.is_embedding_backend_available("litellm") - assert ok is True - assert err is None - - -def test_is_embedding_backend_available_litellm_false(monkeypatch): - import codexlens.semantic as semantic - - monkeypatch.setattr(semantic, "LITELLM_AVAILABLE", False) - ok, 
err = semantic.is_embedding_backend_available("litellm") - assert ok is False - assert "ccw-litellm not available" in (err or "") - - -def test_generate_embeddings_uses_backend_availability_gate(monkeypatch, tmp_path): - from codexlens.cli import embedding_manager - - monkeypatch.setattr( - embedding_manager, - "is_embedding_backend_available", - lambda _backend: (False, "blocked"), - ) - - result = embedding_manager.generate_embeddings(tmp_path / "_index.db", embedding_backend="litellm") - assert result["success"] is False - assert result["error"] == "blocked" - diff --git a/codex-lens/tests/test_embedding_status_root_model.py b/codex-lens/tests/test_embedding_status_root_model.py deleted file mode 100644 index 7314d205..00000000 --- a/codex-lens/tests/test_embedding_status_root_model.py +++ /dev/null @@ -1,204 +0,0 @@ -import gc -import gc -import shutil -import sqlite3 -import tempfile -import time -from pathlib import Path - -import pytest - -import codexlens.cli.embedding_manager as embedding_manager -from codexlens.cli.embedding_manager import get_embedding_stats_summary, get_embeddings_status - - -@pytest.fixture -def status_temp_dir() -> Path: - temp_path = Path(tempfile.mkdtemp()) - try: - yield temp_path - finally: - gc.collect() - for _ in range(5): - try: - if temp_path.exists(): - shutil.rmtree(temp_path) - break - except PermissionError: - time.sleep(0.1) - - -def _create_index_db(index_path: Path, files: list[str], embedded_files: list[str] | None = None) -> None: - index_path.parent.mkdir(parents=True, exist_ok=True) - with sqlite3.connect(index_path) as conn: - cursor = conn.cursor() - cursor.execute( - """ - CREATE TABLE files ( - id INTEGER PRIMARY KEY, - path TEXT NOT NULL UNIQUE, - content TEXT, - language TEXT, - hash TEXT - ) - """ - ) - cursor.executemany( - "INSERT INTO files (path, content, language, hash) VALUES (?, ?, ?, ?)", - [(file_path, "", "python", f"hash-{idx}") for idx, file_path in enumerate(files)], - ) - - if embedded_files is 
not None: - cursor.execute( - """ - CREATE TABLE semantic_chunks ( - id INTEGER PRIMARY KEY, - file_path TEXT NOT NULL, - content TEXT, - embedding BLOB, - metadata TEXT, - category TEXT - ) - """ - ) - cursor.executemany( - "INSERT INTO semantic_chunks (file_path, content, embedding, metadata, category) VALUES (?, ?, ?, ?, ?)", - [(file_path, "chunk", b"vec", "{}", "code") for file_path in embedded_files], - ) - conn.commit() - - -def _create_vectors_meta_db(meta_path: Path, embedded_files: list[str], binary_vector_count: int = 0) -> None: - meta_path.parent.mkdir(parents=True, exist_ok=True) - with sqlite3.connect(meta_path) as conn: - cursor = conn.cursor() - cursor.execute( - """ - CREATE TABLE chunk_metadata ( - chunk_id INTEGER PRIMARY KEY, - file_path TEXT NOT NULL, - content TEXT, - start_line INTEGER, - end_line INTEGER, - category TEXT, - metadata TEXT, - source_index_db TEXT - ) - """ - ) - cursor.execute( - """ - CREATE TABLE binary_vectors ( - chunk_id INTEGER PRIMARY KEY, - vector BLOB NOT NULL - ) - """ - ) - cursor.executemany( - """ - INSERT INTO chunk_metadata ( - chunk_id, file_path, content, start_line, end_line, category, metadata, source_index_db - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?) 
- """, - [ - (idx, file_path, "chunk", 1, 1, "code", "{}", str(meta_path.parent / "_index.db")) - for idx, file_path in enumerate(embedded_files, start=1) - ], - ) - cursor.executemany( - "INSERT INTO binary_vectors (chunk_id, vector) VALUES (?, ?)", - [(idx, b"\x01") for idx in range(1, binary_vector_count + 1)], - ) - conn.commit() - - -def test_root_status_does_not_inherit_child_embeddings( - monkeypatch: pytest.MonkeyPatch, status_temp_dir: Path -) -> None: - workspace = status_temp_dir / "workspace" - workspace.mkdir() - _create_index_db(workspace / "_index.db", ["a.py", "b.py"]) - _create_index_db(workspace / "child" / "_index.db", ["child.py"], embedded_files=["child.py"]) - - monkeypatch.setattr( - embedding_manager, - "_get_model_info_from_index", - lambda index_path: { - "model_profile": "fast", - "model_name": "unit-test-model", - "embedding_dim": 384, - "backend": "fastembed", - "created_at": "2026-03-13T00:00:00Z", - "updated_at": "2026-03-13T00:00:00Z", - } if index_path.parent.name == "child" else None, - ) - - status = get_embeddings_status(workspace) - assert status["success"] is True - - result = status["result"] - assert result["coverage_percent"] == 0.0 - assert result["files_with_embeddings"] == 0 - assert result["root"]["has_embeddings"] is False - assert result["model_info"] is None - assert result["subtree"]["indexes_with_embeddings"] == 1 - assert result["subtree"]["coverage_percent"] > 0 - - -def test_root_status_uses_validated_centralized_metadata(status_temp_dir: Path) -> None: - workspace = status_temp_dir / "workspace" - workspace.mkdir() - _create_index_db(workspace / "_index.db", ["a.py", "b.py"]) - _create_vectors_meta_db(workspace / "_vectors_meta.db", ["a.py"]) - (workspace / "_vectors.hnsw").write_bytes(b"hnsw") - - status = get_embeddings_status(workspace) - assert status["success"] is True - - result = status["result"] - assert result["coverage_percent"] == 50.0 - assert result["files_with_embeddings"] == 1 - assert 
result["total_chunks"] == 1 - assert result["root"]["has_embeddings"] is True - assert result["root"]["storage_mode"] == "centralized" - assert result["centralized"]["dense_ready"] is True - assert result["centralized"]["usable"] is True - - -def test_embedding_stats_summary_skips_ignored_artifact_indexes(status_temp_dir: Path) -> None: - workspace = status_temp_dir / "workspace" - workspace.mkdir() - _create_index_db(workspace / "_index.db", ["root.py"]) - _create_index_db(workspace / "src" / "_index.db", ["src.py"]) - _create_index_db(workspace / "dist" / "_index.db", ["bundle.py"], embedded_files=["bundle.py"]) - _create_index_db(workspace / ".workflow" / "_index.db", ["trace.py"], embedded_files=["trace.py"]) - - summary = get_embedding_stats_summary(workspace) - - assert summary["success"] is True - result = summary["result"] - assert result["total_indexes"] == 2 - assert {Path(item["path"]).relative_to(workspace).as_posix() for item in result["indexes"]} == { - "_index.db", - "src/_index.db", - } - - -def test_root_status_ignores_empty_centralized_artifacts(status_temp_dir: Path) -> None: - workspace = status_temp_dir / "workspace" - workspace.mkdir() - _create_index_db(workspace / "_index.db", ["a.py", "b.py"]) - _create_vectors_meta_db(workspace / "_vectors_meta.db", []) - (workspace / "_vectors.hnsw").write_bytes(b"hnsw") - (workspace / "_binary_vectors.mmap").write_bytes(b"mmap") - - status = get_embeddings_status(workspace) - assert status["success"] is True - - result = status["result"] - assert result["coverage_percent"] == 0.0 - assert result["files_with_embeddings"] == 0 - assert result["root"]["has_embeddings"] is False - assert result["centralized"]["chunk_metadata_rows"] == 0 - assert result["centralized"]["binary_vector_rows"] == 0 - assert result["centralized"]["usable"] is False diff --git a/codex-lens/tests/test_encoding.py b/codex-lens/tests/test_encoding.py deleted file mode 100644 index 253f82c9..00000000 --- 
a/codex-lens/tests/test_encoding.py +++ /dev/null @@ -1,372 +0,0 @@ -"""Tests for encoding detection module (P1). - -Tests chardet integration, UTF-8 fallback behavior, confidence thresholds, -and safe file reading with error replacement. -""" - -import tempfile -from pathlib import Path -from unittest.mock import Mock, patch - -import pytest - -from codexlens.parsers.encoding import ( - ENCODING_DETECTION_AVAILABLE, - check_encoding_available, - detect_encoding, - is_binary_file, - read_file_safe, -) - - -class TestEncodingDetectionAvailability: - """Tests for encoding detection feature availability.""" - - def test_encoding_available_flag(self): - """Test ENCODING_DETECTION_AVAILABLE flag is boolean.""" - assert isinstance(ENCODING_DETECTION_AVAILABLE, bool) - - def test_check_encoding_available_returns_tuple(self): - """Test check_encoding_available returns (available, error_message).""" - available, error_msg = check_encoding_available() - assert isinstance(available, bool) - if not available: - assert isinstance(error_msg, str) - assert "chardet" in error_msg.lower() or "install" in error_msg.lower() - else: - assert error_msg is None - - -class TestDetectEncoding: - """Tests for detect_encoding function.""" - - def test_detect_utf8_content(self): - """Test detection of UTF-8 encoded content.""" - content = "Hello, World! 你好世界".encode("utf-8") - encoding = detect_encoding(content) - # Should detect UTF-8 or use UTF-8 as fallback - assert encoding.lower() in ["utf-8", "utf8"] - - def test_detect_latin1_content(self): - """Test detection of ISO-8859-1 encoded content.""" - content = "Héllo, Wörld! 
Ñoño".encode("iso-8859-1") - encoding = detect_encoding(content) - # Should detect ISO-8859-1 or fallback to UTF-8 - assert isinstance(encoding, str) - assert len(encoding) > 0 - - def test_detect_gbk_content(self): - """Test detection of GBK encoded content.""" - content = "你好世界 测试文本".encode("gbk") - encoding = detect_encoding(content) - # Should detect GBK or fallback to UTF-8 - assert isinstance(encoding, str) - if ENCODING_DETECTION_AVAILABLE: - # With chardet, should detect CJK encoding or UTF-8 (chardet may detect similar encodings) - valid_encodings = ["gbk", "gb2312", "gb18030", "big5", "utf-8", "utf8", "cp949", "euc-kr", "iso-8859-1"] - assert encoding.lower() in valid_encodings, f"Got unexpected encoding: {encoding}" - else: - # Without chardet, should fallback to UTF-8 - assert encoding.lower() in ["utf-8", "utf8"] - - def test_empty_content_returns_utf8(self): - """Test empty content returns UTF-8 fallback.""" - encoding = detect_encoding(b"") - assert encoding.lower() in ["utf-8", "utf8"] - - @pytest.mark.skipif(not ENCODING_DETECTION_AVAILABLE, reason="chardet not installed") - def test_confidence_threshold_filtering(self): - """Test low-confidence detections are rejected and fallback to UTF-8.""" - # Use sys.modules to mock chardet.detect - import sys - if 'chardet' not in sys.modules: - pytest.skip("chardet not available") - - import chardet - - with patch.object(chardet, "detect") as mock_detect: - mock_detect.return_value = { - "encoding": "windows-1252", - "confidence": 0.3 # Below default threshold of 0.7 - } - content = b"some text" - encoding = detect_encoding(content, confidence_threshold=0.7) - # Should fallback to UTF-8 due to low confidence - assert encoding.lower() in ["utf-8", "utf8"] - - @pytest.mark.skipif(not ENCODING_DETECTION_AVAILABLE, reason="chardet not installed") - def test_high_confidence_accepted(self): - """Test high-confidence detections are accepted.""" - import sys - if 'chardet' not in sys.modules: - pytest.skip("chardet 
not available") - - import chardet - - with patch.object(chardet, "detect") as mock_detect: - mock_detect.return_value = { - "encoding": "utf-8", - "confidence": 0.95 # Above threshold - } - content = b"some text" - encoding = detect_encoding(content, confidence_threshold=0.7) - assert encoding.lower() in ["utf-8", "utf8"] - - @pytest.mark.skipif(not ENCODING_DETECTION_AVAILABLE, reason="chardet not installed") - def test_chardet_exception_fallback(self): - """Test chardet exceptions trigger UTF-8 fallback.""" - import sys - if 'chardet' not in sys.modules: - pytest.skip("chardet not available") - - import chardet - - with patch.object(chardet, "detect", side_effect=Exception("Mock error")): - content = b"some text" - encoding = detect_encoding(content) - # Should fallback gracefully - assert encoding.lower() in ["utf-8", "utf8"] - - def test_fallback_without_chardet(self): - """Test graceful fallback when chardet unavailable.""" - # Temporarily disable chardet - with patch("codexlens.parsers.encoding.ENCODING_DETECTION_AVAILABLE", False): - content = "测试内容".encode("utf-8") - encoding = detect_encoding(content) - assert encoding.lower() in ["utf-8", "utf8"] - - -class TestReadFileSafe: - """Tests for read_file_safe function.""" - - @pytest.fixture - def temp_file(self): - """Create temporary file for testing.""" - with tempfile.NamedTemporaryFile(mode="wb", delete=False, suffix=".txt") as f: - file_path = Path(f.name) - yield file_path - if file_path.exists(): - file_path.unlink() - - def test_read_utf8_file(self, temp_file): - """Test reading UTF-8 encoded file.""" - content_text = "Hello, World! 
你好世界" - temp_file.write_bytes(content_text.encode("utf-8")) - - content, encoding = read_file_safe(temp_file) - assert content == content_text - assert encoding.lower() in ["utf-8", "utf8"] - - def test_read_gbk_file(self, temp_file): - """Test reading GBK encoded file.""" - content_text = "你好世界 测试文本" - temp_file.write_bytes(content_text.encode("gbk")) - - content, encoding = read_file_safe(temp_file) - # Should decode correctly with detected or fallback encoding - assert isinstance(content, str) - if ENCODING_DETECTION_AVAILABLE: - # With chardet, should detect GBK/GB2312/Big5 and decode correctly - # Chardet may detect Big5 for GBK content, which is acceptable - assert "你好" in content or "世界" in content or len(content) > 0 - else: - # Without chardet, UTF-8 fallback with replacement - assert isinstance(content, str) - - def test_read_latin1_file(self, temp_file): - """Test reading ISO-8859-1 encoded file.""" - content_text = "Héllo Wörld" - temp_file.write_bytes(content_text.encode("iso-8859-1")) - - content, encoding = read_file_safe(temp_file) - assert isinstance(content, str) - # Should decode with detected or fallback encoding - assert len(content) > 0 - - def test_error_replacement_preserves_structure(self, temp_file): - """Test errors='replace' preserves file structure with unmappable bytes.""" - # Create file with invalid UTF-8 sequence - invalid_utf8 = b"Valid text\xFF\xFEInvalid bytes\x00More text" - temp_file.write_bytes(invalid_utf8) - - content, encoding = read_file_safe(temp_file) - # Should decode with replacement character - assert "Valid text" in content - assert "More text" in content - # Should contain replacement characters (�) for invalid bytes - assert isinstance(content, str) - - def test_max_detection_bytes_parameter(self, temp_file): - """Test max_detection_bytes limits encoding detection sample size.""" - # Create large file - large_content = ("测试内容 " * 10000).encode("utf-8") # ~60KB - temp_file.write_bytes(large_content) - - # Use small 
detection sample - content, encoding = read_file_safe(temp_file, max_detection_bytes=1000) - assert isinstance(content, str) - assert len(content) > 0 - - def test_confidence_threshold_parameter(self, temp_file): - """Test confidence_threshold parameter affects detection.""" - content_text = "Sample text for encoding detection" - temp_file.write_bytes(content_text.encode("utf-8")) - - # High threshold - content_high, encoding_high = read_file_safe(temp_file, confidence_threshold=0.9) - assert isinstance(content_high, str) - - # Low threshold - content_low, encoding_low = read_file_safe(temp_file, confidence_threshold=0.5) - assert isinstance(content_low, str) - - def test_read_nonexistent_file_raises(self): - """Test reading nonexistent file raises OSError.""" - with pytest.raises(OSError): - read_file_safe(Path("/nonexistent/path/file.txt")) - - def test_read_directory_raises(self, tmp_path): - """Test reading directory raises IsADirectoryError.""" - with pytest.raises((IsADirectoryError, OSError)): - read_file_safe(tmp_path) - - def test_read_empty_file(self, temp_file): - """Test reading empty file returns empty string.""" - temp_file.write_bytes(b"") - content, encoding = read_file_safe(temp_file) - assert content == "" - assert encoding.lower() in ["utf-8", "utf8"] - - -class TestIsBinaryFile: - """Tests for is_binary_file function.""" - - @pytest.fixture - def temp_file(self): - """Create temporary file for testing.""" - with tempfile.NamedTemporaryFile(mode="wb", delete=False) as f: - file_path = Path(f.name) - yield file_path - if file_path.exists(): - file_path.unlink() - - def test_text_file_not_binary(self, temp_file): - """Test text file is not classified as binary.""" - temp_file.write_bytes(b"This is a text file\nWith multiple lines\n") - assert not is_binary_file(temp_file) - - def test_binary_file_with_null_bytes(self, temp_file): - """Test file with >30% null bytes is classified as binary.""" - # Create file with high null byte ratio - 
binary_content = b"\x00" * 5000 + b"text" * 100 - temp_file.write_bytes(binary_content) - assert is_binary_file(temp_file) - - def test_binary_file_with_non_text_chars(self, temp_file): - """Test file with high non-text character ratio is binary.""" - # Create file with non-printable characters - binary_content = bytes(range(0, 256)) * 50 - temp_file.write_bytes(binary_content) - # Should be classified as binary due to high non-text ratio - result = is_binary_file(temp_file) - # May or may not be binary depending on exact ratio - assert isinstance(result, bool) - - def test_empty_file_not_binary(self, temp_file): - """Test empty file is not classified as binary.""" - temp_file.write_bytes(b"") - assert not is_binary_file(temp_file) - - def test_utf8_text_not_binary(self, temp_file): - """Test UTF-8 text file is not classified as binary.""" - temp_file.write_bytes("你好世界 Hello World".encode("utf-8")) - assert not is_binary_file(temp_file) - - def test_sample_size_parameter(self, temp_file): - """Test sample_size parameter limits bytes checked.""" - # Create large file with text at start, binary later - content = b"Text content" * 1000 + b"\x00" * 10000 - temp_file.write_bytes(content) - - # Small sample should see only text - assert not is_binary_file(temp_file, sample_size=100) - - # Large sample should see binary content - result = is_binary_file(temp_file, sample_size=20000) - assert isinstance(result, bool) - - def test_tabs_newlines_not_counted_as_non_text(self, temp_file): - """Test tabs and newlines are not counted as non-text characters.""" - content = b"Line 1\nLine 2\tTabbed\rCarriage return\n" - temp_file.write_bytes(content) - assert not is_binary_file(temp_file) - - -@pytest.mark.parametrize("encoding,test_content", [ - ("utf-8", "Hello 世界 🌍"), - ("gbk", "你好世界"), - ("iso-8859-1", "Héllo Wörld"), - ("windows-1252", "Smart quotes test"), -]) -class TestEncodingParameterized: - """Parameterized tests for various encodings.""" - - def 
test_detect_and_decode(self, encoding, test_content): - """Test detection and decoding roundtrip for various encodings.""" - # Skip if encoding not supported - try: - encoded = test_content.encode(encoding) - except (UnicodeEncodeError, LookupError): - pytest.skip(f"Encoding {encoding} not supported") - - detected = detect_encoding(encoded) - assert isinstance(detected, str) - - # Decode with detected encoding (with fallback) - try: - decoded = encoded.decode(detected, errors='replace') - assert isinstance(decoded, str) - except (UnicodeDecodeError, LookupError): - # Fallback to UTF-8 - decoded = encoded.decode('utf-8', errors='replace') - assert isinstance(decoded, str) - - -@pytest.mark.skipif(ENCODING_DETECTION_AVAILABLE, reason="Test fallback behavior when chardet unavailable") -class TestWithoutChardet: - """Tests for behavior when chardet is not available.""" - - def test_all_functions_work_without_chardet(self): - """Test all encoding functions work gracefully without chardet.""" - content = b"Test content" - - # Should all return UTF-8 fallback - encoding = detect_encoding(content) - assert encoding.lower() in ["utf-8", "utf8"] - - available, error = check_encoding_available() - assert not available - assert error is not None - - -@pytest.mark.skipif(not ENCODING_DETECTION_AVAILABLE, reason="Requires chardet") -class TestWithChardet: - """Tests for behavior when chardet is available.""" - - def test_chardet_available_flag(self): - """Test ENCODING_DETECTION_AVAILABLE is True when chardet installed.""" - assert ENCODING_DETECTION_AVAILABLE is True - - def test_check_encoding_available(self): - """Test check_encoding_available returns success.""" - available, error = check_encoding_available() - assert available is True - assert error is None - - def test_detect_encoding_uses_chardet(self): - """Test detect_encoding uses chardet when available.""" - content = "你好世界".encode("gbk") - encoding = detect_encoding(content) - # Should detect GBK or related encoding 
- assert isinstance(encoding, str) - assert len(encoding) > 0 diff --git a/codex-lens/tests/test_enrichment.py b/codex-lens/tests/test_enrichment.py deleted file mode 100644 index 8b07b385..00000000 --- a/codex-lens/tests/test_enrichment.py +++ /dev/null @@ -1,234 +0,0 @@ -"""Tests for search result enrichment with relationship data.""" -import sqlite3 -import tempfile -import time -from pathlib import Path - -import pytest - -from codexlens.search.enrichment import RelationshipEnricher - - -@pytest.fixture -def mock_db(): - """Create a mock database with symbols and relationships.""" - with tempfile.TemporaryDirectory() as tmpdir: - db_path = Path(tmpdir) / "_index.db" - conn = sqlite3.connect(str(db_path)) - cursor = conn.cursor() - - # Create schema - cursor.execute(''' - CREATE TABLE symbols ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - qualified_name TEXT NOT NULL, - name TEXT NOT NULL, - kind TEXT NOT NULL, - file_path TEXT NOT NULL, - start_line INTEGER NOT NULL, - end_line INTEGER NOT NULL - ) - ''') - cursor.execute(''' - CREATE TABLE symbol_relationships ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - source_symbol_id INTEGER NOT NULL, - target_symbol_fqn TEXT NOT NULL, - relationship_type TEXT NOT NULL, - file_path TEXT NOT NULL, - line INTEGER, - FOREIGN KEY (source_symbol_id) REFERENCES symbols(id) - ) - ''') - - # Insert test data - cursor.execute(''' - INSERT INTO symbols (qualified_name, name, kind, file_path, start_line, end_line) - VALUES ('module.main', 'main', 'function', 'module.py', 1, 10) - ''') - main_id = cursor.lastrowid - - cursor.execute(''' - INSERT INTO symbols (qualified_name, name, kind, file_path, start_line, end_line) - VALUES ('module.helper', 'helper', 'function', 'module.py', 12, 20) - ''') - helper_id = cursor.lastrowid - - cursor.execute(''' - INSERT INTO symbols (qualified_name, name, kind, file_path, start_line, end_line) - VALUES ('utils.fetch', 'fetch', 'function', 'utils.py', 1, 5) - ''') - fetch_id = cursor.lastrowid - - # 
main calls helper - cursor.execute(''' - INSERT INTO symbol_relationships (source_symbol_id, target_symbol_fqn, relationship_type, file_path, line) - VALUES (?, 'helper', 'calls', 'module.py', 5) - ''', (main_id,)) - - # main calls fetch - cursor.execute(''' - INSERT INTO symbol_relationships (source_symbol_id, target_symbol_fqn, relationship_type, file_path, line) - VALUES (?, 'utils.fetch', 'calls', 'module.py', 6) - ''', (main_id,)) - - # helper imports os - cursor.execute(''' - INSERT INTO symbol_relationships (source_symbol_id, target_symbol_fqn, relationship_type, file_path, line) - VALUES (?, 'os', 'imports', 'module.py', 13) - ''', (helper_id,)) - - conn.commit() - conn.close() - - yield db_path - - -class TestRelationshipEnricher: - """Test suite for RelationshipEnricher.""" - - def test_enrich_with_relationships(self, mock_db): - """Test enriching results with valid relationships.""" - with RelationshipEnricher(mock_db) as enricher: - results = [ - {"path": "module.py", "score": 0.9, "excerpt": "def main():", "symbol": "main"}, - {"path": "module.py", "score": 0.8, "excerpt": "def helper():", "symbol": "helper"}, - ] - - enriched = enricher.enrich(results, limit=10) - - # Check main's relationships - main_result = enriched[0] - assert "relationships" in main_result - main_rels = main_result["relationships"] - assert len(main_rels) >= 2 - - # Verify outgoing relationships - outgoing = [r for r in main_rels if r["direction"] == "outgoing"] - targets = [r["target"] for r in outgoing] - assert "helper" in targets or any("helper" in t for t in targets) - - # Check helper's relationships - helper_result = enriched[1] - assert "relationships" in helper_result - helper_rels = helper_result["relationships"] - assert len(helper_rels) >= 1 - - # Verify incoming relationships (main calls helper) - incoming = [r for r in helper_rels if r["direction"] == "incoming"] - assert len(incoming) >= 1 - assert incoming[0]["type"] == "called_by" - - def 
test_enrich_missing_symbol(self, mock_db): - """Test graceful handling of missing symbols.""" - with RelationshipEnricher(mock_db) as enricher: - results = [ - {"path": "unknown.py", "score": 0.9, "excerpt": "code", "symbol": "nonexistent"}, - ] - - enriched = enricher.enrich(results, limit=10) - - # Should return empty relationships, not crash - assert "relationships" in enriched[0] - assert enriched[0]["relationships"] == [] - - def test_enrich_no_symbol_name(self, mock_db): - """Test handling results without symbol names.""" - with RelationshipEnricher(mock_db) as enricher: - results = [ - {"path": "module.py", "score": 0.9, "excerpt": "code", "symbol": None}, - ] - - enriched = enricher.enrich(results, limit=10) - - assert "relationships" in enriched[0] - assert enriched[0]["relationships"] == [] - - def test_enrich_performance(self, mock_db): - """Test that enrichment is fast (<100ms for 10 results).""" - with RelationshipEnricher(mock_db) as enricher: - results = [ - {"path": "module.py", "score": 0.9, "excerpt": f"code{i}", "symbol": "main"} - for i in range(10) - ] - - start = time.perf_counter() - enricher.enrich(results, limit=10) - elapsed_ms = (time.perf_counter() - start) * 1000 - - assert elapsed_ms < 100, f"Enrichment took {elapsed_ms:.1f}ms, expected < 100ms" - - def test_enrich_limit(self, mock_db): - """Test that limit parameter is respected.""" - with RelationshipEnricher(mock_db) as enricher: - results = [ - {"path": "module.py", "score": 0.9, "symbol": "main"}, - {"path": "module.py", "score": 0.8, "symbol": "helper"}, - {"path": "utils.py", "score": 0.7, "symbol": "fetch"}, - ] - - # Only enrich first 2 - enriched = enricher.enrich(results, limit=2) - - assert "relationships" in enriched[0] - assert "relationships" in enriched[1] - # Third result should NOT have relationships key - assert "relationships" not in enriched[2] - - def test_connection_failure_graceful(self): - """Test graceful handling when database doesn't exist.""" - nonexistent 
= Path("/nonexistent/path/_index.db") - with RelationshipEnricher(nonexistent) as enricher: - results = [{"path": "test.py", "score": 0.9, "symbol": "test"}] - enriched = enricher.enrich(results) - - # Should return original results without crashing - assert len(enriched) == 1 - - def test_incoming_type_conversion(self, mock_db): - """Test that relationship types are correctly converted for incoming.""" - with RelationshipEnricher(mock_db) as enricher: - results = [ - {"path": "module.py", "score": 0.9, "symbol": "helper"}, - ] - - enriched = enricher.enrich(results) - rels = enriched[0]["relationships"] - - incoming = [r for r in rels if r["direction"] == "incoming"] - if incoming: - # calls should become called_by - assert incoming[0]["type"] == "called_by" - - def test_context_manager(self, mock_db): - """Test that context manager properly opens and closes connections.""" - enricher = RelationshipEnricher(mock_db) - assert enricher.db_conn is not None - - enricher.close() - assert enricher.db_conn is None - - # Using context manager - with RelationshipEnricher(mock_db) as e: - assert e.db_conn is not None - assert e.db_conn is None - - def test_relationship_data_structure(self, mock_db): - """Test that relationship data has correct structure.""" - with RelationshipEnricher(mock_db) as enricher: - results = [{"path": "module.py", "score": 0.9, "symbol": "main"}] - enriched = enricher.enrich(results) - - rels = enriched[0]["relationships"] - for rel in rels: - # All relationships should have required fields - assert "type" in rel - assert "direction" in rel - assert "file" in rel - assert rel["direction"] in ["outgoing", "incoming"] - - # Outgoing should have target, incoming should have source - if rel["direction"] == "outgoing": - assert "target" in rel - else: - assert "source" in rel diff --git a/codex-lens/tests/test_entities.py b/codex-lens/tests/test_entities.py deleted file mode 100644 index e038a70e..00000000 --- a/codex-lens/tests/test_entities.py +++ 
/dev/null @@ -1,245 +0,0 @@ -"""Tests for CodexLens entity models.""" - -import pytest -from pydantic import ValidationError - -from codexlens.entities import IndexedFile, SearchResult, SemanticChunk, Symbol - - -class TestSymbol: - """Tests for Symbol entity.""" - - def test_create_valid_symbol(self): - """Test creating a valid symbol.""" - symbol = Symbol(name="hello", kind="function", range=(1, 10)) - assert symbol.name == "hello" - assert symbol.kind == "function" - assert symbol.range == (1, 10) - - def test_symbol_range_validation(self): - """Test that range values must be valid.""" - # Range must have start >= 1 - with pytest.raises(ValidationError): - Symbol(name="test", kind="function", range=(0, 5)) - - # Range must have end >= start - with pytest.raises(ValidationError): - Symbol(name="test", kind="function", range=(5, 3)) - - # Both values must be >= 1 - with pytest.raises(ValidationError): - Symbol(name="test", kind="function", range=(-1, 5)) - - def test_symbol_name_required(self): - """Test that name is required and non-empty.""" - with pytest.raises(ValidationError): - Symbol(name="", kind="function", range=(1, 1)) - - def test_symbol_kind_required(self): - """Test that kind is required and non-empty.""" - with pytest.raises(ValidationError): - Symbol(name="test", kind="", range=(1, 1)) - - def test_symbol_equal_range(self): - """Test symbol with equal start and end line.""" - symbol = Symbol(name="one_liner", kind="function", range=(5, 5)) - assert symbol.range == (5, 5) - - -class TestSemanticChunk: - """Tests for SemanticChunk entity.""" - - def test_create_chunk_without_embedding(self): - """Test creating a chunk without embedding.""" - chunk = SemanticChunk(content="def hello(): pass") - assert chunk.content == "def hello(): pass" - assert chunk.embedding is None - assert chunk.metadata == {} - - def test_create_chunk_with_embedding(self): - """Test creating a chunk with embedding.""" - embedding = [0.1, 0.2, 0.3, 0.4] - chunk = 
SemanticChunk(content="some code", embedding=embedding) - assert chunk.embedding == embedding - - def test_chunk_with_metadata(self): - """Test creating a chunk with metadata.""" - metadata = {"file": "test.py", "language": "python", "line": 10} - chunk = SemanticChunk(content="code", metadata=metadata) - assert chunk.metadata == metadata - - def test_chunk_content_required(self): - """Test that content is required and non-empty.""" - with pytest.raises(ValidationError): - SemanticChunk(content="") - - def test_chunk_embedding_validation(self): - """Test that embedding cannot be empty list when provided.""" - with pytest.raises(ValidationError): - SemanticChunk(content="code", embedding=[]) - - def test_chunk_embedding_with_floats(self): - """Test embedding with various float values.""" - embedding = [0.0, 1.0, -0.5, 0.123456789] - chunk = SemanticChunk(content="code", embedding=embedding) - assert chunk.embedding == embedding - - def test_chunk_zero_vector_validation(self): - """Test that zero vector embeddings are rejected.""" - with pytest.raises(ValidationError) as exc: - SemanticChunk(content="code", embedding=[0.0, 0.0, 0.0, 0.0]) - assert "zero vector" in str(exc.value).lower() - - def test_chunk_near_zero_vector_validation(self): - """Test that near-zero vector embeddings are rejected.""" - with pytest.raises(ValidationError) as exc: - SemanticChunk(content="code", embedding=[1e-11, 1e-11, 1e-11]) - assert "zero vector" in str(exc.value).lower() - - def test_chunk_small_nonzero_vector_validation(self): - """Test that small but non-zero embeddings are allowed.""" - embedding = [0.001, 0.001, 0.001] - chunk = SemanticChunk(content="code", embedding=embedding) - assert chunk.embedding == embedding - - -class TestIndexedFile: - """Tests for IndexedFile entity.""" - - def test_create_empty_indexed_file(self): - """Test creating an indexed file with no symbols or chunks.""" - indexed = IndexedFile(path="/test/file.py", language="python") - assert indexed.path == 
"/test/file.py" - assert indexed.language == "python" - assert indexed.symbols == [] - assert indexed.chunks == [] - - def test_create_indexed_file_with_symbols(self): - """Test creating an indexed file with symbols.""" - symbols = [ - Symbol(name="MyClass", kind="class", range=(1, 10)), - Symbol(name="my_func", kind="function", range=(12, 20)), - ] - indexed = IndexedFile( - path="/test/file.py", - language="python", - symbols=symbols, - ) - assert len(indexed.symbols) == 2 - assert indexed.symbols[0].name == "MyClass" - - def test_create_indexed_file_with_chunks(self): - """Test creating an indexed file with chunks.""" - chunks = [ - SemanticChunk(content="chunk 1", metadata={"line": 1}), - SemanticChunk(content="chunk 2", metadata={"line": 10}), - ] - indexed = IndexedFile( - path="/test/file.py", - language="python", - chunks=chunks, - ) - assert len(indexed.chunks) == 2 - - def test_indexed_file_path_strip(self): - """Test that path is stripped of whitespace.""" - indexed = IndexedFile(path=" /test/file.py ", language="python") - assert indexed.path == "/test/file.py" - - def test_indexed_file_language_strip(self): - """Test that language is stripped of whitespace.""" - indexed = IndexedFile(path="/test/file.py", language=" python ") - assert indexed.language == "python" - - def test_indexed_file_path_required(self): - """Test that path is required and non-blank.""" - with pytest.raises(ValidationError): - IndexedFile(path="", language="python") - - with pytest.raises(ValidationError): - IndexedFile(path=" ", language="python") - - def test_indexed_file_language_required(self): - """Test that language is required and non-blank.""" - with pytest.raises(ValidationError): - IndexedFile(path="/test/file.py", language="") - - -class TestSearchResult: - """Tests for SearchResult entity.""" - - def test_create_minimal_search_result(self): - """Test creating a minimal search result.""" - result = SearchResult(path="/test/file.py", score=0.95) - assert result.path == 
"/test/file.py" - assert result.score == 0.95 - assert result.excerpt is None - assert result.symbol is None - assert result.chunk is None - assert result.metadata == {} - - def test_create_full_search_result(self): - """Test creating a search result with all fields.""" - symbol = Symbol(name="test", kind="function", range=(1, 5)) - chunk = SemanticChunk(content="test code") - result = SearchResult( - path="/test/file.py", - score=0.88, - excerpt="...matching code...", - symbol=symbol, - chunk=chunk, - metadata={"match_type": "fts"}, - ) - assert result.excerpt == "...matching code..." - assert result.symbol.name == "test" - assert result.chunk.content == "test code" - - def test_search_result_score_validation(self): - """Test that score must be >= 0.""" - with pytest.raises(ValidationError): - SearchResult(path="/test/file.py", score=-0.1) - - def test_search_result_zero_score(self): - """Test that zero score is valid.""" - result = SearchResult(path="/test/file.py", score=0.0) - assert result.score == 0.0 - - def test_search_result_path_required(self): - """Test that path is required and non-empty.""" - with pytest.raises(ValidationError): - SearchResult(path="", score=0.5) - - -class TestEntitySerialization: - """Tests for entity serialization.""" - - def test_symbol_model_dump(self): - """Test Symbol serialization.""" - symbol = Symbol(name="test", kind="function", range=(1, 10)) - data = symbol.model_dump() - assert data == { - "name": "test", - "kind": "function", - "range": (1, 10), - "file": None, - } - - def test_indexed_file_model_dump(self): - """Test IndexedFile serialization.""" - indexed = IndexedFile( - path="/test.py", - language="python", - symbols=[Symbol(name="foo", kind="function", range=(1, 1))], - ) - data = indexed.model_dump() - assert data["path"] == "/test.py" - assert data["language"] == "python" - assert len(data["symbols"]) == 1 - - def test_search_result_model_dump(self): - """Test SearchResult serialization.""" - result = 
SearchResult(path="/test.py", score=0.5, excerpt="test") - data = result.model_dump() - assert data["path"] == "/test.py" - assert data["score"] == 0.5 - assert data["excerpt"] == "test" diff --git a/codex-lens/tests/test_errors.py b/codex-lens/tests/test_errors.py deleted file mode 100644 index 09394a5a..00000000 --- a/codex-lens/tests/test_errors.py +++ /dev/null @@ -1,165 +0,0 @@ -"""Tests for CodexLens error classes.""" - -import pytest - -from codexlens.errors import ( - CodexLensError, - ConfigError, - ParseError, - SearchError, - StorageError, -) - - -class TestErrorHierarchy: - """Tests for error class hierarchy.""" - - def test_codexlens_error_is_exception(self): - """Test that CodexLensError is an Exception.""" - assert issubclass(CodexLensError, Exception) - - def test_config_error_inherits_from_base(self): - """Test ConfigError inherits from CodexLensError.""" - assert issubclass(ConfigError, CodexLensError) - - def test_parse_error_inherits_from_base(self): - """Test ParseError inherits from CodexLensError.""" - assert issubclass(ParseError, CodexLensError) - - def test_storage_error_inherits_from_base(self): - """Test StorageError inherits from CodexLensError.""" - assert issubclass(StorageError, CodexLensError) - - def test_search_error_inherits_from_base(self): - """Test SearchError inherits from CodexLensError.""" - assert issubclass(SearchError, CodexLensError) - - -class TestErrorMessages: - """Tests for error message handling.""" - - def test_codexlens_error_with_message(self): - """Test creating CodexLensError with message.""" - error = CodexLensError("Something went wrong") - assert str(error) == "Something went wrong" - - def test_config_error_with_message(self): - """Test creating ConfigError with message.""" - error = ConfigError("Invalid configuration") - assert str(error) == "Invalid configuration" - - def test_parse_error_with_message(self): - """Test creating ParseError with message.""" - error = ParseError("Failed to parse file.py") - 
assert str(error) == "Failed to parse file.py" - - def test_storage_error_with_message(self): - """Test creating StorageError with message.""" - error = StorageError("Database connection failed") - assert str(error) == "Database connection failed" - - def test_search_error_with_message(self): - """Test creating SearchError with message.""" - error = SearchError("FTS query syntax error") - assert str(error) == "FTS query syntax error" - - -class TestErrorRaising: - """Tests for raising and catching errors.""" - - def test_catch_specific_error(self): - """Test catching specific error type.""" - with pytest.raises(ConfigError): - raise ConfigError("test") - - def test_catch_base_error(self): - """Test catching base error type catches all subtypes.""" - with pytest.raises(CodexLensError): - raise ConfigError("test") - - with pytest.raises(CodexLensError): - raise ParseError("test") - - with pytest.raises(CodexLensError): - raise StorageError("test") - - with pytest.raises(CodexLensError): - raise SearchError("test") - - def test_error_not_caught_as_wrong_type(self): - """Test that errors aren't caught as wrong type.""" - with pytest.raises(ConfigError): - try: - raise ConfigError("config issue") - except ParseError: - pass # This should not catch ConfigError - - -class TestErrorChaining: - """Tests for error chaining.""" - - def test_error_with_cause(self): - """Test error chaining with __cause__.""" - original = ValueError("original error") - try: - raise StorageError("storage failed") from original - except StorageError as e: - assert e.__cause__ is original - - def test_nested_error_handling(self): - """Test nested error handling pattern.""" - def inner_function(): - raise ValueError("inner error") - - def outer_function(): - try: - inner_function() - except ValueError as e: - raise ParseError("outer error") from e - - with pytest.raises(ParseError) as exc_info: - outer_function() - - assert exc_info.value.__cause__ is not None - assert 
isinstance(exc_info.value.__cause__, ValueError) - - -class TestErrorUsagePatterns: - """Tests for common error usage patterns.""" - - def test_error_in_context_manager(self): - """Test error handling in context manager.""" - class FakeStore: - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - return False # Don't suppress exceptions - - def query(self): - raise StorageError("query failed") - - with pytest.raises(StorageError): - with FakeStore() as store: - store.query() - - def test_error_comparison(self): - """Test error instance comparison.""" - error1 = ConfigError("test") - error2 = ConfigError("test") - # Different instances, even with same message - assert error1 is not error2 - # But same string representation - assert str(error1) == str(error2) - - def test_empty_error_message(self): - """Test error with empty message.""" - error = CodexLensError("") - assert str(error) == "" - - def test_error_with_format_args(self): - """Test error with formatted message.""" - path = "/test/file.py" - error = ParseError(f"Failed to parse {path}: syntax error on line 10") - assert "/test/file.py" in str(error) - assert "line 10" in str(error) diff --git a/codex-lens/tests/test_file_cache.py b/codex-lens/tests/test_file_cache.py deleted file mode 100644 index 72223a7f..00000000 --- a/codex-lens/tests/test_file_cache.py +++ /dev/null @@ -1,224 +0,0 @@ -"""Tests for CodexLens file cache.""" - -import tempfile -from pathlib import Path - -import pytest - -from codexlens.storage.file_cache import FileCache - - -class TestFileCache: - """Tests for FileCache class.""" - - def test_create_cache(self): - """Test creating a FileCache instance.""" - with tempfile.TemporaryDirectory() as tmpdir: - cache = FileCache(cache_path=Path(tmpdir)) - assert cache.cache_path == Path(tmpdir) - - def test_store_and_load_mtime(self): - """Test storing and loading mtime.""" - with tempfile.TemporaryDirectory() as tmpdir: - cache = 
FileCache(cache_path=Path(tmpdir)) - file_path = Path("/test/file.py") - mtime = 1234567890.123 - - cache.store_mtime(file_path, mtime) - loaded = cache.load_mtime(file_path) - - assert loaded == mtime - - def test_load_nonexistent_mtime(self): - """Test loading mtime for uncached file returns None.""" - with tempfile.TemporaryDirectory() as tmpdir: - cache = FileCache(cache_path=Path(tmpdir)) - file_path = Path("/nonexistent/file.py") - - loaded = cache.load_mtime(file_path) - - assert loaded is None - - def test_update_mtime(self): - """Test updating existing mtime.""" - with tempfile.TemporaryDirectory() as tmpdir: - cache = FileCache(cache_path=Path(tmpdir)) - file_path = Path("/test/file.py") - - cache.store_mtime(file_path, 1000.0) - cache.store_mtime(file_path, 2000.0) - loaded = cache.load_mtime(file_path) - - assert loaded == 2000.0 - - def test_multiple_files(self): - """Test caching multiple files.""" - with tempfile.TemporaryDirectory() as tmpdir: - cache = FileCache(cache_path=Path(tmpdir)) - - files = { - Path("/test/a.py"): 1000.0, - Path("/test/b.py"): 2000.0, - Path("/test/c.py"): 3000.0, - } - - for path, mtime in files.items(): - cache.store_mtime(path, mtime) - - for path, expected_mtime in files.items(): - loaded = cache.load_mtime(path) - assert loaded == expected_mtime - - -class TestFileCacheKeyGeneration: - """Tests for cache key generation.""" - - def test_key_for_simple_path(self): - """Test key generation for simple path.""" - with tempfile.TemporaryDirectory() as tmpdir: - cache = FileCache(cache_path=Path(tmpdir)) - key = cache._key_for(Path("test.py")) - assert key.endswith(".mtime") - - def test_key_for_path_with_slashes(self): - """Test key generation for path with slashes.""" - with tempfile.TemporaryDirectory() as tmpdir: - cache = FileCache(cache_path=Path(tmpdir)) - key = cache._key_for(Path("/path/to/file.py")) - assert "/" not in key - assert key.endswith(".mtime") - - def test_key_for_windows_path(self): - """Test key 
generation for Windows-style path.""" - with tempfile.TemporaryDirectory() as tmpdir: - cache = FileCache(cache_path=Path(tmpdir)) - key = cache._key_for(Path("C:\\Users\\test\\file.py")) - assert "\\" not in key - assert ":" not in key - assert key.endswith(".mtime") - - def test_different_paths_different_keys(self): - """Test that different paths produce different keys.""" - with tempfile.TemporaryDirectory() as tmpdir: - cache = FileCache(cache_path=Path(tmpdir)) - key1 = cache._key_for(Path("/test/a.py")) - key2 = cache._key_for(Path("/test/b.py")) - assert key1 != key2 - - -class TestFileCacheDirectoryCreation: - """Tests for cache directory creation.""" - - def test_creates_cache_directory(self): - """Test that cache directory is created when storing.""" - with tempfile.TemporaryDirectory() as tmpdir: - cache_path = Path(tmpdir) / "new_cache_dir" - cache = FileCache(cache_path=cache_path) - - assert not cache_path.exists() - - cache.store_mtime(Path("/test.py"), 1000.0) - - assert cache_path.exists() - - def test_nested_cache_directory(self): - """Test creating nested cache directory.""" - with tempfile.TemporaryDirectory() as tmpdir: - cache_path = Path(tmpdir) / "a" / "b" / "c" / "cache" - cache = FileCache(cache_path=cache_path) - - cache.store_mtime(Path("/test.py"), 1000.0) - - assert cache_path.exists() - - -class TestFileCacheEdgeCases: - """Edge case tests for FileCache.""" - - def test_mtime_precision(self): - """Test that mtime precision is preserved.""" - with tempfile.TemporaryDirectory() as tmpdir: - cache = FileCache(cache_path=Path(tmpdir)) - precise_mtime = 1234567890.123456789 - - cache.store_mtime(Path("/test.py"), precise_mtime) - loaded = cache.load_mtime(Path("/test.py")) - - # Should preserve reasonable precision - assert abs(loaded - precise_mtime) < 0.0001 - - def test_zero_mtime(self): - """Test storing zero mtime.""" - with tempfile.TemporaryDirectory() as tmpdir: - cache = FileCache(cache_path=Path(tmpdir)) - - 
cache.store_mtime(Path("/test.py"), 0.0) - loaded = cache.load_mtime(Path("/test.py")) - - assert loaded == 0.0 - - def test_negative_mtime(self): - """Test storing negative mtime (edge case).""" - with tempfile.TemporaryDirectory() as tmpdir: - cache = FileCache(cache_path=Path(tmpdir)) - - cache.store_mtime(Path("/test.py"), -1000.0) - loaded = cache.load_mtime(Path("/test.py")) - - assert loaded == -1000.0 - - def test_large_mtime(self): - """Test storing large mtime value.""" - with tempfile.TemporaryDirectory() as tmpdir: - cache = FileCache(cache_path=Path(tmpdir)) - large_mtime = 9999999999.999 - - cache.store_mtime(Path("/test.py"), large_mtime) - loaded = cache.load_mtime(Path("/test.py")) - - assert loaded == large_mtime - - def test_unicode_path(self): - """Test path with unicode characters.""" - with tempfile.TemporaryDirectory() as tmpdir: - cache = FileCache(cache_path=Path(tmpdir)) - unicode_path = Path("/测试/文件.py") - - cache.store_mtime(unicode_path, 1000.0) - loaded = cache.load_mtime(unicode_path) - - assert loaded == 1000.0 - - def test_load_corrupted_cache_file(self): - """Test loading corrupted cache file returns None.""" - with tempfile.TemporaryDirectory() as tmpdir: - cache = FileCache(cache_path=Path(tmpdir)) - file_path = Path("/test.py") - - # Create a corrupted cache file - cache.store_mtime(file_path, 1000.0) - key = cache._key_for(file_path) - (Path(tmpdir) / key).write_text("not a number") - - # Should return None for corrupted data - loaded = cache.load_mtime(file_path) - assert loaded is None - - -class TestFileCachePersistence: - """Tests for cache persistence across instances.""" - - def test_cache_persists_across_instances(self): - """Test that cache data persists when creating new instance.""" - with tempfile.TemporaryDirectory() as tmpdir: - cache_path = Path(tmpdir) - - # Store with first instance - cache1 = FileCache(cache_path=cache_path) - cache1.store_mtime(Path("/test.py"), 1234.0) - - # Load with second instance - cache2 
= FileCache(cache_path=cache_path) - loaded = cache2.load_mtime(Path("/test.py")) - - assert loaded == 1234.0 diff --git a/codex-lens/tests/test_global_graph_expander.py b/codex-lens/tests/test_global_graph_expander.py deleted file mode 100644 index 37fa9371..00000000 --- a/codex-lens/tests/test_global_graph_expander.py +++ /dev/null @@ -1,323 +0,0 @@ -"""Tests for GlobalGraphExpander.""" - -import tempfile -from pathlib import Path - -import pytest - -from codexlens.entities import ( - CodeRelationship, - RelationshipType, - SearchResult, - Symbol, -) -from codexlens.search.global_graph_expander import ( - DECAY_FACTORS, - DEFAULT_DECAY, - GlobalGraphExpander, -) -from codexlens.storage.global_index import GlobalSymbolIndex - - -@pytest.fixture() -def temp_dir(): - tmpdir = tempfile.TemporaryDirectory(ignore_cleanup_errors=True) - yield Path(tmpdir.name) - try: - tmpdir.cleanup() - except (PermissionError, OSError): - pass - - -def _setup_global_index(root: Path) -> GlobalSymbolIndex: - """Create a GlobalSymbolIndex with test symbols and relationships.""" - db_path = root / "test_global.db" - gsi = GlobalSymbolIndex(db_path, project_id=1) - gsi.initialize() - - # Files in different directories (cross-directory scenario) - file_a = str((root / "pkg_a" / "module_a.py").resolve()) - file_b = str((root / "pkg_b" / "module_b.py").resolve()) - file_c = str((root / "pkg_c" / "module_c.py").resolve()) - index_path = str((root / "indexes" / "_index.db").resolve()) - - symbols_a = [ - Symbol(name="ClassA", kind="class", range=(1, 20), file=file_a), - Symbol(name="func_a", kind="function", range=(22, 30), file=file_a), - ] - symbols_b = [ - Symbol(name="ClassB", kind="class", range=(1, 15), file=file_b), - ] - symbols_c = [ - Symbol(name="helper_c", kind="function", range=(1, 10), file=file_c), - ] - - gsi.update_file_symbols(file_a, symbols_a, index_path=index_path) - gsi.update_file_symbols(file_b, symbols_b, index_path=index_path) - gsi.update_file_symbols(file_c, 
symbols_c, index_path=index_path) - - # Relationships: - # ClassA --imports--> ClassB (cross-directory) - # ClassA --calls--> helper_c (cross-directory) - # ClassB --inherits--> ClassA (cross-directory) - relationships_a = [ - CodeRelationship( - source_symbol="ClassA", - target_symbol="ClassB", - relationship_type=RelationshipType.IMPORTS, - source_file=file_a, - target_file=file_b, - source_line=2, - ), - CodeRelationship( - source_symbol="ClassA", - target_symbol="helper_c", - relationship_type=RelationshipType.CALL, - source_file=file_a, - target_file=file_c, - source_line=10, - ), - ] - relationships_b = [ - CodeRelationship( - source_symbol="ClassB", - target_symbol="ClassA", - relationship_type=RelationshipType.INHERITS, - source_file=file_b, - target_file=file_a, - source_line=1, - ), - ] - - gsi.update_file_relationships(file_a, relationships_a) - gsi.update_file_relationships(file_b, relationships_b) - - return gsi - - -def test_expand_returns_related_results(temp_dir: Path) -> None: - """expand() should return related symbols from global relationships.""" - gsi = _setup_global_index(temp_dir) - try: - expander = GlobalGraphExpander(gsi) - - file_a = str((temp_dir / "pkg_a" / "module_a.py").resolve()) - base_results = [ - SearchResult( - path=file_a, - score=1.0, - excerpt=None, - content=None, - start_line=1, - end_line=20, - symbol_name="ClassA", - symbol_kind="class", - ), - ] - - related = expander.expand(base_results, top_n=10, max_related=50) - - assert len(related) > 0 - # All results should have static_graph source metadata - for r in related: - assert r.metadata.get("source") == "static_graph" - # Should find ClassB and/or helper_c as related symbols - related_symbols = {r.symbol_name for r in related} - assert len(related_symbols) > 0 - finally: - gsi.close() - - -def test_score_decay_by_relationship_type(temp_dir: Path) -> None: - """Score decay factors should be: IMPORTS=0.4, INHERITS=0.5, CALLS=0.3.""" - # Verify the constants - assert 
DECAY_FACTORS["imports"] == 0.4 - assert DECAY_FACTORS["inherits"] == 0.5 - assert DECAY_FACTORS["calls"] == 0.3 - assert DEFAULT_DECAY == 0.3 - - gsi = _setup_global_index(temp_dir) - try: - expander = GlobalGraphExpander(gsi) - - file_a = str((temp_dir / "pkg_a" / "module_a.py").resolve()) - base_results = [ - SearchResult( - path=file_a, - score=1.0, - excerpt=None, - content=None, - start_line=1, - end_line=20, - symbol_name="ClassA", - symbol_kind="class", - ), - ] - - related = expander.expand(base_results, top_n=10, max_related=50) - - # Check that scores use decay factors - for r in related: - rel_type = r.metadata.get("relationship_type") - if rel_type: - expected_decay = DECAY_FACTORS.get(rel_type, DEFAULT_DECAY) - # Score should be base_score * decay (possibly * 0.8 for unresolved) - assert r.score <= 1.0 * expected_decay + 0.01 - assert r.score > 0.0 - finally: - gsi.close() - - -def test_expand_with_no_relationships_returns_empty(temp_dir: Path) -> None: - """expand() should return empty list when no relationships exist.""" - db_path = temp_dir / "empty_global.db" - gsi = GlobalSymbolIndex(db_path, project_id=1) - gsi.initialize() - - try: - # Add a symbol but no relationships - file_x = str((temp_dir / "isolated.py").resolve()) - index_path = str((temp_dir / "idx.db").resolve()) - gsi.update_file_symbols( - file_x, - [Symbol(name="IsolatedFunc", kind="function", range=(1, 5), file=file_x)], - index_path=index_path, - ) - - expander = GlobalGraphExpander(gsi) - base_results = [ - SearchResult( - path=file_x, - score=0.9, - excerpt=None, - content=None, - start_line=1, - end_line=5, - symbol_name="IsolatedFunc", - symbol_kind="function", - ), - ] - - related = expander.expand(base_results, top_n=10, max_related=50) - assert related == [] - finally: - gsi.close() - - -def test_expand_deduplicates_against_input(temp_dir: Path) -> None: - """expand() should not include results already present in input.""" - gsi = _setup_global_index(temp_dir) - try: - 
expander = GlobalGraphExpander(gsi) - - file_a = str((temp_dir / "pkg_a" / "module_a.py").resolve()) - file_b = str((temp_dir / "pkg_b" / "module_b.py").resolve()) - - # Include both ClassA and ClassB in input - ClassB should be deduplicated - base_results = [ - SearchResult( - path=file_a, - score=1.0, - excerpt=None, - content=None, - start_line=1, - end_line=20, - symbol_name="ClassA", - symbol_kind="class", - ), - SearchResult( - path=file_b, - score=0.8, - excerpt=None, - content=None, - start_line=1, - end_line=15, - symbol_name="ClassB", - symbol_kind="class", - ), - ] - - related = expander.expand(base_results, top_n=10, max_related=50) - - # No related result should match (path, symbol_name, start_line) - # of any input result - input_keys = {(r.path, r.symbol_name, r.start_line) for r in base_results} - for r in related: - assert (r.path, r.symbol_name, r.start_line) not in input_keys - finally: - gsi.close() - - -def test_resolve_target_with_double_colon_format(temp_dir: Path) -> None: - """_resolve_target_to_file should handle 'file_path::symbol_name' format.""" - gsi = _setup_global_index(temp_dir) - try: - expander = GlobalGraphExpander(gsi) - - file_b = str((temp_dir / "pkg_b" / "module_b.py").resolve()) - target_qname = f"{file_b}::ClassB" - - result = expander._resolve_target_to_file(target_qname) - assert result is not None - resolved_file, start_line, end_line = result - assert resolved_file == file_b - # ClassB is at range (1, 15) - assert start_line == 1 - assert end_line == 15 - finally: - gsi.close() - - -def test_resolve_target_with_dot_notation(temp_dir: Path) -> None: - """_resolve_target_to_file should handle 'module.ClassName' dot notation.""" - gsi = _setup_global_index(temp_dir) - try: - expander = GlobalGraphExpander(gsi) - - # "pkg.ClassB" - leaf name "ClassB" should be found via search - result = expander._resolve_target_to_file("pkg.ClassB") - assert result is not None - resolved_file, start_line, end_line = result - # Should 
resolve to ClassB's file - file_b = str((temp_dir / "pkg_b" / "module_b.py").resolve()) - assert resolved_file == file_b - assert start_line == 1 - assert end_line == 15 - finally: - gsi.close() - - -def test_expand_empty_results_returns_empty(temp_dir: Path) -> None: - """expand() with empty input should return empty list.""" - db_path = temp_dir / "empty.db" - gsi = GlobalSymbolIndex(db_path, project_id=1) - gsi.initialize() - try: - expander = GlobalGraphExpander(gsi) - assert expander.expand([]) == [] - finally: - gsi.close() - - -def test_expand_results_without_symbol_names_returns_empty(temp_dir: Path) -> None: - """expand() should skip results without symbol_name.""" - db_path = temp_dir / "nosym.db" - gsi = GlobalSymbolIndex(db_path, project_id=1) - gsi.initialize() - try: - expander = GlobalGraphExpander(gsi) - base_results = [ - SearchResult( - path="/some/file.py", - score=1.0, - excerpt="some text", - content=None, - start_line=1, - end_line=5, - symbol_name=None, - symbol_kind=None, - ), - ] - assert expander.expand(base_results) == [] - finally: - gsi.close() diff --git a/codex-lens/tests/test_global_index.py b/codex-lens/tests/test_global_index.py deleted file mode 100644 index b548cfa2..00000000 --- a/codex-lens/tests/test_global_index.py +++ /dev/null @@ -1,293 +0,0 @@ -import sqlite3 -import tempfile -import time -from concurrent.futures import ThreadPoolExecutor -from pathlib import Path -from unittest.mock import MagicMock - -import pytest - -from codexlens.config import Config -from codexlens.entities import Symbol -from codexlens.errors import StorageError -from codexlens.search.chain_search import ChainSearchEngine -from codexlens.storage.global_index import GlobalSymbolIndex -from codexlens.storage.path_mapper import PathMapper -from codexlens.storage.registry import RegistryStore - - -@pytest.fixture() -def temp_paths(): - tmpdir = tempfile.TemporaryDirectory(ignore_cleanup_errors=True) - root = Path(tmpdir.name) - yield root - try: - 
tmpdir.cleanup() - except (PermissionError, OSError): - pass - - -def test_add_symbol(temp_paths: Path): - db_path = temp_paths / "indexes" / "_global_symbols.db" - index_path = temp_paths / "indexes" / "_index.db" - file_path = temp_paths / "src" / "a.py" - - index_path.parent.mkdir(parents=True, exist_ok=True) - index_path.write_text("", encoding="utf-8") - file_path.parent.mkdir(parents=True, exist_ok=True) - file_path.write_text("class AuthManager:\n pass\n", encoding="utf-8") - - with GlobalSymbolIndex(db_path, project_id=1) as store: - store.add_symbol( - Symbol(name="AuthManager", kind="class", range=(1, 2)), - file_path=file_path, - index_path=index_path, - ) - - matches = store.search("AuthManager", kind="class", limit=10, prefix_mode=True) - assert len(matches) == 1 - assert matches[0].name == "AuthManager" - assert matches[0].file == str(file_path.resolve()) - - # Schema version safety: newer schema versions should be rejected. - bad_db = temp_paths / "indexes" / "_global_symbols_bad.db" - bad_db.parent.mkdir(parents=True, exist_ok=True) - conn = sqlite3.connect(bad_db) - conn.execute("PRAGMA user_version = 999") - conn.close() - - with pytest.raises(StorageError): - GlobalSymbolIndex(bad_db, project_id=1).initialize() - - -def test_search_symbols(temp_paths: Path): - db_path = temp_paths / "indexes" / "_global_symbols.db" - index_path = temp_paths / "indexes" / "_index.db" - file_path = temp_paths / "src" / "mod.py" - - index_path.parent.mkdir(parents=True, exist_ok=True) - index_path.write_text("", encoding="utf-8") - file_path.parent.mkdir(parents=True, exist_ok=True) - file_path.write_text("def authenticate():\n pass\n", encoding="utf-8") - - with GlobalSymbolIndex(db_path, project_id=7) as store: - store.add_symbol( - Symbol(name="authenticate", kind="function", range=(1, 2)), - file_path=file_path, - index_path=index_path, - ) - - locations = store.search_symbols("auth", kind="function", limit=10, prefix_mode=True) - assert locations - assert 
any(p.endswith("mod.py") for p, _ in locations) - assert any(rng == (1, 2) for _, rng in locations) - - -def test_update_file_symbols(temp_paths: Path): - db_path = temp_paths / "indexes" / "_global_symbols.db" - file_path = temp_paths / "src" / "mod.py" - index_path = temp_paths / "indexes" / "_index.db" - - file_path.parent.mkdir(parents=True, exist_ok=True) - file_path.write_text("def a():\n pass\n", encoding="utf-8") - index_path.parent.mkdir(parents=True, exist_ok=True) - index_path.write_text("", encoding="utf-8") - - with GlobalSymbolIndex(db_path, project_id=7) as store: - store.update_file_symbols( - file_path=file_path, - symbols=[ - Symbol(name="old_func", kind="function", range=(1, 2)), - Symbol(name="Other", kind="class", range=(10, 20)), - ], - index_path=index_path, - ) - assert any(s.name == "old_func" for s in store.search("old_", prefix_mode=True)) - - store.update_file_symbols( - file_path=file_path, - symbols=[Symbol(name="new_func", kind="function", range=(3, 4))], - index_path=index_path, - ) - assert not any(s.name == "old_func" for s in store.search("old_", prefix_mode=True)) - assert any(s.name == "new_func" for s in store.search("new_", prefix_mode=True)) - - # Backward-compatible path: index_path can be omitted after it's been established. - store.update_file_symbols( - file_path=file_path, - symbols=[Symbol(name="new_func2", kind="function", range=(5, 6))], - index_path=None, - ) - assert any(s.name == "new_func2" for s in store.search("new_func2", prefix_mode=True)) - - # New file + symbols without index_path should raise. 
- missing_index_file = temp_paths / "src" / "new_file.py" - with pytest.raises(StorageError): - store.update_file_symbols( - file_path=missing_index_file, - symbols=[Symbol(name="must_fail", kind="function", range=(1, 1))], - index_path=None, - ) - - deleted = store.delete_file_symbols(file_path) - assert deleted > 0 - - -def test_incremental_updates(temp_paths: Path, monkeypatch): - db_path = temp_paths / "indexes" / "_global_symbols.db" - file_path = temp_paths / "src" / "same.py" - idx_a = temp_paths / "indexes" / "a" / "_index.db" - idx_b = temp_paths / "indexes" / "b" / "_index.db" - - file_path.parent.mkdir(parents=True, exist_ok=True) - file_path.write_text("class AuthManager:\n pass\n", encoding="utf-8") - idx_a.parent.mkdir(parents=True, exist_ok=True) - idx_a.write_text("", encoding="utf-8") - idx_b.parent.mkdir(parents=True, exist_ok=True) - idx_b.write_text("", encoding="utf-8") - - with GlobalSymbolIndex(db_path, project_id=42) as store: - sym = Symbol(name="AuthManager", kind="class", range=(1, 2)) - store.add_symbol(sym, file_path=file_path, index_path=idx_a) - store.add_symbol(sym, file_path=file_path, index_path=idx_b) - - # prefix_mode=False exercises substring matching. - assert store.search("Manager", prefix_mode=False) - - conn = sqlite3.connect(db_path) - row = conn.execute( - """ - SELECT index_path - FROM global_symbols - WHERE project_id=? AND symbol_name=? AND symbol_kind=? AND file_path=? - """, - (42, "AuthManager", "class", str(file_path.resolve())), - ).fetchone() - conn.close() - - assert row is not None - assert str(Path(row[0]).resolve()) == str(idx_b.resolve()) - - # Migration path coverage: simulate a future schema version and an older DB version. 
- migrating_db = temp_paths / "indexes" / "_global_symbols_migrate.db" - migrating_db.parent.mkdir(parents=True, exist_ok=True) - conn = sqlite3.connect(migrating_db) - conn.execute("PRAGMA user_version = 1") - conn.close() - - monkeypatch.setattr(GlobalSymbolIndex, "SCHEMA_VERSION", 2) - GlobalSymbolIndex(migrating_db, project_id=1).initialize() - - -def test_concurrent_access(temp_paths: Path): - db_path = temp_paths / "indexes" / "_global_symbols.db" - index_path = temp_paths / "indexes" / "_index.db" - file_path = temp_paths / "src" / "a.py" - - index_path.parent.mkdir(parents=True, exist_ok=True) - index_path.write_text("", encoding="utf-8") - file_path.parent.mkdir(parents=True, exist_ok=True) - file_path.write_text("class A:\n pass\n", encoding="utf-8") - - with GlobalSymbolIndex(db_path, project_id=1) as store: - def add_many(worker_id: int): - for i in range(50): - store.add_symbol( - Symbol(name=f"Sym{worker_id}_{i}", kind="class", range=(1, 2)), - file_path=file_path, - index_path=index_path, - ) - - with ThreadPoolExecutor(max_workers=8) as ex: - list(ex.map(add_many, range(8))) - - matches = store.search("Sym", kind="class", limit=1000, prefix_mode=True) - assert len(matches) >= 200 - - -def test_chain_search_integration(temp_paths: Path): - project_root = temp_paths / "project" - project_root.mkdir(parents=True, exist_ok=True) - - index_root = temp_paths / "indexes" - mapper = PathMapper(index_root=index_root) - index_db_path = mapper.source_to_index_db(project_root) - index_db_path.parent.mkdir(parents=True, exist_ok=True) - index_db_path.write_text("", encoding="utf-8") - - registry = RegistryStore(db_path=temp_paths / "registry.db") - registry.initialize() - project_info = registry.register_project(project_root, mapper.source_to_index_dir(project_root)) - - global_db_path = project_info.index_root / GlobalSymbolIndex.DEFAULT_DB_NAME - with GlobalSymbolIndex(global_db_path, project_id=project_info.id) as global_index: - file_path = project_root / 
"auth.py" - global_index.update_file_symbols( - file_path=file_path, - symbols=[ - Symbol(name="AuthManager", kind="class", range=(1, 10)), - Symbol(name="authenticate", kind="function", range=(12, 20)), - ], - index_path=index_db_path, - ) - - config = Config(data_dir=temp_paths / "data", global_symbol_index_enabled=True) - engine = ChainSearchEngine(registry, mapper, config=config) - engine._search_symbols_parallel = MagicMock(side_effect=AssertionError("should not traverse chain")) - - symbols = engine.search_symbols("Auth", project_root) - assert any(s.name == "AuthManager" for s in symbols) - registry.close() - - -def test_disabled_fallback(temp_paths: Path): - project_root = temp_paths / "project" - project_root.mkdir(parents=True, exist_ok=True) - - index_root = temp_paths / "indexes" - mapper = PathMapper(index_root=index_root) - index_db_path = mapper.source_to_index_db(project_root) - index_db_path.parent.mkdir(parents=True, exist_ok=True) - index_db_path.write_text("", encoding="utf-8") - - registry = RegistryStore(db_path=temp_paths / "registry.db") - registry.initialize() - registry.register_project(project_root, mapper.source_to_index_dir(project_root)) - - config = Config(data_dir=temp_paths / "data", global_symbol_index_enabled=False) - engine = ChainSearchEngine(registry, mapper, config=config) - engine._collect_index_paths = MagicMock(return_value=[index_db_path]) - engine._search_symbols_parallel = MagicMock( - return_value=[Symbol(name="FallbackSymbol", kind="function", range=(1, 2))] - ) - - symbols = engine.search_symbols("Fallback", project_root) - assert any(s.name == "FallbackSymbol" for s in symbols) - assert engine._search_symbols_parallel.called - registry.close() - - -def test_performance_benchmark(temp_paths: Path): - db_path = temp_paths / "indexes" / "_global_symbols.db" - index_path = temp_paths / "indexes" / "_index.db" - file_path = temp_paths / "src" / "perf.py" - - file_path.parent.mkdir(parents=True, exist_ok=True) - 
file_path.write_text("class AuthManager:\n pass\n", encoding="utf-8") - index_path.parent.mkdir(parents=True, exist_ok=True) - index_path.write_text("", encoding="utf-8") - - with GlobalSymbolIndex(db_path, project_id=1) as store: - for i in range(500): - store.add_symbol( - Symbol(name=f"AuthManager{i}", kind="class", range=(1, 2)), - file_path=file_path, - index_path=index_path, - ) - - start = time.perf_counter() - results = store.search("AuthManager", kind="class", limit=50, prefix_mode=True) - elapsed_ms = (time.perf_counter() - start) * 1000 - - assert elapsed_ms < 100.0 - assert results diff --git a/codex-lens/tests/test_global_relationships.py b/codex-lens/tests/test_global_relationships.py deleted file mode 100644 index 43da4c1b..00000000 --- a/codex-lens/tests/test_global_relationships.py +++ /dev/null @@ -1,507 +0,0 @@ -"""Tests for global_relationships table in GlobalSymbolIndex.""" - -import sqlite3 -import tempfile -import time -from pathlib import Path - -import pytest - -from codexlens.entities import CodeRelationship, RelationshipType -from codexlens.storage.global_index import GlobalSymbolIndex - - -@pytest.fixture() -def temp_paths(): - tmpdir = tempfile.TemporaryDirectory(ignore_cleanup_errors=True) - root = Path(tmpdir.name) - yield root - try: - tmpdir.cleanup() - except (PermissionError, OSError): - pass - - -def _make_rel( - source_symbol: str, - target_symbol: str, - rel_type: RelationshipType = RelationshipType.CALL, - source_file: str = "src/a.py", - target_file: str | None = None, - source_line: int = 1, -) -> CodeRelationship: - return CodeRelationship( - source_symbol=source_symbol, - target_symbol=target_symbol, - relationship_type=rel_type, - source_file=source_file, - target_file=target_file, - source_line=source_line, - ) - - -# ------------------------------------------------------------------ -# Schema creation (fresh DB) -# ------------------------------------------------------------------ - - -def 
test_fresh_schema_creates_relationships_table(temp_paths: Path): - """New DB at SCHEMA_VERSION=2 should have global_relationships table.""" - db_path = temp_paths / "indexes" / "_global_symbols.db" - - with GlobalSymbolIndex(db_path, project_id=1) as store: - conn = store._get_connection() - tables = { - row[0] - for row in conn.execute( - "SELECT name FROM sqlite_master WHERE type='table'" - ).fetchall() - } - assert "global_relationships" in tables - assert "global_symbols" in tables - - # Verify indexes exist - indexes = { - row[0] - for row in conn.execute( - "SELECT name FROM sqlite_master WHERE type='index'" - ).fetchall() - } - assert "idx_global_rel_project_target" in indexes - assert "idx_global_rel_project_source" in indexes - - -def test_schema_version_is_2(temp_paths: Path): - db_path = temp_paths / "indexes" / "_global_symbols.db" - with GlobalSymbolIndex(db_path, project_id=1) as store: - conn = store._get_connection() - version = conn.execute("PRAGMA user_version").fetchone()[0] - assert version == 2 - - -# ------------------------------------------------------------------ -# Migration v1 -> v2 -# ------------------------------------------------------------------ - - -def test_migration_v1_to_v2(temp_paths: Path): - """A v1 database should gain the global_relationships table on upgrade.""" - db_path = temp_paths / "indexes" / "_global_symbols.db" - db_path.parent.mkdir(parents=True, exist_ok=True) - - # Simulate a v1 database: create global_symbols table + set version=1. 
- conn = sqlite3.connect(str(db_path)) - conn.execute( - """ - CREATE TABLE IF NOT EXISTS global_symbols ( - id INTEGER PRIMARY KEY, - project_id INTEGER NOT NULL, - symbol_name TEXT NOT NULL, - symbol_kind TEXT NOT NULL, - file_path TEXT NOT NULL, - start_line INTEGER, - end_line INTEGER, - index_path TEXT NOT NULL, - UNIQUE(project_id, symbol_name, symbol_kind, file_path, start_line, end_line) - ) - """ - ) - conn.execute("PRAGMA user_version = 1") - conn.commit() - conn.close() - - # Now open with the new code -- migration should fire. - with GlobalSymbolIndex(db_path, project_id=1) as store: - conn = store._get_connection() - version = conn.execute("PRAGMA user_version").fetchone()[0] - assert version == 2 - - tables = { - row[0] - for row in conn.execute( - "SELECT name FROM sqlite_master WHERE type='table'" - ).fetchall() - } - assert "global_relationships" in tables - - -def test_migration_idempotent(temp_paths: Path): - """Running migration twice should not fail (CREATE TABLE IF NOT EXISTS).""" - db_path = temp_paths / "indexes" / "_global_symbols.db" - - # First init - store = GlobalSymbolIndex(db_path, project_id=1) - store.initialize() - store.close() - - # Second init on same DB -- should be a no-op. 
- store2 = GlobalSymbolIndex(db_path, project_id=1) - store2.initialize() - store2.close() - - -# ------------------------------------------------------------------ -# update_file_relationships -# ------------------------------------------------------------------ - - -def test_update_file_relationships_insert(temp_paths: Path): - db_path = temp_paths / "indexes" / "_global_symbols.db" - file_path = temp_paths / "src" / "auth.py" - file_path.parent.mkdir(parents=True, exist_ok=True) - file_path.write_text("", encoding="utf-8") - - rels = [ - _make_rel("login", "validate_token", source_file="src/auth.py", source_line=10), - _make_rel("login", "hash_password", source_file="src/auth.py", source_line=15), - _make_rel("AuthManager", "BaseManager", RelationshipType.INHERITS, "src/auth.py", source_line=1), - ] - - with GlobalSymbolIndex(db_path, project_id=1) as store: - store.update_file_relationships(file_path, rels) - - # Verify rows exist - conn = store._get_connection() - count = conn.execute( - "SELECT COUNT(*) FROM global_relationships WHERE project_id=1" - ).fetchone()[0] - assert count == 3 - - -def test_update_file_relationships_replaces_atomically(temp_paths: Path): - """Second call should delete old rows and insert new ones.""" - db_path = temp_paths / "indexes" / "_global_symbols.db" - file_path = temp_paths / "src" / "mod.py" - file_path.parent.mkdir(parents=True, exist_ok=True) - file_path.write_text("", encoding="utf-8") - - old_rels = [_make_rel("foo", "bar", source_file="src/mod.py", source_line=5)] - new_rels = [ - _make_rel("baz", "qux", source_file="src/mod.py", source_line=10), - _make_rel("baz", "quux", source_file="src/mod.py", source_line=11), - ] - - with GlobalSymbolIndex(db_path, project_id=1) as store: - store.update_file_relationships(file_path, old_rels) - store.update_file_relationships(file_path, new_rels) - - conn = store._get_connection() - rows = conn.execute( - "SELECT source_symbol FROM global_relationships WHERE project_id=1 ORDER BY 
source_line" - ).fetchall() - names = [r[0] for r in rows] - assert "foo" not in names - assert "baz" in names - assert len(rows) == 2 - - -def test_update_file_relationships_empty_clears(temp_paths: Path): - """Passing empty list should delete all relationships for the file.""" - db_path = temp_paths / "indexes" / "_global_symbols.db" - file_path = temp_paths / "src" / "x.py" - file_path.parent.mkdir(parents=True, exist_ok=True) - file_path.write_text("", encoding="utf-8") - - with GlobalSymbolIndex(db_path, project_id=1) as store: - store.update_file_relationships( - file_path, - [_make_rel("a", "b", source_file="src/x.py")], - ) - store.update_file_relationships(file_path, []) - - conn = store._get_connection() - count = conn.execute( - "SELECT COUNT(*) FROM global_relationships WHERE project_id=1" - ).fetchone()[0] - assert count == 0 - - -# ------------------------------------------------------------------ -# query_by_target -# ------------------------------------------------------------------ - - -def test_query_by_target_exact(temp_paths: Path): - db_path = temp_paths / "indexes" / "_global_symbols.db" - file_path = temp_paths / "src" / "a.py" - file_path.parent.mkdir(parents=True, exist_ok=True) - file_path.write_text("", encoding="utf-8") - - rels = [ - _make_rel("caller", "TargetClass", source_file="src/a.py", source_line=10), - _make_rel("caller2", "TargetClassExtra", source_file="src/a.py", source_line=20), - ] - - with GlobalSymbolIndex(db_path, project_id=1) as store: - store.update_file_relationships(file_path, rels) - - # Exact match - results = store.query_by_target("TargetClass", prefix_mode=False) - assert len(results) == 1 - src_file, src_sym, rel_type, line = results[0] - assert src_sym == "caller" - assert rel_type == "calls" - assert line == 10 - - -def test_query_by_target_prefix(temp_paths: Path): - db_path = temp_paths / "indexes" / "_global_symbols.db" - file_path = temp_paths / "src" / "a.py" - file_path.parent.mkdir(parents=True, 
exist_ok=True) - file_path.write_text("", encoding="utf-8") - - rels = [ - _make_rel("c1", "TargetClass", source_file="src/a.py", source_line=10), - _make_rel("c2", "TargetClassExtra", source_file="src/a.py", source_line=20), - _make_rel("c3", "Unrelated", source_file="src/a.py", source_line=30), - ] - - with GlobalSymbolIndex(db_path, project_id=1) as store: - store.update_file_relationships(file_path, rels) - - # Prefix match should return both Target* rows - results = store.query_by_target("TargetClass", prefix_mode=True) - assert len(results) == 2 - symbols = {r[1] for r in results} - assert symbols == {"c1", "c2"} - - -def test_query_by_target_cross_directory(temp_paths: Path): - """Relationships from different files can be queried by the same target.""" - db_path = temp_paths / "indexes" / "_global_symbols.db" - file_a = temp_paths / "src" / "a.py" - file_b = temp_paths / "lib" / "b.py" - for f in (file_a, file_b): - f.parent.mkdir(parents=True, exist_ok=True) - f.write_text("", encoding="utf-8") - - with GlobalSymbolIndex(db_path, project_id=1) as store: - store.update_file_relationships( - file_a, - [_make_rel("funcA", "SharedTarget", source_file="src/a.py", source_line=5)], - ) - store.update_file_relationships( - file_b, - [_make_rel("funcB", "SharedTarget", source_file="lib/b.py", source_line=8)], - ) - - results = store.query_by_target("SharedTarget", prefix_mode=False) - assert len(results) == 2 - files = {r[0] for r in results} - assert str(file_a.resolve()) in files - assert str(file_b.resolve()) in files - - -# ------------------------------------------------------------------ -# query_relationships_for_symbols -# ------------------------------------------------------------------ - - -def test_query_relationships_for_symbols_source_match(temp_paths: Path): - db_path = temp_paths / "indexes" / "_global_symbols.db" - file_path = temp_paths / "src" / "mod.py" - file_path.parent.mkdir(parents=True, exist_ok=True) - file_path.write_text("", 
encoding="utf-8") - - rels = [ - _make_rel("MyClass", "BaseClass", RelationshipType.INHERITS, "src/mod.py", source_line=1), - _make_rel("helper", "utils", RelationshipType.IMPORTS, "src/mod.py", source_line=2), - ] - - with GlobalSymbolIndex(db_path, project_id=1) as store: - store.update_file_relationships(file_path, rels) - - # Query by source_symbol name - rows = store.query_relationships_for_symbols(["MyClass"]) - assert len(rows) >= 1 - assert any(r["source_symbol"] == "MyClass" for r in rows) - - -def test_query_relationships_for_symbols_target_match(temp_paths: Path): - db_path = temp_paths / "indexes" / "_global_symbols.db" - file_path = temp_paths / "src" / "mod.py" - file_path.parent.mkdir(parents=True, exist_ok=True) - file_path.write_text("", encoding="utf-8") - - rels = [ - _make_rel("caller", "TargetFunc", source_file="src/mod.py", source_line=5), - ] - - with GlobalSymbolIndex(db_path, project_id=1) as store: - store.update_file_relationships(file_path, rels) - - # Query by target name -- should match via LIKE %TargetFunc - rows = store.query_relationships_for_symbols(["TargetFunc"]) - assert len(rows) >= 1 - assert any(r["target_qualified_name"] == "TargetFunc" for r in rows) - - -def test_query_relationships_for_symbols_empty_list(temp_paths: Path): - db_path = temp_paths / "indexes" / "_global_symbols.db" - - with GlobalSymbolIndex(db_path, project_id=1) as store: - rows = store.query_relationships_for_symbols([]) - assert rows == [] - - -def test_query_relationships_for_symbols_qualified_target(temp_paths: Path): - """A qualified target like 'lib/b.py::BaseClass' should still match 'BaseClass'.""" - db_path = temp_paths / "indexes" / "_global_symbols.db" - file_path = temp_paths / "src" / "a.py" - file_path.parent.mkdir(parents=True, exist_ok=True) - file_path.write_text("", encoding="utf-8") - - rel = CodeRelationship( - source_symbol="Child", - target_symbol="BaseClass", - relationship_type=RelationshipType.INHERITS, - source_file="src/a.py", - 
target_file="lib/b.py", - source_line=1, - ) - - with GlobalSymbolIndex(db_path, project_id=1) as store: - store.update_file_relationships(file_path, [rel]) - - # The qualified name is "lib/b.py::BaseClass" - # query_relationships_for_symbols uses LIKE %BaseClass which should match - rows = store.query_relationships_for_symbols(["BaseClass"]) - assert len(rows) == 1 - assert rows[0]["target_qualified_name"] == "lib/b.py::BaseClass" - - -# ------------------------------------------------------------------ -# delete_file_relationships -# ------------------------------------------------------------------ - - -def test_delete_file_relationships(temp_paths: Path): - db_path = temp_paths / "indexes" / "_global_symbols.db" - file_path = temp_paths / "src" / "a.py" - file_path.parent.mkdir(parents=True, exist_ok=True) - file_path.write_text("", encoding="utf-8") - - with GlobalSymbolIndex(db_path, project_id=1) as store: - store.update_file_relationships( - file_path, - [ - _make_rel("f1", "t1", source_file="src/a.py", source_line=1), - _make_rel("f2", "t2", source_file="src/a.py", source_line=2), - ], - ) - - deleted = store.delete_file_relationships(file_path) - assert deleted == 2 - - conn = store._get_connection() - count = conn.execute( - "SELECT COUNT(*) FROM global_relationships WHERE project_id=1" - ).fetchone()[0] - assert count == 0 - - -def test_delete_file_relationships_no_rows(temp_paths: Path): - db_path = temp_paths / "indexes" / "_global_symbols.db" - nonexistent = temp_paths / "src" / "nope.py" - - with GlobalSymbolIndex(db_path, project_id=1) as store: - deleted = store.delete_file_relationships(nonexistent) - assert deleted == 0 - - -# ------------------------------------------------------------------ -# Project isolation -# ------------------------------------------------------------------ - - -def test_project_isolation(temp_paths: Path): - """Relationships from different project_ids should not leak.""" - db_path = temp_paths / "indexes" / 
"_global_symbols.db" - file_path = temp_paths / "src" / "a.py" - file_path.parent.mkdir(parents=True, exist_ok=True) - file_path.write_text("", encoding="utf-8") - - store1 = GlobalSymbolIndex(db_path, project_id=1) - store1.initialize() - store2 = GlobalSymbolIndex(db_path, project_id=2) - # store2 reuses the same DB; schema already created. - - store1.update_file_relationships( - file_path, - [_make_rel("a", "SharedTarget", source_file="src/a.py")], - ) - store2.update_file_relationships( - file_path, - [_make_rel("b", "SharedTarget", source_file="src/a.py")], - ) - - results1 = store1.query_by_target("SharedTarget", prefix_mode=False) - results2 = store2.query_by_target("SharedTarget", prefix_mode=False) - assert len(results1) == 1 - assert results1[0][1] == "a" - assert len(results2) == 1 - assert results2[0][1] == "b" - - store1.close() - store2.close() - - -# ------------------------------------------------------------------ -# Performance benchmarks -# ------------------------------------------------------------------ - - -def test_update_file_relationships_100_rows_under_50ms(temp_paths: Path): - """Batch insert of 100 relationships should complete in < 50ms.""" - db_path = temp_paths / "indexes" / "_global_symbols.db" - file_path = temp_paths / "src" / "perf.py" - file_path.parent.mkdir(parents=True, exist_ok=True) - file_path.write_text("", encoding="utf-8") - - rels = [ - _make_rel(f"src_{i}", f"tgt_{i}", source_file="src/perf.py", source_line=i + 1) - for i in range(100) - ] - - with GlobalSymbolIndex(db_path, project_id=1) as store: - start = time.perf_counter() - store.update_file_relationships(file_path, rels) - elapsed_ms = (time.perf_counter() - start) * 1000 - assert elapsed_ms < 50.0, f"Took {elapsed_ms:.1f}ms, expected < 50ms" - - -def test_query_by_target_exact_under_5ms(temp_paths: Path): - """Exact-match query should complete in < 5ms with 500 rows.""" - db_path = temp_paths / "indexes" / "_global_symbols.db" - file_path = temp_paths / "src" 
/ "perf.py" - file_path.parent.mkdir(parents=True, exist_ok=True) - file_path.write_text("", encoding="utf-8") - - rels = [ - _make_rel(f"src_{i}", f"Target_{i}", source_file="src/perf.py", source_line=i + 1) - for i in range(500) - ] - - with GlobalSymbolIndex(db_path, project_id=1) as store: - store.update_file_relationships(file_path, rels) - - start = time.perf_counter() - results = store.query_by_target("Target_250", prefix_mode=False) - elapsed_ms = (time.perf_counter() - start) * 1000 - assert elapsed_ms < 5.0, f"Took {elapsed_ms:.1f}ms, expected < 5ms" - assert len(results) == 1 - - -# ------------------------------------------------------------------ -# _build_qualified_name -# ------------------------------------------------------------------ - - -def test_build_qualified_name_with_target_file(): - rel = _make_rel("src", "tgt", target_file="lib/utils.py") - assert GlobalSymbolIndex._build_qualified_name(rel) == "lib/utils.py::tgt" - - -def test_build_qualified_name_without_target_file(): - rel = _make_rel("src", "tgt", target_file=None) - assert GlobalSymbolIndex._build_qualified_name(rel) == "tgt" diff --git a/codex-lens/tests/test_global_symbol_index.py b/codex-lens/tests/test_global_symbol_index.py deleted file mode 100644 index b82d708b..00000000 --- a/codex-lens/tests/test_global_symbol_index.py +++ /dev/null @@ -1,192 +0,0 @@ -import sqlite3 -import tempfile -import time -from pathlib import Path -from unittest.mock import MagicMock - -import pytest - -from codexlens.config import Config -from codexlens.entities import Symbol -from codexlens.search.chain_search import ChainSearchEngine -from codexlens.storage.dir_index import DirIndexStore -from codexlens.storage.global_index import GlobalSymbolIndex -from codexlens.storage.path_mapper import PathMapper -from codexlens.storage.registry import RegistryStore - - -@pytest.fixture() -def temp_paths(): - tmpdir = tempfile.TemporaryDirectory(ignore_cleanup_errors=True) - root = Path(tmpdir.name) - yield 
root - try: - tmpdir.cleanup() - except (PermissionError, OSError): - pass - - -def test_global_symbol_index_add_and_search_under_50ms(temp_paths: Path): - db_path = temp_paths / "indexes" / "_global_symbols.db" - file_path = temp_paths / "src" / "a.py" - index_path = temp_paths / "indexes" / "_index.db" - - file_path.parent.mkdir(parents=True, exist_ok=True) - file_path.write_text("class AuthManager:\n pass\n", encoding="utf-8") - index_path.parent.mkdir(parents=True, exist_ok=True) - index_path.write_text("", encoding="utf-8") - - store = GlobalSymbolIndex(db_path, project_id=1) - store.initialize() - - # Insert enough rows to ensure index usage, still small enough to be fast. - for i in range(200): - store.add_symbol( - Symbol(name=f"AuthManager{i}", kind="class", range=(1, 2)), - file_path=file_path, - index_path=index_path, - ) - - start = time.perf_counter() - results = store.search("AuthManager", kind="class", limit=50, prefix_mode=True) - elapsed_ms = (time.perf_counter() - start) * 1000 - - assert elapsed_ms < 50.0 - assert len(results) >= 1 - assert all(r.kind == "class" for r in results) - assert all((r.file or "").endswith("a.py") for r in results) - - locations = store.search_symbols("AuthManager", kind="class", limit=50, prefix_mode=True) - assert locations - assert all(isinstance(p, str) and isinstance(rng, tuple) for p, rng in locations) - - -def test_update_file_symbols_replaces_atomically(temp_paths: Path): - db_path = temp_paths / "indexes" / "_global_symbols.db" - file_path = temp_paths / "src" / "mod.py" - index_path = temp_paths / "indexes" / "_index.db" - - file_path.parent.mkdir(parents=True, exist_ok=True) - file_path.write_text("def a():\n pass\n", encoding="utf-8") - index_path.parent.mkdir(parents=True, exist_ok=True) - index_path.write_text("", encoding="utf-8") - - store = GlobalSymbolIndex(db_path, project_id=7) - store.initialize() - - store.update_file_symbols( - file_path=file_path, - symbols=[ - Symbol(name="old_func", 
kind="function", range=(1, 2)), - Symbol(name="Other", kind="class", range=(10, 20)), - ], - index_path=index_path, - ) - - assert any(s.name == "old_func" for s in store.search("old_", prefix_mode=True)) - - # Replace with new set (delete + insert) - store.update_file_symbols( - file_path=file_path, - symbols=[Symbol(name="new_func", kind="function", range=(3, 4))], - index_path=index_path, - ) - - assert not any(s.name == "old_func" for s in store.search("old_", prefix_mode=True)) - assert any(s.name == "new_func" for s in store.search("new_", prefix_mode=True)) - - # Backward-compatible path: omit index_path after it has been established. - store.update_file_symbols( - file_path=file_path, - symbols=[Symbol(name="new_func2", kind="function", range=(5, 6))], - index_path=None, - ) - assert any(s.name == "new_func2" for s in store.search("new_func2", prefix_mode=True)) - - -def test_dir_index_store_updates_global_index_when_enabled(temp_paths: Path): - config = Config(data_dir=temp_paths / "data") - - index_db_path = temp_paths / "indexes" / "proj" / "_index.db" - global_db_path = temp_paths / "indexes" / "proj" / GlobalSymbolIndex.DEFAULT_DB_NAME - source_file = temp_paths / "src" / "x.py" - - source_file.parent.mkdir(parents=True, exist_ok=True) - source_file.write_text("class MyClass:\n pass\n", encoding="utf-8") - - global_index = GlobalSymbolIndex(global_db_path, project_id=123) - global_index.initialize() - - with DirIndexStore(index_db_path, config=config, global_index=global_index) as store: - store.add_file( - name=source_file.name, - full_path=source_file, - content=source_file.read_text(encoding="utf-8"), - language="python", - symbols=[Symbol(name="MyClass", kind="class", range=(1, 2))], - ) - - matches = global_index.search("MyClass", kind="class", limit=10) - assert len(matches) == 1 - assert matches[0].file == str(source_file.resolve()) - - # Verify all required fields were written. 
- conn = sqlite3.connect(global_db_path) - row = conn.execute( - """ - SELECT project_id, symbol_name, symbol_kind, file_path, start_line, end_line, index_path - FROM global_symbols - WHERE project_id=? AND symbol_name=? - """, - (123, "MyClass"), - ).fetchone() - conn.close() - - assert row is not None - assert row[0] == 123 - assert row[1] == "MyClass" - assert row[2] == "class" - assert row[3] == str(source_file.resolve()) - assert row[4] == 1 - assert row[5] == 2 - assert row[6] == str(index_db_path.resolve()) - - -def test_chain_search_uses_global_index_fast_path(temp_paths: Path): - project_root = temp_paths / "project" - project_root.mkdir(parents=True, exist_ok=True) - - index_root = temp_paths / "indexes" - mapper = PathMapper(index_root=index_root) - index_db_path = mapper.source_to_index_db(project_root) - index_db_path.parent.mkdir(parents=True, exist_ok=True) - index_db_path.write_text("", encoding="utf-8") # existence is enough for _find_start_index - - registry = RegistryStore(db_path=temp_paths / "registry.db") - registry.initialize() - project_info = registry.register_project(project_root, mapper.source_to_index_dir(project_root)) - - global_db_path = project_info.index_root / GlobalSymbolIndex.DEFAULT_DB_NAME - global_index = GlobalSymbolIndex(global_db_path, project_id=project_info.id) - global_index.initialize() - - file_path = project_root / "auth.py" - global_index.update_file_symbols( - file_path=file_path, - symbols=[ - Symbol(name="AuthManager", kind="class", range=(1, 10)), - Symbol(name="authenticate", kind="function", range=(12, 20)), - ], - index_path=index_db_path, - ) - - config = Config(data_dir=temp_paths / "data", global_symbol_index_enabled=True) - engine = ChainSearchEngine(registry, mapper, config=config) - assert registry.find_by_source_path(str(project_root)) is not None - assert registry.find_by_source_path(str(project_root.resolve())) is not None - assert global_db_path.exists() - assert GlobalSymbolIndex(global_db_path, 
project_id=project_info.id).search("Auth", limit=10) - engine._search_symbols_parallel = MagicMock(side_effect=AssertionError("should not traverse chain")) - - symbols = engine.search_symbols("Auth", project_root) - assert any(s.name == "AuthManager" for s in symbols) diff --git a/codex-lens/tests/test_graph_expansion.py b/codex-lens/tests/test_graph_expansion.py deleted file mode 100644 index 6588a5e4..00000000 --- a/codex-lens/tests/test_graph_expansion.py +++ /dev/null @@ -1,188 +0,0 @@ -import sqlite3 -import tempfile -from pathlib import Path - -import pytest - -from codexlens.config import Config -from codexlens.entities import CodeRelationship, RelationshipType, SearchResult, Symbol -from codexlens.search.chain_search import ChainSearchEngine, SearchOptions -from codexlens.search.graph_expander import GraphExpander -from codexlens.storage.dir_index import DirIndexStore -from codexlens.storage.index_tree import _compute_graph_neighbors -from codexlens.storage.path_mapper import PathMapper -from codexlens.storage.registry import RegistryStore - - -@pytest.fixture() -def temp_paths() -> Path: - tmpdir = tempfile.TemporaryDirectory(ignore_cleanup_errors=True) - root = Path(tmpdir.name) - yield root - try: - tmpdir.cleanup() - except (PermissionError, OSError): - pass - - -def _create_index_with_neighbors(root: Path) -> tuple[PathMapper, Path, Path]: - project_root = root / "project" - project_root.mkdir(parents=True, exist_ok=True) - - index_root = root / "indexes" - mapper = PathMapper(index_root=index_root) - index_db_path = mapper.source_to_index_db(project_root) - index_db_path.parent.mkdir(parents=True, exist_ok=True) - - content = "\n".join( - [ - "def a():", - " b()", - "", - "def b():", - " c()", - "", - "def c():", - " return 1", - "", - ] - ) - file_path = project_root / "module.py" - file_path.write_text(content, encoding="utf-8") - - symbols = [ - Symbol(name="a", kind="function", range=(1, 2), file=str(file_path)), - Symbol(name="b", 
kind="function", range=(4, 5), file=str(file_path)), - Symbol(name="c", kind="function", range=(7, 8), file=str(file_path)), - ] - relationships = [ - CodeRelationship( - source_symbol="a", - target_symbol="b", - relationship_type=RelationshipType.CALL, - source_file=str(file_path), - target_file=None, - source_line=2, - ), - CodeRelationship( - source_symbol="b", - target_symbol="c", - relationship_type=RelationshipType.CALL, - source_file=str(file_path), - target_file=None, - source_line=5, - ), - ] - - config = Config(data_dir=root / "data") - store = DirIndexStore(index_db_path, config=config) - store.initialize() - store.add_file( - name=file_path.name, - full_path=file_path, - content=content, - language="python", - symbols=symbols, - relationships=relationships, - ) - _compute_graph_neighbors(store) - store.close() - - return mapper, project_root, file_path - - -def test_graph_neighbors_precomputed_two_hop(temp_paths: Path) -> None: - mapper, project_root, file_path = _create_index_with_neighbors(temp_paths) - index_db_path = mapper.source_to_index_db(project_root) - - conn = sqlite3.connect(str(index_db_path)) - conn.row_factory = sqlite3.Row - try: - rows = conn.execute( - """ - SELECT s1.name AS source_name, s2.name AS neighbor_name, gn.relationship_depth - FROM graph_neighbors gn - JOIN symbols s1 ON s1.id = gn.source_symbol_id - JOIN symbols s2 ON s2.id = gn.neighbor_symbol_id - ORDER BY source_name, neighbor_name, relationship_depth - """ - ).fetchall() - finally: - conn.close() - - triples = {(r["source_name"], r["neighbor_name"], int(r["relationship_depth"])) for r in rows} - assert ("a", "b", 1) in triples - assert ("a", "c", 2) in triples - assert ("b", "c", 1) in triples - assert ("c", "b", 1) in triples - assert file_path.exists() - - -def test_graph_expander_returns_related_results_with_depth_metadata(temp_paths: Path) -> None: - mapper, project_root, file_path = _create_index_with_neighbors(temp_paths) - _ = project_root - - expander = 
GraphExpander(mapper, config=Config(data_dir=temp_paths / "data", graph_expansion_depth=2)) - base = SearchResult( - path=str(file_path.resolve()), - score=1.0, - excerpt="", - content=None, - start_line=1, - end_line=2, - symbol_name="a", - symbol_kind="function", - ) - related = expander.expand([base], depth=2, max_expand=1, max_related=10) - - depth_by_symbol = {r.symbol_name: r.metadata.get("relationship_depth") for r in related} - assert depth_by_symbol.get("b") == 1 - assert depth_by_symbol.get("c") == 2 - - -def test_chain_search_populates_related_results_when_enabled(temp_paths: Path) -> None: - mapper, project_root, file_path = _create_index_with_neighbors(temp_paths) - _ = file_path - - registry = RegistryStore(db_path=temp_paths / "registry.db") - registry.initialize() - - config = Config( - data_dir=temp_paths / "data", - enable_graph_expansion=True, - graph_expansion_depth=2, - ) - engine = ChainSearchEngine(registry, mapper, config=config) - try: - options = SearchOptions(depth=0, total_limit=10, enable_fuzzy=False) - result = engine.search("b", project_root, options) - - assert result.results - assert result.results[0].symbol_name == "a" - - depth_by_symbol = {r.symbol_name: r.metadata.get("relationship_depth") for r in result.related_results} - assert depth_by_symbol.get("b") == 1 - assert depth_by_symbol.get("c") == 2 - finally: - engine.close() - - -def test_chain_search_related_results_empty_when_disabled(temp_paths: Path) -> None: - mapper, project_root, file_path = _create_index_with_neighbors(temp_paths) - _ = file_path - - registry = RegistryStore(db_path=temp_paths / "registry.db") - registry.initialize() - - config = Config( - data_dir=temp_paths / "data", - enable_graph_expansion=False, - ) - engine = ChainSearchEngine(registry, mapper, config=config) - try: - options = SearchOptions(depth=0, total_limit=10, enable_fuzzy=False) - result = engine.search("b", project_root, options) - assert result.related_results == [] - finally: - 
engine.close() - diff --git a/codex-lens/tests/test_hybrid_chunker.py b/codex-lens/tests/test_hybrid_chunker.py deleted file mode 100644 index e9085eff..00000000 --- a/codex-lens/tests/test_hybrid_chunker.py +++ /dev/null @@ -1,622 +0,0 @@ -"""Tests for Hybrid Docstring Chunker.""" - -import pytest - -from codexlens.entities import SemanticChunk, Symbol -from codexlens.semantic.chunker import ( - ChunkConfig, - Chunker, - DocstringExtractor, - HybridChunker, -) - - -class TestDocstringExtractor: - """Tests for DocstringExtractor class.""" - - def test_extract_single_line_python_docstring(self): - """Test extraction of single-line Python docstring.""" - content = '''def hello(): - """This is a docstring.""" - return True -''' - docstrings = DocstringExtractor.extract_python_docstrings(content) - assert len(docstrings) == 1 - assert docstrings[0][1] == 2 # start_line - assert docstrings[0][2] == 2 # end_line - assert '"""This is a docstring."""' in docstrings[0][0] - - def test_extract_multi_line_python_docstring(self): - """Test extraction of multi-line Python docstring.""" - content = '''def process(): - """ - This is a multi-line - docstring with details. 
- """ - return 42 -''' - docstrings = DocstringExtractor.extract_python_docstrings(content) - assert len(docstrings) == 1 - assert docstrings[0][1] == 2 # start_line - assert docstrings[0][2] == 5 # end_line - assert "multi-line" in docstrings[0][0] - - def test_extract_multiple_python_docstrings(self): - """Test extraction of multiple docstrings from same file.""" - content = '''"""Module docstring.""" - -def func1(): - """Function 1 docstring.""" - pass - -class MyClass: - """Class docstring.""" - - def method(self): - """Method docstring.""" - pass -''' - docstrings = DocstringExtractor.extract_python_docstrings(content) - assert len(docstrings) == 4 - lines = [d[1] for d in docstrings] - assert 1 in lines # Module docstring - assert 4 in lines # func1 docstring - assert 8 in lines # Class docstring - assert 11 in lines # method docstring - - def test_extract_python_docstring_single_quotes(self): - """Test extraction with single quote docstrings.""" - content = """def test(): - '''Single quote docstring.''' - return None -""" - docstrings = DocstringExtractor.extract_python_docstrings(content) - assert len(docstrings) == 1 - assert "Single quote docstring" in docstrings[0][0] - - def test_extract_jsdoc_single_comment(self): - """Test extraction of single JSDoc comment.""" - content = '''/** - * This is a JSDoc comment - * @param {string} name - */ -function hello(name) { - return name; -} -''' - comments = DocstringExtractor.extract_jsdoc_comments(content) - assert len(comments) == 1 - assert comments[0][1] == 1 # start_line - assert comments[0][2] == 4 # end_line - assert "JSDoc comment" in comments[0][0] - - def test_extract_multiple_jsdoc_comments(self): - """Test extraction of multiple JSDoc comments.""" - content = '''/** - * Function 1 - */ -function func1() {} - -/** - * Class description - */ -class MyClass { - /** - * Method description - */ - method() {} -} -''' - comments = DocstringExtractor.extract_jsdoc_comments(content) - assert len(comments) == 3 
- - def test_extract_docstrings_unsupported_language(self): - """Test that unsupported languages return empty list.""" - content = "// Some code" - docstrings = DocstringExtractor.extract_docstrings(content, "ruby") - assert len(docstrings) == 0 - - def test_extract_docstrings_empty_content(self): - """Test extraction from empty content.""" - docstrings = DocstringExtractor.extract_python_docstrings("") - assert len(docstrings) == 0 - - -class TestHybridChunker: - """Tests for HybridChunker class.""" - - def test_hybrid_chunker_initialization(self): - """Test HybridChunker initialization with defaults.""" - chunker = HybridChunker() - assert chunker.config is not None - assert chunker.base_chunker is not None - assert chunker.docstring_extractor is not None - - def test_hybrid_chunker_custom_config(self): - """Test HybridChunker with custom config.""" - config = ChunkConfig(max_chunk_size=500, min_chunk_size=20) - chunker = HybridChunker(config=config) - assert chunker.config.max_chunk_size == 500 - assert chunker.config.min_chunk_size == 20 - - def test_hybrid_chunker_isolates_docstrings(self): - """Test that hybrid chunker isolates docstrings into separate chunks.""" - config = ChunkConfig(min_chunk_size=10) - chunker = HybridChunker(config=config) - - content = '''"""Module-level docstring.""" - -def hello(): - """Function docstring.""" - return "world" - -def goodbye(): - """Another docstring.""" - return "farewell" -''' - symbols = [ - Symbol(name="hello", kind="function", range=(3, 5)), - Symbol(name="goodbye", kind="function", range=(7, 9)), - ] - - chunks = chunker.chunk_file(content, symbols, "test.py", "python") - - # Should have 3 docstring chunks + 2 code chunks = 5 total - docstring_chunks = [c for c in chunks if c.metadata.get("chunk_type") == "docstring"] - code_chunks = [c for c in chunks if c.metadata.get("chunk_type") == "code"] - - assert len(docstring_chunks) == 3 - assert len(code_chunks) == 2 - assert all(c.metadata["strategy"] == "hybrid" for 
c in chunks) - - def test_hybrid_chunker_docstring_isolation_percentage(self): - """Test that >98% of docstrings are isolated correctly.""" - config = ChunkConfig(min_chunk_size=5) - chunker = HybridChunker(config=config) - - # Create content with 10 docstrings - lines = [] - lines.append('"""Module docstring."""\n') - lines.append('\n') - - for i in range(10): - lines.append(f'def func{i}():\n') - lines.append(f' """Docstring for func{i}."""\n') - lines.append(f' return {i}\n') - lines.append('\n') - - content = "".join(lines) - symbols = [ - Symbol(name=f"func{i}", kind="function", range=(3 + i*4, 5 + i*4)) - for i in range(10) - ] - - chunks = chunker.chunk_file(content, symbols, "test.py", "python") - - docstring_chunks = [c for c in chunks if c.metadata.get("chunk_type") == "docstring"] - - # We have 11 docstrings total (1 module + 10 functions) - # Verify >98% isolation (at least 10.78 out of 11) - isolation_rate = len(docstring_chunks) / 11 - assert isolation_rate >= 0.98, f"Docstring isolation rate {isolation_rate:.2%} < 98%" - - def test_hybrid_chunker_javascript_jsdoc(self): - """Test hybrid chunker with JavaScript JSDoc comments.""" - config = ChunkConfig(min_chunk_size=10) - chunker = HybridChunker(config=config) - - content = '''/** - * Main function description - */ -function main() { - return 42; -} - -/** - * Helper function - */ -function helper() { - return 0; -} -''' - symbols = [ - Symbol(name="main", kind="function", range=(4, 6)), - Symbol(name="helper", kind="function", range=(11, 13)), - ] - - chunks = chunker.chunk_file(content, symbols, "test.js", "javascript") - - docstring_chunks = [c for c in chunks if c.metadata.get("chunk_type") == "docstring"] - code_chunks = [c for c in chunks if c.metadata.get("chunk_type") == "code"] - - assert len(docstring_chunks) == 2 - assert len(code_chunks) == 2 - - def test_hybrid_chunker_no_docstrings(self): - """Test hybrid chunker with code containing no docstrings.""" - config = 
ChunkConfig(min_chunk_size=10) - chunker = HybridChunker(config=config) - - content = '''def hello(): - return "world" - -def goodbye(): - return "farewell" -''' - symbols = [ - Symbol(name="hello", kind="function", range=(1, 2)), - Symbol(name="goodbye", kind="function", range=(4, 5)), - ] - - chunks = chunker.chunk_file(content, symbols, "test.py", "python") - - # All chunks should be code chunks - assert all(c.metadata.get("chunk_type") == "code" for c in chunks) - assert len(chunks) == 2 - - def test_hybrid_chunker_preserves_metadata(self): - """Test that hybrid chunker preserves all required metadata.""" - config = ChunkConfig(min_chunk_size=5) - chunker = HybridChunker(config=config) - - content = '''"""Module doc.""" - -def test(): - """Test doc.""" - pass -''' - symbols = [Symbol(name="test", kind="function", range=(3, 5))] - - chunks = chunker.chunk_file(content, symbols, "/path/to/file.py", "python") - - for chunk in chunks: - assert "file" in chunk.metadata - assert "language" in chunk.metadata - assert "chunk_type" in chunk.metadata - assert "start_line" in chunk.metadata - assert "end_line" in chunk.metadata - assert "strategy" in chunk.metadata - assert chunk.metadata["strategy"] == "hybrid" - - def test_hybrid_chunker_no_symbols_fallback(self): - """Test hybrid chunker falls back to sliding window when no symbols.""" - config = ChunkConfig(min_chunk_size=5, max_chunk_size=100) - chunker = HybridChunker(config=config) - - content = '''"""Module docstring.""" - -# Just some comments -x = 42 -y = 100 -''' - chunks = chunker.chunk_file(content, [], "test.py", "python") - - # Should have 1 docstring chunk + sliding window chunks for remaining code - docstring_chunks = [c for c in chunks if c.metadata.get("chunk_type") == "docstring"] - code_chunks = [c for c in chunks if c.metadata.get("chunk_type") == "code"] - - assert len(docstring_chunks) == 1 - assert len(code_chunks) >= 0 # May or may not have code chunks depending on size - - def 
test_get_excluded_line_ranges(self): - """Test _get_excluded_line_ranges helper method.""" - chunker = HybridChunker() - - docstrings = [ - ("doc1", 1, 3), - ("doc2", 5, 7), - ("doc3", 10, 10), - ] - - excluded = chunker._get_excluded_line_ranges(docstrings) - - assert 1 in excluded - assert 2 in excluded - assert 3 in excluded - assert 4 not in excluded - assert 5 in excluded - assert 6 in excluded - assert 7 in excluded - assert 8 not in excluded - assert 9 not in excluded - assert 10 in excluded - - def test_filter_symbols_outside_docstrings(self): - """Test _filter_symbols_outside_docstrings helper method.""" - chunker = HybridChunker() - - symbols = [ - Symbol(name="func1", kind="function", range=(1, 5)), - Symbol(name="func2", kind="function", range=(10, 15)), - Symbol(name="func3", kind="function", range=(20, 25)), - ] - - # Exclude lines 1-5 (func1) and 10-12 (partial overlap with func2) - excluded_lines = set(range(1, 6)) | set(range(10, 13)) - - filtered = chunker._filter_symbols_outside_docstrings(symbols, excluded_lines) - - # func1 should be filtered out (completely within excluded) - # func2 should remain (partial overlap) - # func3 should remain (no overlap) - assert len(filtered) == 2 - names = [s.name for s in filtered] - assert "func1" not in names - assert "func2" in names - assert "func3" in names - excluded = chunker._get_excluded_line_ranges(docstrings) - - assert 1 in excluded - assert 2 in excluded - assert 3 in excluded - assert 4 not in excluded - assert 5 in excluded - assert 6 in excluded - assert 7 in excluded - assert 8 not in excluded - assert 9 not in excluded - assert 10 in excluded - - def test_filter_symbols_outside_docstrings(self): - """Test _filter_symbols_outside_docstrings helper method.""" - chunker = HybridChunker() - - symbols = [ - Symbol(name="func1", kind="function", range=(1, 5)), - Symbol(name="func2", kind="function", range=(10, 15)), - Symbol(name="func3", kind="function", range=(20, 25)), - ] - - # Exclude lines 
1-5 (func1) and 10-12 (partial overlap with func2) - excluded_lines = set(range(1, 6)) | set(range(10, 13)) - - filtered = chunker._filter_symbols_outside_docstrings(symbols, excluded_lines) - - # func1 should be filtered out (completely within excluded) - # func2 should remain (partial overlap) - # func3 should remain (no overlap) - assert len(filtered) == 2 - names = [s.name for s in filtered] - assert "func1" not in names - assert "func2" in names - assert "func3" in names - - def test_hybrid_chunker_docstring_only_file(self): - """Test that hybrid chunker correctly handles file with only docstrings.""" - config = ChunkConfig(min_chunk_size=5) - chunker = HybridChunker(config=config) - - content = '''"""First docstring.""" - -"""Second docstring.""" - -"""Third docstring.""" -''' - chunks = chunker.chunk_file(content, [], "test.py", "python") - - # Should only have docstring chunks - assert all(c.metadata.get("chunk_type") == "docstring" for c in chunks) - assert len(chunks) == 3 - - -class TestChunkConfigStrategy: - """Tests for strategy field in ChunkConfig.""" - - def test_chunk_config_default_strategy(self): - """Test that default strategy is 'auto'.""" - config = ChunkConfig() - assert config.strategy == "auto" - - def test_chunk_config_custom_strategy(self): - """Test setting custom strategy.""" - config = ChunkConfig(strategy="hybrid") - assert config.strategy == "hybrid" - - config = ChunkConfig(strategy="symbol") - assert config.strategy == "symbol" - - config = ChunkConfig(strategy="sliding_window") - assert config.strategy == "sliding_window" - - -class TestHybridChunkerIntegration: - """Integration tests for hybrid chunker with realistic code.""" - - def test_realistic_python_module(self): - """Test hybrid chunker with realistic Python module.""" - config = ChunkConfig(min_chunk_size=10) - chunker = HybridChunker(config=config) - - content = '''""" -Data processing module for handling user data. 
- -This module provides functions for cleaning and validating user input. -""" - -from typing import Dict, Any - - -def validate_email(email: str) -> bool: - """ - Validate an email address format. - - Args: - email: The email address to validate - - Returns: - True if valid, False otherwise - """ - import re - pattern = r'^[\\w\\.-]+@[\\w\\.-]+\\.\\w+$' - return bool(re.match(pattern, email)) - - -class UserProfile: - """ - User profile management class. - - Handles user data storage and retrieval. - """ - - def __init__(self, user_id: int): - """Initialize user profile with ID.""" - self.user_id = user_id - self.data = {} - - def update_data(self, data: Dict[str, Any]) -> None: - """ - Update user profile data. - - Args: - data: Dictionary of user data to update - """ - self.data.update(data) -''' - - symbols = [ - Symbol(name="validate_email", kind="function", range=(11, 23)), - Symbol(name="UserProfile", kind="class", range=(26, 44)), - ] - - chunks = chunker.chunk_file(content, symbols, "users.py", "python") - - docstring_chunks = [c for c in chunks if c.metadata.get("chunk_type") == "docstring"] - code_chunks = [c for c in chunks if c.metadata.get("chunk_type") == "code"] - - # Verify docstrings are isolated - assert len(docstring_chunks) >= 4 # Module, function, class, methods - assert len(code_chunks) >= 1 # At least one code chunk - - # Verify >98% docstring isolation - # Count total docstring lines in original - total_docstring_lines = sum( - d[2] - d[1] + 1 - for d in DocstringExtractor.extract_python_docstrings(content) - ) - isolated_docstring_lines = sum( - c.metadata["end_line"] - c.metadata["start_line"] + 1 - for c in docstring_chunks - ) - - isolation_rate = isolated_docstring_lines / total_docstring_lines if total_docstring_lines > 0 else 1 - assert isolation_rate >= 0.98 - - def test_hybrid_chunker_performance_overhead(self): - """Test that hybrid chunker has <5% overhead vs base chunker on files without docstrings.""" - import time - - config = 
ChunkConfig(min_chunk_size=5) - - # Create larger content with NO docstrings (worst case for hybrid chunker) - lines = [] - for i in range(1000): - lines.append(f'def func{i}():\n') - lines.append(f' x = {i}\n') - lines.append(f' y = {i * 2}\n') - lines.append(f' return x + y\n') - lines.append('\n') - content = "".join(lines) - - symbols = [ - Symbol(name=f"func{i}", kind="function", range=(1 + i*5, 4 + i*5)) - for i in range(1000) - ] - - # Warm up - base_chunker = Chunker(config=config) - base_chunker.chunk_file(content[:100], symbols[:10], "test.py", "python") - - hybrid_chunker = HybridChunker(config=config) - hybrid_chunker.chunk_file(content[:100], symbols[:10], "test.py", "python") - - # Measure base chunker (3 runs) - base_times = [] - for _ in range(3): - start = time.perf_counter() - base_chunker.chunk_file(content, symbols, "test.py", "python") - base_times.append(time.perf_counter() - start) - base_time = sum(base_times) / len(base_times) - - # Measure hybrid chunker (3 runs) - hybrid_times = [] - for _ in range(3): - start = time.perf_counter() - hybrid_chunker.chunk_file(content, symbols, "test.py", "python") - hybrid_times.append(time.perf_counter() - start) - hybrid_time = sum(hybrid_times) / len(hybrid_times) - - # Calculate overhead - overhead = ((hybrid_time - base_time) / base_time) * 100 if base_time > 0 else 0 - - # Verify <15% overhead (reasonable threshold for performance tests with system variance) - assert overhead < 15.0, f"Overhead {overhead:.2f}% exceeds 15% threshold (base={base_time:.4f}s, hybrid={hybrid_time:.4f}s)" - - -class TestHybridChunkerV1Optimizations: - """Tests for v1.0 optimization behaviors (parent metadata + determinism).""" - - def test_merged_docstring_metadata(self): - """Docstring chunks include parent_symbol metadata when applicable.""" - config = ChunkConfig(min_chunk_size=1) - chunker = HybridChunker(config=config) - - content = '''"""Module docstring.""" - -def hello(): - """Function docstring.""" - return 1 
-''' - symbols = [Symbol(name="hello", kind="function", range=(3, 5))] - - chunks = chunker.chunk_file(content, symbols, "m.py", "python") - func_doc_chunks = [ - c for c in chunks - if c.metadata.get("chunk_type") == "docstring" and c.metadata.get("start_line") == 4 - ] - assert len(func_doc_chunks) == 1 - assert func_doc_chunks[0].metadata.get("parent_symbol") == "hello" - assert func_doc_chunks[0].metadata.get("parent_symbol_kind") == "function" - - def test_deterministic_chunk_boundaries(self): - """Chunk boundaries are stable across repeated runs on identical input.""" - config = ChunkConfig(max_chunk_size=80, overlap=10, min_chunk_size=1) - chunker = HybridChunker(config=config) - - # No docstrings, no symbols -> sliding window path. - content = "\n".join([f"line {i}: x = {i}" for i in range(1, 200)]) + "\n" - - boundaries = [] - for _ in range(3): - chunks = chunker.chunk_file(content, [], "deterministic.py", "python") - boundaries.append([ - (c.metadata.get("start_line"), c.metadata.get("end_line")) - for c in chunks - if c.metadata.get("chunk_type") == "code" - ]) - - assert boundaries[0] == boundaries[1] == boundaries[2] - - def test_orphan_docstrings(self): - """Module-level docstrings remain standalone (no parent_symbol assigned).""" - config = ChunkConfig(min_chunk_size=1) - chunker = HybridChunker(config=config) - - content = '''"""Module-level docstring.""" - -def hello(): - """Function docstring.""" - return 1 -''' - symbols = [Symbol(name="hello", kind="function", range=(3, 5))] - chunks = chunker.chunk_file(content, symbols, "orphan.py", "python") - - module_doc = [ - c for c in chunks - if c.metadata.get("chunk_type") == "docstring" and c.metadata.get("start_line") == 1 - ] - assert len(module_doc) == 1 - assert module_doc[0].metadata.get("parent_symbol") is None - - code_chunks = [c for c in chunks if c.metadata.get("chunk_type") == "code"] - assert code_chunks, "Expected at least one code chunk" - assert all("Module-level docstring" not in 
c.content for c in code_chunks) diff --git a/codex-lens/tests/test_hybrid_search_e2e.py b/codex-lens/tests/test_hybrid_search_e2e.py deleted file mode 100644 index 131aad14..00000000 --- a/codex-lens/tests/test_hybrid_search_e2e.py +++ /dev/null @@ -1,945 +0,0 @@ -"""End-to-end tests for hybrid search workflows (P2). - -Tests complete hybrid search pipeline including indexing, exact/fuzzy/hybrid modes, -and result relevance with real project structure. -""" - -import sqlite3 -import tempfile -from pathlib import Path - -import pytest - -from codexlens.config import Config -from codexlens.entities import SearchResult -from codexlens.search.hybrid_search import HybridSearchEngine -from codexlens.storage.dir_index import DirIndexStore - -# Check if pytest-benchmark is available -try: - import pytest_benchmark - BENCHMARK_AVAILABLE = True -except ImportError: - BENCHMARK_AVAILABLE = False - - -class TestHybridSearchBasics: - """Basic tests for HybridSearchEngine.""" - - @pytest.fixture - def temp_db(self): - """Create temporary database.""" - with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: - db_path = Path(f.name) - yield db_path - if db_path.exists(): - db_path.unlink() - - @pytest.fixture - def index_store(self, temp_db): - """Create DirIndexStore instance.""" - store = DirIndexStore(temp_db) - yield store - store.close() - - def test_engine_initialization(self): - """Test HybridSearchEngine initializes with default weights.""" - engine = HybridSearchEngine() - assert engine.weights == HybridSearchEngine.DEFAULT_WEIGHTS - assert engine.weights["exact"] == 0.3 - assert engine.weights["fuzzy"] == 0.1 - assert engine.weights["vector"] == 0.6 - - def test_engine_custom_weights(self): - """Test HybridSearchEngine accepts custom weights.""" - custom_weights = {"exact": 0.5, "fuzzy": 0.5, "vector": 0.0} - engine = HybridSearchEngine(weights=custom_weights) - assert engine.weights == custom_weights - - def test_search_requires_index(self, temp_db): - 
"""Test search requires initialized index.""" - engine = HybridSearchEngine() - # Empty database - should handle gracefully - results = engine.search(temp_db, "test", limit=10) - # May return empty or raise error - either is acceptable - assert isinstance(results, list) - - -class TestHybridSearchWithSampleProject: - """Tests with sample project structure.""" - - @pytest.fixture - def sample_project_db(self): - """Create database with sample Python + TypeScript project.""" - with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: - db_path = Path(f.name) - - store = DirIndexStore(db_path) - store.initialize() - - # Sample Python files - python_files = { - "src/auth/authentication.py": """ -def authenticate_user(username, password): - '''Authenticate user with credentials''' - return check_credentials(username, password) - -def check_credentials(user, pwd): - return True -""", - "src/auth/authorization.py": """ -def authorize_user(user_id, resource): - '''Authorize user access to resource''' - return check_permissions(user_id, resource) - -def check_permissions(uid, res): - return True -""", - "src/models/user.py": """ -class User: - def __init__(self, username, email): - self.username = username - self.email = email - - def authenticate(self, password): - return authenticate_user(self.username, password) -""", - "src/api/user_api.py": """ -from flask import Flask, request - -def get_user_by_id(user_id): - '''Get user by ID''' - return User.query.get(user_id) - -def create_user(username, email): - '''Create new user''' - return User(username, email) -""", - } - - # Sample TypeScript files - typescript_files = { - "frontend/auth/AuthService.ts": """ -export class AuthService { - authenticateUser(username: string, password: string): boolean { - return this.checkCredentials(username, password); - } - - private checkCredentials(user: string, pwd: string): boolean { - return true; - } -} -""", - "frontend/models/User.ts": """ -export interface User { - id: 
number; - username: string; - email: string; -} - -export class UserModel { - constructor(private user: User) {} - - authenticate(password: string): boolean { - return new AuthService().authenticateUser(this.user.username, password); - } -} -""", - } - - # Index all files - with store._get_connection() as conn: - for path, content in {**python_files, **typescript_files}.items(): - lang = "python" if path.endswith(".py") else "typescript" - name = path.split('/')[-1] - conn.execute( - """INSERT INTO files (name, full_path, content, language, mtime) - VALUES (?, ?, ?, ?, ?)""", - (name, path, content, lang, 0.0) - ) - conn.commit() - - yield db_path - store.close() - - if db_path.exists(): - db_path.unlink() - - def test_exact_search_mode(self, sample_project_db): - """Test exact FTS search mode.""" - engine = HybridSearchEngine() - - # Search for "authenticate" - results = engine.search( - sample_project_db, - "authenticate", - limit=10, - enable_fuzzy=False, - enable_vector=False - ) - - assert len(results) > 0, "Should find matches for 'authenticate'" - # Check results contain expected files - paths = [r.path for r in results] - assert any("authentication.py" in p for p in paths) - - def test_fuzzy_search_mode(self, sample_project_db): - """Test fuzzy FTS search mode.""" - engine = HybridSearchEngine() - - # Search with typo: "authentcate" (missing 'i') - results = engine.search( - sample_project_db, - "authentcate", - limit=10, - enable_fuzzy=True, - enable_vector=False - ) - - # Fuzzy search should still find matches - assert isinstance(results, list) - # May or may not find matches depending on trigram support - - def test_hybrid_search_mode(self, sample_project_db): - """Test hybrid search combines exact and fuzzy.""" - engine = HybridSearchEngine() - - # Hybrid search - results = engine.search( - sample_project_db, - "authenticate", - limit=10, - enable_fuzzy=True, - enable_vector=False - ) - - assert len(results) > 0, "Hybrid search should find matches" - # 
Results should have fusion scores - for result in results: - assert result.score > 0, "Results should have fusion scores" - - def test_camelcase_query_expansion(self, sample_project_db): - """Test CamelCase query expansion improves recall.""" - engine = HybridSearchEngine() - - # Search for "AuthService" (CamelCase) - results = engine.search( - sample_project_db, - "AuthService", - limit=10, - enable_fuzzy=False - ) - - # Should find TypeScript AuthService class - paths = [r.path for r in results] - assert any("AuthService.ts" in p for p in paths), \ - "Should find AuthService with CamelCase query" - - def test_snake_case_query_expansion(self, sample_project_db): - """Test snake_case query expansion improves recall.""" - engine = HybridSearchEngine() - - # Search for "get_user_by_id" (snake_case) - results = engine.search( - sample_project_db, - "get_user_by_id", - limit=10, - enable_fuzzy=False - ) - - # Should find Python function - paths = [r.path for r in results] - assert any("user_api.py" in p for p in paths), \ - "Should find get_user_by_id with snake_case query" - - def test_partial_identifier_match(self, sample_project_db): - """Test partial identifier matching with query expansion.""" - engine = HybridSearchEngine() - - # Search for just "User" (part of UserModel, User class, etc.) 
- results = engine.search( - sample_project_db, - "User", - limit=10, - enable_fuzzy=False - ) - - assert len(results) > 0, "Should find matches for 'User'" - # Should find multiple files with User in name - paths = [r.path for r in results] - assert len([p for p in paths if "user" in p.lower()]) > 0 - - -class TestHybridSearchRelevance: - """Tests for result relevance and ranking.""" - - @pytest.fixture - def relevance_db(self): - """Create database for testing relevance ranking.""" - with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: - db_path = Path(f.name) - - store = DirIndexStore(db_path) - store.initialize() - - # Files with varying relevance to "authentication" - files = { - "auth/authentication.py": """ -# Primary authentication module -def authenticate_user(username, password): - '''Main authentication function''' - pass - -def validate_authentication(token): - pass -""", - "auth/auth_helpers.py": """ -# Helper functions for authentication -def hash_password(password): - pass - -def verify_authentication_token(token): - pass -""", - "models/user.py": """ -# User model (mentions authentication once) -class User: - def check_authentication(self): - pass -""", - "utils/logging.py": """ -# Logging utility (no authentication mention) -def log_message(msg): - pass -""", - } - - with store._get_connection() as conn: - for path, content in files.items(): - name = path.split('/')[-1] - conn.execute( - """INSERT INTO files (name, full_path, content, language, mtime) - VALUES (?, ?, ?, ?, ?)""", - (name, path, content, "python", 0.0) - ) - conn.commit() - - yield db_path - store.close() - - if db_path.exists(): - db_path.unlink() - - def test_exact_match_ranks_higher(self, relevance_db): - """Test files with exact term matches rank higher.""" - engine = HybridSearchEngine() - - results = engine.search( - relevance_db, - "authentication", - limit=10, - enable_fuzzy=False - ) - - # First result should be authentication.py (most mentions) - assert 
len(results) > 0 - assert "authentication.py" in results[0].path, \ - "File with most mentions should rank first" - - def test_hybrid_fusion_improves_ranking(self, relevance_db): - """Test hybrid RRF fusion improves ranking over single source.""" - engine = HybridSearchEngine() - - # Exact only - exact_results = engine.search( - relevance_db, - "authentication", - limit=5, - enable_fuzzy=False - ) - - # Hybrid - hybrid_results = engine.search( - relevance_db, - "authentication", - limit=5, - enable_fuzzy=True - ) - - # Both should find matches - assert len(exact_results) > 0 - assert len(hybrid_results) > 0 - - # Hybrid may rerank results - assert isinstance(hybrid_results[0], SearchResult) - - -class TestHybridSearchPerformance: - """Performance tests for hybrid search.""" - - @pytest.fixture - def large_project_db(self): - """Create database with many files.""" - with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: - db_path = Path(f.name) - - store = DirIndexStore(db_path) - store.initialize() - - # Create 100 test files - with store._get_connection() as conn: - for i in range(100): - content = f""" -def function_{i}(param): - '''Test function {i}''' - return authenticate_user(param) - -class Class{i}: - def method_{i}(self): - pass -""" - path = f"src/module_{i}.py" - name = f"module_{i}.py" - conn.execute( - """INSERT INTO files (name, full_path, content, language, mtime) - VALUES (?, ?, ?, ?, ?)""", - (name, path, content, "python", 0.0) - ) - conn.commit() - - yield db_path - store.close() - - if db_path.exists(): - db_path.unlink() - - @pytest.mark.skipif(not BENCHMARK_AVAILABLE, reason="pytest-benchmark not installed") - def test_search_latency(self, large_project_db, benchmark): - """Benchmark search latency.""" - engine = HybridSearchEngine() - - def search_query(): - return engine.search( - large_project_db, - "authenticate", - limit=20, - enable_fuzzy=True - ) - - # Should complete in reasonable time - results = benchmark(search_query) - 
assert isinstance(results, list) - - def test_hybrid_overhead(self, large_project_db): - """Test hybrid search overhead vs exact search.""" - engine = HybridSearchEngine() - - import time - - # Measure exact search time - start = time.time() - exact_results = engine.search( - large_project_db, - "authenticate", - limit=20, - enable_fuzzy=False - ) - exact_time = time.time() - start - - # Measure hybrid search time - start = time.time() - hybrid_results = engine.search( - large_project_db, - "authenticate", - limit=20, - enable_fuzzy=True - ) - hybrid_time = time.time() - start - - # Hybrid should be <10x slower than exact (relaxed for CI stability and ANN initialization overhead) - if exact_time > 0: - overhead = hybrid_time / exact_time - assert overhead < 10.0, f"Hybrid overhead {overhead:.1f}x should be <10x" - - -class TestHybridSearchEdgeCases: - """Edge case tests for hybrid search.""" - - @pytest.fixture - def temp_db(self): - """Create temporary database.""" - with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: - db_path = Path(f.name) - - # Initialize with schema - DirIndexStore(db_path) - - yield db_path - # Ignore file deletion errors on Windows (SQLite file lock) - try: - if db_path.exists(): - db_path.unlink() - except PermissionError: - pass - - def test_empty_index_search(self, temp_db): - """Test search on empty index returns empty results.""" - engine = HybridSearchEngine() - - results = engine.search(temp_db, "test", limit=10) - assert results == [] or isinstance(results, list) - - def test_no_matches_query(self, temp_db): - """Test query with no matches returns empty results.""" - store = DirIndexStore(temp_db) - store.initialize() - - try: - # Index one file - with store._get_connection() as conn: - conn.execute( - """INSERT INTO files (name, full_path, content, language, mtime) - VALUES (?, ?, ?, ?, ?)""", - ("test.py", "test.py", "def hello(): pass", "python", 0.0) - ) - conn.commit() - - engine = HybridSearchEngine() - results 
= engine.search(temp_db, "nonexistent", limit=10) - - assert results == [] or len(results) == 0 - finally: - store.close() - - def test_special_characters_in_query(self, temp_db): - """Test queries with special characters are handled.""" - store = DirIndexStore(temp_db) - store.initialize() - - try: - # Index file - with store._get_connection() as conn: - conn.execute( - """INSERT INTO files (name, full_path, content, language, mtime) - VALUES (?, ?, ?, ?, ?)""", - ("test.py", "test.py", "def test(): pass", "python", 0.0) - ) - conn.commit() - - engine = HybridSearchEngine() - - # Query with special chars should not crash - queries = ["test*", "test?", "test&", "test|"] - for query in queries: - try: - results = engine.search(temp_db, query, limit=10) - assert isinstance(results, list) - except Exception: - # Some queries may be invalid FTS5 syntax - that's OK - pass - finally: - store.close() - - def test_very_long_query(self, temp_db): - """Test very long queries are handled.""" - store = DirIndexStore(temp_db) - store.initialize() - - try: - # Index file - with store._get_connection() as conn: - conn.execute( - """INSERT INTO files (name, full_path, content, language, mtime) - VALUES (?, ?, ?, ?, ?)""", - ("test.py", "test.py", "def test(): pass", "python", 0.0) - ) - conn.commit() - - engine = HybridSearchEngine() - - # Very long query - long_query = "test " * 100 - results = engine.search(temp_db, long_query, limit=10) - assert isinstance(results, list) - finally: - store.close() - - def test_unicode_query(self, temp_db): - """Test Unicode queries are handled.""" - store = DirIndexStore(temp_db) - store.initialize() - - try: - # Index file with Unicode content - with store._get_connection() as conn: - conn.execute( - """INSERT INTO files (name, full_path, content, language, mtime) - VALUES (?, ?, ?, ?, ?)""", - ("test.py", "test.py", "def 测试函数(): pass", "python", 0.0) - ) - conn.commit() - - engine = HybridSearchEngine() - - # Unicode query - results = 
engine.search(temp_db, "测试", limit=10) - assert isinstance(results, list) - finally: - store.close() - - -class TestHybridSearchIntegration: - """Integration tests for complete workflow.""" - - @pytest.fixture - def project_db(self): - """Create realistic project database.""" - with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: - db_path = Path(f.name) - - store = DirIndexStore(db_path) - store.initialize() - - # Realistic project structure - files = { - "src/authentication/login.py": "def login_user(username, password): pass", - "src/authentication/logout.py": "def logout_user(session_id): pass", - "src/authorization/permissions.py": "def check_permission(user, resource): pass", - "src/models/user_model.py": "class UserModel: pass", - "src/api/auth_api.py": "def authenticate_api(token): pass", - "tests/test_auth.py": "def test_authentication(): pass", - } - - with store._get_connection() as conn: - for path, content in files.items(): - name = path.split('/')[-1] - conn.execute( - """INSERT INTO files (name, full_path, content, language, mtime) - VALUES (?, ?, ?, ?, ?)""", - (name, path, content, "python", 0.0) - ) - conn.commit() - - yield db_path - store.close() - - if db_path.exists(): - db_path.unlink() - - def test_workflow_index_search_refine(self, project_db): - """Test complete workflow: index → search → refine.""" - engine = HybridSearchEngine() - - # Initial broad search - results = engine.search(project_db, "auth", limit=20) - assert len(results) > 0 - - # Refined search - refined = engine.search(project_db, "authentication", limit=10) - assert len(refined) > 0 - - # Most refined search - specific = engine.search(project_db, "login_user", limit=5) - # May or may not find exact match depending on query expansion - - def test_consistency_across_searches(self, project_db): - """Test search results are consistent across multiple calls.""" - engine = HybridSearchEngine() - - # Same query multiple times - results1 = engine.search(project_db, 
"authenticate", limit=10) - results2 = engine.search(project_db, "authenticate", limit=10) - - # Should return same results (same order) - assert len(results1) == len(results2) - if len(results1) > 0: - assert results1[0].path == results2[0].path - - -@pytest.mark.integration -class TestHybridSearchFullCoverage: - """Full coverage integration tests.""" - - def test_all_modes_with_real_project(self): - """Test all search modes (exact, fuzzy, hybrid) with realistic project.""" - with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: - db_path = Path(f.name) - - store = None - try: - store = DirIndexStore(db_path) - store.initialize() - - # Create comprehensive test project - files = { - "auth.py": "def authenticate(): pass", - "authz.py": "def authorize(): pass", - "user.py": "class User: pass", - } - - with store._get_connection() as conn: - for path, content in files.items(): - name = path.split('/')[-1] - conn.execute( - """INSERT INTO files (name, full_path, content, language, mtime) - VALUES (?, ?, ?, ?, ?)""", - (name, path, content, "python", 0.0) - ) - conn.commit() - - engine = HybridSearchEngine() - - # Test exact mode - exact = engine.search(db_path, "authenticate", enable_fuzzy=False) - assert isinstance(exact, list) - - # Test fuzzy mode - fuzzy = engine.search(db_path, "authenticate", enable_fuzzy=True) - assert isinstance(fuzzy, list) - - # Test hybrid mode (default) - hybrid = engine.search(db_path, "authenticate") - assert isinstance(hybrid, list) - - finally: - if store: - store.close() - if db_path.exists(): - db_path.unlink() - - - -class TestHybridSearchWithVectorMock: - """Tests for hybrid search with mocked vector search.""" - - @pytest.fixture - def mock_vector_db(self): - """Create database with vector search mocked.""" - with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: - db_path = Path(f.name) - - store = DirIndexStore(db_path) - store.initialize() - - # Index sample files - files = { - "auth/login.py": "def 
login_user(username, password): authenticate()", - "auth/logout.py": "def logout_user(session): cleanup_session()", - "user/profile.py": "class UserProfile: def get_data(): pass" - } - - with store._get_connection() as conn: - for path, content in files.items(): - name = path.split('/')[-1] - conn.execute( - """INSERT INTO files (name, full_path, content, language, mtime) - VALUES (?, ?, ?, ?, ?)""", - (name, path, content, "python", 0.0) - ) - conn.commit() - - yield db_path - store.close() - - if db_path.exists(): - db_path.unlink() - - def test_hybrid_with_vector_enabled(self, mock_vector_db): - """Test hybrid search with vector search enabled (mocked).""" - from unittest.mock import patch, MagicMock - - # Mock the vector search to return fake results - mock_vector_results = [ - SearchResult(path="auth/login.py", score=0.95, content_snippet="login"), - SearchResult(path="user/profile.py", score=0.75, content_snippet="profile") - ] - - engine = HybridSearchEngine() - - # Mock vector search method if it exists - with patch.object(engine, '_search_vector', return_value=mock_vector_results) if hasattr(engine, '_search_vector') else patch('codexlens.search.hybrid_search.vector_search', return_value=mock_vector_results): - results = engine.search( - mock_vector_db, - "login", - limit=10, - enable_fuzzy=True, - enable_vector=True # ENABLE vector search - ) - - # Should get results from RRF fusion of exact + fuzzy + vector - assert isinstance(results, list) - assert len(results) > 0, "Hybrid search with vector should return results" - - # Results should have fusion scores - for result in results: - assert hasattr(result, 'score') - assert result.score > 0 # RRF fusion scores are positive - - -class TestHybridSearchAdaptiveWeights: - """Integration tests for adaptive RRF weights + reranking gating.""" - - def test_adaptive_weights_code_query(self): - """Exact weight should dominate for code-like queries.""" - from unittest.mock import patch - - engine = 
HybridSearchEngine() - - results_map = { - "exact": [SearchResult(path="a.py", score=10.0, excerpt="a")], - "fuzzy": [SearchResult(path="b.py", score=9.0, excerpt="b")], - "vector": [SearchResult(path="c.py", score=0.9, excerpt="c")], - } - - captured = {} - from codexlens.search import ranking as ranking_module - - def capture_rrf(map_in, weights_in, k=60): - captured["weights"] = dict(weights_in) - return ranking_module.reciprocal_rank_fusion(map_in, weights_in, k=k) - - with patch.object(HybridSearchEngine, "_search_parallel", return_value=results_map), patch( - "codexlens.search.hybrid_search.reciprocal_rank_fusion", - side_effect=capture_rrf, - ): - engine.search(Path("dummy.db"), "def authenticate", enable_vector=True) - - assert captured["weights"]["exact"] > 0.4 - - def test_adaptive_weights_nl_query(self): - """Vector weight should dominate for natural-language queries.""" - from unittest.mock import patch - - engine = HybridSearchEngine() - - results_map = { - "exact": [SearchResult(path="a.py", score=10.0, excerpt="a")], - "fuzzy": [SearchResult(path="b.py", score=9.0, excerpt="b")], - "vector": [SearchResult(path="c.py", score=0.9, excerpt="c")], - } - - captured = {} - from codexlens.search import ranking as ranking_module - - def capture_rrf(map_in, weights_in, k=60): - captured["weights"] = dict(weights_in) - return ranking_module.reciprocal_rank_fusion(map_in, weights_in, k=k) - - with patch.object(HybridSearchEngine, "_search_parallel", return_value=results_map), patch( - "codexlens.search.hybrid_search.reciprocal_rank_fusion", - side_effect=capture_rrf, - ): - engine.search(Path("dummy.db"), "how to handle user login", enable_vector=True) - - assert captured["weights"]["vector"] > 0.6 - - def test_default_engine_weights_keep_lsp_graph_backend_available(self): - """Legacy public defaults should not discard LSP graph fusion weights internally.""" - from unittest.mock import patch - - engine = HybridSearchEngine() - - results_map = { - "exact": 
[SearchResult(path="a.py", score=10.0, excerpt="a")], - "fuzzy": [SearchResult(path="b.py", score=9.0, excerpt="b")], - "vector": [SearchResult(path="c.py", score=0.9, excerpt="c")], - "lsp_graph": [SearchResult(path="d.py", score=0.8, excerpt="d")], - } - - captured = {} - from codexlens.search import ranking as ranking_module - - def capture_rrf(map_in, weights_in, k=60): - captured["weights"] = dict(weights_in) - return ranking_module.reciprocal_rank_fusion(map_in, weights_in, k=k) - - with patch.object(HybridSearchEngine, "_search_parallel", return_value=results_map), patch( - "codexlens.search.hybrid_search.reciprocal_rank_fusion", - side_effect=capture_rrf, - ): - engine.search(Path("dummy.db"), "auth flow", enable_vector=True, enable_lsp_graph=True) - - assert engine.weights == HybridSearchEngine.DEFAULT_WEIGHTS - assert "lsp_graph" in captured["weights"] - assert captured["weights"]["lsp_graph"] > 0.0 - - def test_reranking_enabled(self, tmp_path): - """Reranking runs only when explicitly enabled via config.""" - from unittest.mock import patch - - results_map = { - "exact": [SearchResult(path="a.py", score=10.0, excerpt="a")], - "fuzzy": [SearchResult(path="b.py", score=9.0, excerpt="b")], - "vector": [SearchResult(path="c.py", score=0.9, excerpt="c")], - } - - class DummyEmbedder: - def embed(self, texts): - if isinstance(texts, str): - texts = [texts] - return [[1.0, 0.0] for _ in texts] - - # Disabled: should not invoke rerank_results - config_off = Config(data_dir=tmp_path / "off", enable_reranking=False) - engine_off = HybridSearchEngine(config=config_off, embedder=DummyEmbedder()) - - with patch.object(HybridSearchEngine, "_search_parallel", return_value=results_map), patch( - "codexlens.search.hybrid_search.rerank_results" - ) as rerank_mock: - engine_off.search(Path("dummy.db"), "query", enable_vector=True) - rerank_mock.assert_not_called() - - # Enabled: should invoke rerank_results once - config_on = Config(data_dir=tmp_path / "on", 
enable_reranking=True, reranking_top_k=10) - engine_on = HybridSearchEngine(config=config_on, embedder=DummyEmbedder()) - - with patch.object(HybridSearchEngine, "_search_parallel", return_value=results_map), patch( - "codexlens.search.hybrid_search.rerank_results", - side_effect=lambda q, r, e, top_k=50: r, - ) as rerank_mock: - engine_on.search(Path("dummy.db"), "query", enable_vector=True) - assert rerank_mock.call_count == 1 - - def test_cross_encoder_reranking_enabled(self, tmp_path): - """Cross-encoder stage runs only when explicitly enabled via config.""" - from unittest.mock import patch - - results_map = { - "exact": [SearchResult(path="a.py", score=10.0, excerpt="a")], - "fuzzy": [SearchResult(path="b.py", score=9.0, excerpt="b")], - "vector": [SearchResult(path="c.py", score=0.9, excerpt="c")], - } - - class DummyEmbedder: - def embed(self, texts): - if isinstance(texts, str): - texts = [texts] - return [[1.0, 0.0] for _ in texts] - - class DummyReranker: - def score_pairs(self, pairs, batch_size=32): - return [0.0 for _ in pairs] - - config = Config( - data_dir=tmp_path / "ce", - enable_reranking=True, - enable_cross_encoder_rerank=True, - reranker_top_k=10, - ) - engine = HybridSearchEngine(config=config, embedder=DummyEmbedder()) - - with patch.object(HybridSearchEngine, "_search_parallel", return_value=results_map), patch( - "codexlens.search.hybrid_search.rerank_results", - side_effect=lambda q, r, e, top_k=50: r, - ) as rerank_mock, patch.object( - HybridSearchEngine, - "_get_cross_encoder_reranker", - return_value=DummyReranker(), - ) as get_ce_mock, patch( - "codexlens.search.hybrid_search.cross_encoder_rerank", - side_effect=lambda q, r, ce, top_k=50: r, - ) as ce_mock: - engine.search(Path("dummy.db"), "query", enable_vector=True) - assert rerank_mock.call_count == 1 - assert get_ce_mock.call_count == 1 - assert ce_mock.call_count == 1 diff --git a/codex-lens/tests/test_hybrid_search_reranker_backend.py 
b/codex-lens/tests/test_hybrid_search_reranker_backend.py deleted file mode 100644 index 1e832640..00000000 --- a/codex-lens/tests/test_hybrid_search_reranker_backend.py +++ /dev/null @@ -1,192 +0,0 @@ -"""Tests for HybridSearchEngine reranker backend selection.""" - -from __future__ import annotations - -import pytest - -from codexlens.config import Config -from codexlens.search.hybrid_search import HybridSearchEngine - - -def test_get_cross_encoder_reranker_uses_factory_backend_legacy( - monkeypatch: pytest.MonkeyPatch, - tmp_path, -) -> None: - calls: dict[str, object] = {} - - def fake_check_reranker_available(backend: str): - calls["check_backend"] = backend - return True, None - - sentinel = object() - - def fake_get_reranker(*, backend: str, model_name=None, device=None, **kwargs): - calls["get_args"] = { - "backend": backend, - "model_name": model_name, - "device": device, - "kwargs": kwargs, - } - return sentinel - - monkeypatch.setattr( - "codexlens.semantic.reranker.check_reranker_available", - fake_check_reranker_available, - ) - monkeypatch.setattr( - "codexlens.semantic.reranker.get_reranker", - fake_get_reranker, - ) - - config = Config( - data_dir=tmp_path / "legacy", - enable_reranking=True, - enable_cross_encoder_rerank=True, - reranker_backend="legacy", - reranker_model="dummy-model", - ) - engine = HybridSearchEngine(config=config) - - reranker = engine._get_cross_encoder_reranker() - assert reranker is sentinel - assert calls["check_backend"] == "legacy" - - get_args = calls["get_args"] - assert isinstance(get_args, dict) - assert get_args["backend"] == "legacy" - assert get_args["model_name"] == "dummy-model" - assert get_args["device"] is None - - -def test_get_cross_encoder_reranker_uses_factory_backend_onnx_gpu_flag( - monkeypatch: pytest.MonkeyPatch, - tmp_path, -) -> None: - calls: dict[str, object] = {} - - def fake_check_reranker_available(backend: str): - calls["check_backend"] = backend - return True, None - - sentinel = object() - - 
def fake_get_reranker(*, backend: str, model_name=None, device=None, **kwargs): - calls["get_args"] = { - "backend": backend, - "model_name": model_name, - "device": device, - "kwargs": kwargs, - } - return sentinel - - monkeypatch.setattr( - "codexlens.semantic.reranker.check_reranker_available", - fake_check_reranker_available, - ) - monkeypatch.setattr( - "codexlens.semantic.reranker.get_reranker", - fake_get_reranker, - ) - - config = Config( - data_dir=tmp_path / "onnx", - enable_reranking=True, - enable_cross_encoder_rerank=True, - reranker_backend="onnx", - embedding_use_gpu=True, - reranker_use_gpu=False, - ) - engine = HybridSearchEngine(config=config) - - reranker = engine._get_cross_encoder_reranker() - assert reranker is sentinel - assert calls["check_backend"] == "onnx" - - get_args = calls["get_args"] - assert isinstance(get_args, dict) - assert get_args["backend"] == "onnx" - assert get_args["model_name"] is None - assert get_args["device"] is None - assert get_args["kwargs"]["use_gpu"] is False - - -def test_get_cross_encoder_reranker_uses_cpu_device_for_legacy_when_reranker_gpu_disabled( - monkeypatch: pytest.MonkeyPatch, - tmp_path, -) -> None: - calls: dict[str, object] = {} - - def fake_check_reranker_available(backend: str): - calls["check_backend"] = backend - return True, None - - sentinel = object() - - def fake_get_reranker(*, backend: str, model_name=None, device=None, **kwargs): - calls["get_args"] = { - "backend": backend, - "model_name": model_name, - "device": device, - "kwargs": kwargs, - } - return sentinel - - monkeypatch.setattr( - "codexlens.semantic.reranker.check_reranker_available", - fake_check_reranker_available, - ) - monkeypatch.setattr( - "codexlens.semantic.reranker.get_reranker", - fake_get_reranker, - ) - - config = Config( - data_dir=tmp_path / "legacy-cpu", - enable_reranking=True, - enable_cross_encoder_rerank=True, - reranker_backend="legacy", - reranker_model="dummy-model", - embedding_use_gpu=True, - 
reranker_use_gpu=False, - ) - engine = HybridSearchEngine(config=config) - - reranker = engine._get_cross_encoder_reranker() - assert reranker is sentinel - assert calls["check_backend"] == "legacy" - - get_args = calls["get_args"] - assert isinstance(get_args, dict) - assert get_args["backend"] == "legacy" - assert get_args["model_name"] == "dummy-model" - assert get_args["device"] == "cpu" - - -def test_get_cross_encoder_reranker_returns_none_when_backend_unavailable( - monkeypatch: pytest.MonkeyPatch, - tmp_path, -) -> None: - def fake_check_reranker_available(backend: str): - return False, "missing deps" - - def fake_get_reranker(*args, **kwargs): - raise AssertionError("get_reranker should not be called when backend is unavailable") - - monkeypatch.setattr( - "codexlens.semantic.reranker.check_reranker_available", - fake_check_reranker_available, - ) - monkeypatch.setattr( - "codexlens.semantic.reranker.get_reranker", - fake_get_reranker, - ) - - config = Config( - data_dir=tmp_path / "unavailable", - enable_reranking=True, - enable_cross_encoder_rerank=True, - reranker_backend="onnx", - ) - engine = HybridSearchEngine(config=config) - - assert engine._get_cross_encoder_reranker() is None diff --git a/codex-lens/tests/test_hybrid_search_unit.py b/codex-lens/tests/test_hybrid_search_unit.py deleted file mode 100644 index 5c485291..00000000 --- a/codex-lens/tests/test_hybrid_search_unit.py +++ /dev/null @@ -1,635 +0,0 @@ -"""Unit tests for HybridSearchEngine - parallel search and RRF fusion. 
- -Tests cover: -- search: exact only, fuzzy enabled, vector enabled, pure vector mode -- search: RRF fusion, empty query, no results, reranking, category filtering -- _search_parallel: parallel backend execution -- _search_lsp_graph: LSP graph expansion with seeds, vector-to-FTS fallback -""" - -from __future__ import annotations - -import tempfile -from pathlib import Path -from typing import Dict, List -from unittest.mock import MagicMock, Mock, patch, PropertyMock - -import pytest - -from codexlens.config import Config -from codexlens.entities import SearchResult -from codexlens.search.hybrid_search import HybridSearchEngine - - -# ============================================================================= -# Test Fixtures -# ============================================================================= - - -@pytest.fixture -def temp_paths(): - """Create temporary directory structure with a mock index.""" - tmpdir = tempfile.TemporaryDirectory(ignore_cleanup_errors=True) - root = Path(tmpdir.name) - # Create a non-empty index file to pass the empty-file guard - index_path = root / "_index.db" - index_path.write_bytes(b"\x00" * 100) - yield root - try: - tmpdir.cleanup() - except (PermissionError, OSError): - pass - - -@pytest.fixture -def mock_config(): - """Create mock config for hybrid search.""" - config = MagicMock(spec=Config) - config.embedding_use_gpu = False - config.enable_reranking = False - config.enable_cross_encoder_rerank = False - config.symbol_boost_factor = 1.5 - config.fusion_method = "rrf" - config.rrf_k = 60 - config.enable_category_filter = True - return config - - -@pytest.fixture -def sample_results() -> List[SearchResult]: - """Create sample search results.""" - return [ - SearchResult( - path="auth.py", - score=0.9, - excerpt="def authenticate(user):", - symbol_name="authenticate", - symbol_kind="function", - ), - SearchResult( - path="login.py", - score=0.7, - excerpt="class LoginHandler:", - symbol_name="LoginHandler", - 
symbol_kind="class", - ), - ] - - -# ============================================================================= -# Tests: search with different backends -# ============================================================================= - - -class TestHybridSearchBackends: - """Tests for HybridSearchEngine.search() backend configurations.""" - - def test_search_exact_only(self, temp_paths, mock_config): - """Search with only exact FTS backend.""" - engine = HybridSearchEngine(config=mock_config) - index_path = temp_paths / "_index.db" - - with patch.object(engine, "_search_parallel") as mock_parallel: - mock_parallel.return_value = { - "exact": [ - SearchResult(path="a.py", score=10.0, excerpt="result"), - ] - } - - results = engine.search( - index_path, "test query", - enable_fuzzy=False, enable_vector=False, - ) - - assert len(results) == 1 - # Verify only exact backend was requested - call_args = mock_parallel.call_args - backends = call_args[0][2] # 3rd positional arg - assert "exact" in backends - assert "fuzzy" not in backends - assert "vector" not in backends - - def test_search_fuzzy_enabled(self, temp_paths, mock_config): - """Search with exact + fuzzy backends.""" - engine = HybridSearchEngine(config=mock_config) - index_path = temp_paths / "_index.db" - - with patch.object(engine, "_search_parallel") as mock_parallel: - mock_parallel.return_value = { - "exact": [SearchResult(path="a.py", score=10.0, excerpt="exact")], - "fuzzy": [SearchResult(path="b.py", score=8.0, excerpt="fuzzy")], - } - - results = engine.search( - index_path, "test_query", - enable_fuzzy=True, enable_vector=False, - ) - - assert len(results) >= 1 - backends = mock_parallel.call_args[0][2] - assert "exact" in backends - assert "fuzzy" in backends - - def test_search_vector_enabled(self, temp_paths, mock_config): - """Search with exact + fuzzy + vector backends.""" - engine = HybridSearchEngine(config=mock_config) - index_path = temp_paths / "_index.db" - - with patch.object(engine, 
"_search_parallel") as mock_parallel: - mock_parallel.return_value = { - "exact": [SearchResult(path="a.py", score=10.0, excerpt="exact")], - "vector": [SearchResult(path="c.py", score=0.85, excerpt="vector")], - } - - results = engine.search( - index_path, "test_query", - enable_fuzzy=False, enable_vector=True, - ) - - backends = mock_parallel.call_args[0][2] - assert "exact" in backends - assert "vector" in backends - - def test_search_lexical_priority_query_skips_vector_backend(self, temp_paths, mock_config): - """Config/env/factory queries should stay lexical-first in hybrid mode.""" - engine = HybridSearchEngine(config=mock_config) - index_path = temp_paths / "_index.db" - - with patch.object(engine, "_search_parallel") as mock_parallel: - mock_parallel.return_value = { - "exact": [SearchResult(path="config.py", score=10.0, excerpt="exact")], - "fuzzy": [SearchResult(path="env_config.py", score=8.0, excerpt="fuzzy")], - } - - results = engine.search( - index_path, - "embedding backend fastembed local litellm api config", - enable_fuzzy=True, - enable_vector=True, - ) - - assert len(results) >= 1 - backends = mock_parallel.call_args[0][2] - assert "exact" in backends - assert "fuzzy" in backends - assert "vector" not in backends - - def test_search_pure_vector(self, temp_paths, mock_config): - """Pure vector mode should only use vector backend.""" - engine = HybridSearchEngine(config=mock_config) - mock_config.enable_category_filter = False - index_path = temp_paths / "_index.db" - - with patch.object(engine, "_search_parallel") as mock_parallel: - mock_parallel.return_value = { - "vector": [SearchResult(path="a.py", score=0.9, excerpt="vec")], - } - - results = engine.search( - index_path, "semantic query", - enable_vector=True, pure_vector=True, - ) - - backends = mock_parallel.call_args[0][2] - assert "vector" in backends - assert "exact" not in backends - - -# ============================================================================= -# Tests: search 
fusion and post-processing -# ============================================================================= - - -class TestHybridSearchFusion: - """Tests for RRF fusion, empty query, no results, reranking, filtering.""" - - def test_search_rrf_fusion(self, temp_paths, mock_config): - """Results from multiple backends should be fused via RRF.""" - engine = HybridSearchEngine(config=mock_config) - index_path = temp_paths / "_index.db" - - with patch.object(engine, "_search_parallel") as mock_parallel: - mock_parallel.return_value = { - "exact": [ - SearchResult(path="a.py", score=10.0, excerpt="exact a"), - SearchResult(path="b.py", score=5.0, excerpt="exact b"), - ], - "vector": [ - SearchResult(path="b.py", score=0.9, excerpt="vector b"), - SearchResult(path="c.py", score=0.8, excerpt="vector c"), - ], - } - - results = engine.search( - index_path, "test", - enable_fuzzy=False, enable_vector=True, - ) - - # b.py appears in both sources - should have high fusion score - assert any(r.path == "b.py" for r in results) - - def test_search_empty_query(self, temp_paths, mock_config): - """Empty query should still execute (handled gracefully).""" - engine = HybridSearchEngine(config=mock_config) - index_path = temp_paths / "_index.db" - - with patch.object(engine, "_search_parallel") as mock_parallel: - mock_parallel.return_value = {"exact": []} - - results = engine.search(index_path, "", enable_fuzzy=False) - - assert results == [] - - def test_search_no_results(self, temp_paths, mock_config): - """All backends returning empty should produce empty results.""" - engine = HybridSearchEngine(config=mock_config) - index_path = temp_paths / "_index.db" - - with patch.object(engine, "_search_parallel") as mock_parallel: - mock_parallel.return_value = { - "exact": [], - "fuzzy": [], - } - - results = engine.search(index_path, "nonexistent") - - assert results == [] - - def test_search_reranking(self, temp_paths, mock_config): - """Reranking should be applied when config enables 
it.""" - mock_config.enable_reranking = True - mock_config.enable_cross_encoder_rerank = False - mock_config.reranking_top_k = 50 - engine = HybridSearchEngine(config=mock_config) - index_path = temp_paths / "_index.db" - - mock_embedder = MagicMock() - mock_embedder.embed_single.return_value = [0.1] * 128 - mock_embedder.embed.return_value = [[0.1] * 128] - engine.embedder = mock_embedder - - with patch.object(engine, "_search_parallel") as mock_parallel: - mock_parallel.return_value = { - "exact": [SearchResult(path="a.py", score=10.0, excerpt="code")], - } - - with patch("codexlens.search.hybrid_search.rerank_results") as mock_rerank: - mock_rerank.return_value = [ - SearchResult(path="a.py", score=0.85, excerpt="code"), - ] - results = engine.search(index_path, "query", enable_fuzzy=False) - - mock_rerank.assert_called_once() - - def test_search_lexical_priority_query_skips_expensive_reranking(self, temp_paths, mock_config): - """Lexical-priority queries should bypass embedder and cross-encoder reranking.""" - mock_config.enable_reranking = True - mock_config.enable_cross_encoder_rerank = True - mock_config.reranking_top_k = 50 - mock_config.reranker_top_k = 20 - engine = HybridSearchEngine(config=mock_config) - index_path = temp_paths / "_index.db" - - with patch.object(engine, "_search_parallel") as mock_parallel: - mock_parallel.return_value = { - "exact": [SearchResult(path="config.py", score=10.0, excerpt="code")], - "fuzzy": [SearchResult(path="env_config.py", score=9.0, excerpt="env vars")], - } - - with patch("codexlens.search.hybrid_search.rerank_results") as mock_rerank, patch( - "codexlens.search.hybrid_search.cross_encoder_rerank" - ) as mock_cross_encoder, patch.object( - engine, - "_get_cross_encoder_reranker", - ) as mock_get_reranker: - results = engine.search( - index_path, - "get_reranker factory onnx backend selection", - enable_fuzzy=True, - enable_vector=True, - ) - - assert len(results) >= 1 - mock_rerank.assert_not_called() - 
mock_cross_encoder.assert_not_called() - mock_get_reranker.assert_not_called() - - def test_search_category_filtering(self, temp_paths, mock_config): - """Category filtering should separate code/doc results by intent.""" - mock_config.enable_category_filter = True - engine = HybridSearchEngine(config=mock_config) - index_path = temp_paths / "_index.db" - - with patch.object(engine, "_search_parallel") as mock_parallel: - mock_parallel.return_value = { - "exact": [ - SearchResult(path="auth.py", score=10.0, excerpt="def auth"), - SearchResult(path="README.md", score=8.0, excerpt="docs"), - ], - } - - # Keyword-like query should filter to code - results = engine.search( - index_path, "AuthManager", - enable_fuzzy=False, - ) - - paths = [r.path for r in results] - # Code files should remain, doc files filtered for KEYWORD intent - assert "auth.py" in paths - - -# ============================================================================= -# Tests: _search_parallel -# ============================================================================= - - -class TestSearchParallel: - """Tests for _search_parallel() parallel backend execution.""" - - def test_search_parallel_backends(self, temp_paths, mock_config): - """Parallel execution should run all requested backends.""" - engine = HybridSearchEngine(config=mock_config) - index_path = temp_paths / "_index.db" - - with patch.object(engine, "_search_exact") as mock_exact, \ - patch.object(engine, "_search_fuzzy") as mock_fuzzy: - mock_exact.return_value = [ - SearchResult(path="a.py", score=10.0, excerpt="exact"), - ] - mock_fuzzy.return_value = [ - SearchResult(path="b.py", score=8.0, excerpt="fuzzy"), - ] - - results_map = engine._search_parallel( - index_path, "query", - backends={"exact": True, "fuzzy": True}, - limit=10, - ) - - assert "exact" in results_map - assert "fuzzy" in results_map - mock_exact.assert_called_once() - mock_fuzzy.assert_called_once() - - -class TestCentralizedMetadataFetch: - """Tests for 
centralized metadata retrieval helpers.""" - - def test_fetch_from_vector_meta_store_clamps_negative_scores(self, temp_paths, mock_config, monkeypatch): - engine = HybridSearchEngine(config=mock_config) - - class FakeMetaStore: - def __init__(self, _path): - pass - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc, tb): - return False - - def get_chunks_by_ids(self, _chunk_ids, category=None): - assert category is None - return [ - { - "chunk_id": 7, - "file_path": "src/app.py", - "content": "def app(): pass", - "metadata": {}, - "start_line": 1, - "end_line": 1, - } - ] - - import codexlens.storage.vector_meta_store as vector_meta_store - - monkeypatch.setattr(vector_meta_store, "VectorMetadataStore", FakeMetaStore) - - results = engine._fetch_from_vector_meta_store( - temp_paths / "_vectors_meta.db", - [7], - {7: -0.01}, - ) - - assert len(results) == 1 - assert results[0].path == "src/app.py" - assert results[0].score == 0.0 - - -class TestCentralizedVectorCaching: - """Tests for centralized vector search runtime caches.""" - - def test_search_vector_centralized_reuses_cached_resources( - self, - temp_paths, - mock_config, - ): - engine = HybridSearchEngine(config=mock_config) - hnsw_path = temp_paths / "_vectors.hnsw" - hnsw_path.write_bytes(b"hnsw") - - vector_store_opened: List[Path] = [] - - class FakeVectorStore: - def __init__(self, path): - vector_store_opened.append(Path(path)) - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc, tb): - return False - - def get_model_config(self): - return { - "backend": "fastembed", - "model_name": "BAAI/bge-small-en-v1.5", - "model_profile": "fast", - "embedding_dim": 384, - } - - class FakeEmbedder: - embedding_dim = 384 - - def __init__(self): - self.embed_calls: List[str] = [] - - def embed_single(self, query): - self.embed_calls.append(query) - return [0.1, 0.2, 0.3] - - class FakeAnnIndex: - def __init__(self): - self.load_calls = 0 - self.search_calls = 0 - - def 
load(self): - self.load_calls += 1 - return True - - def count(self): - return 3 - - def search(self, _query_vec, top_k): - self.search_calls += 1 - assert top_k == 10 - return [7], [0.2] - - fake_embedder = FakeEmbedder() - fake_ann_index = FakeAnnIndex() - - with patch("codexlens.semantic.vector_store.VectorStore", FakeVectorStore), patch( - "codexlens.semantic.factory.get_embedder", - return_value=fake_embedder, - ) as mock_get_embedder, patch( - "codexlens.semantic.ann_index.ANNIndex.create_central", - return_value=fake_ann_index, - ) as mock_create_central, patch.object( - engine, - "_fetch_chunks_by_ids_centralized", - return_value=[SearchResult(path="src/app.py", score=0.8, excerpt="hit")], - ) as mock_fetch: - first = engine._search_vector_centralized( - temp_paths / "child-a" / "_index.db", - hnsw_path, - "smart search routing", - limit=5, - ) - second = engine._search_vector_centralized( - temp_paths / "child-b" / "_index.db", - hnsw_path, - "smart search routing", - limit=5, - ) - - assert [result.path for result in first] == ["src/app.py"] - assert [result.path for result in second] == ["src/app.py"] - assert vector_store_opened == [temp_paths / "_index.db"] - assert mock_get_embedder.call_count == 1 - assert mock_create_central.call_count == 1 - assert fake_ann_index.load_calls == 1 - assert fake_embedder.embed_calls == ["smart search routing"] - assert fake_ann_index.search_calls == 2 - assert mock_fetch.call_count == 2 - - def test_search_vector_centralized_respects_embedding_use_gpu( - self, - temp_paths, - mock_config, - ): - engine = HybridSearchEngine(config=mock_config) - hnsw_path = temp_paths / "_vectors.hnsw" - hnsw_path.write_bytes(b"hnsw") - - class FakeVectorStore: - def __init__(self, _path): - pass - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc, tb): - return False - - def get_model_config(self): - return { - "backend": "fastembed", - "model_name": "BAAI/bge-small-en-v1.5", - "model_profile": "code", - 
"embedding_dim": 384, - } - - class FakeEmbedder: - embedding_dim = 384 - - def embed_single(self, _query): - return [0.1, 0.2] - - class FakeAnnIndex: - def load(self): - return True - - def count(self): - return 1 - - def search(self, _query_vec, top_k): - assert top_k == 6 - return [9], [0.1] - - with patch("codexlens.semantic.vector_store.VectorStore", FakeVectorStore), patch( - "codexlens.semantic.factory.get_embedder", - return_value=FakeEmbedder(), - ) as mock_get_embedder, patch( - "codexlens.semantic.ann_index.ANNIndex.create_central", - return_value=FakeAnnIndex(), - ), patch.object( - engine, - "_fetch_chunks_by_ids_centralized", - return_value=[SearchResult(path="src/app.py", score=0.9, excerpt="hit")], - ): - results = engine._search_vector_centralized( - temp_paths / "_index.db", - hnsw_path, - "semantic query", - limit=3, - ) - - assert len(results) == 1 - assert mock_get_embedder.call_count == 1 - assert mock_get_embedder.call_args.kwargs == { - "backend": "fastembed", - "profile": "code", - "use_gpu": False, - } - - -# ============================================================================= -# Tests: _search_lsp_graph -# ============================================================================= - - -class TestSearchLspGraph: - """Tests for _search_lsp_graph() LSP graph expansion.""" - - def test_search_lsp_graph(self, temp_paths, mock_config): - """LSP graph search should use seed results for expansion.""" - engine = HybridSearchEngine(config=mock_config) - index_path = temp_paths / "_index.db" - - # When HAS_LSP is False, _search_lsp_graph returns [] - with patch("codexlens.search.hybrid_search.HAS_LSP", False): - results = engine._search_lsp_graph( - index_path, "auth function", limit=5, - ) - assert results == [] - - def test_lsp_fallback_vector_to_fts(self, temp_paths, mock_config): - """When vector seeds fail, should fall back to FTS seeds.""" - engine = HybridSearchEngine(config=mock_config) - index_path = temp_paths / "_index.db" - - 
with patch("codexlens.search.hybrid_search.HAS_LSP", True): - # Mock _search_vector to return empty (no seeds from vector) - with patch.object(engine, "_search_vector", return_value=[]): - # Mock _search_exact to return seeds - with patch.object(engine, "_search_exact") as mock_exact: - mock_exact.return_value = [ - SearchResult( - path="auth.py", score=10.0, - excerpt="def auth():", symbol_name="auth", - start_line=1, end_line=5, - ), - ] - - # Mock the LSP bridge (will fail on import or async) - # The function should attempt FTS fallback before LSP expansion - try: - results = engine._search_lsp_graph( - index_path, "auth", limit=5, - ) - except Exception: - pass # LSP deps may not be available, but FTS fallback was attempted - - # Verify FTS was called as fallback - mock_exact.assert_called_once() diff --git a/codex-lens/tests/test_incremental_indexer.py b/codex-lens/tests/test_incremental_indexer.py deleted file mode 100644 index 34515487..00000000 --- a/codex-lens/tests/test_incremental_indexer.py +++ /dev/null @@ -1,125 +0,0 @@ -"""Incremental Indexer File Event Processing Tests. - -This module tests the file event processing in the incremental indexer, -covering all file system event types (CREATED, MODIFIED, DELETED, MOVED). 
- -Test Coverage: -- CREATED events: New files being indexed -- MODIFIED events: Changed files being re-indexed -- DELETED events: Removed files being handled -- MOVED events: File renames being tracked -- Batch processing of multiple events -""" - -import pytest -from pathlib import Path -from unittest.mock import Mock, patch, MagicMock -import tempfile -import shutil - - -class TestCreatedEvents: - """Test handling of CREATED file events.""" - - def test_new_file_indexed(self): - """Test that newly created files are properly indexed.""" - pytest.skip("Requires incremental indexer fixture") - - def test_created_in_subdirectory(self): - """Test that files created in subdirectories are indexed.""" - pytest.skip("Requires incremental indexer fixture") - - def test_batch_created_events(self): - """Test handling multiple files created simultaneously.""" - pytest.skip("Requires incremental indexer fixture") - - -class TestModifiedEvents: - """Test handling of MODIFIED file events.""" - - def test_file_content_updated(self): - """Test that file content changes trigger re-indexing.""" - pytest.skip("Requires incremental indexer fixture") - - def test_metadata_only_change(self): - """Test handling of metadata-only changes (permissions, etc).""" - pytest.skip("Requires incremental indexer fixture") - - def test_rapid_modifications(self): - """Test handling of rapid successive modifications to same file.""" - pytest.skip("Requires incremental indexer fixture") - - -class TestDeletedEvents: - """Test handling of DELETED file events.""" - - def test_file_removed_from_index(self): - """Test that deleted files are removed from the index.""" - pytest.skip("Requires incremental indexer fixture") - - def test_directory_deleted(self): - """Test handling of directory deletion events.""" - pytest.skip("Requires incremental indexer fixture") - - def test_delete_non_indexed_file(self): - """Test handling deletion of files that were never indexed.""" - pytest.skip("Requires incremental 
indexer fixture") - - -class TestMovedEvents: - """Test handling of MOVED/RENAMED file events.""" - - def test_file_renamed(self): - """Test that renamed files are tracked in the index.""" - pytest.skip("Requires incremental indexer fixture") - - def test_file_moved_to_subdirectory(self): - """Test that files moved to subdirectories are tracked.""" - pytest.skip("Requires incremental indexer fixture") - - def test_file_moved_out_of_watch_root(self): - """Test handling of files moved outside the watch directory.""" - pytest.skip("Requires incremental indexer fixture") - - def test_directory_renamed(self): - """Test handling of directory rename events.""" - pytest.skip("Requires incremental indexer fixture") - - -class TestEventBatching: - """Test batching and deduplication of file events.""" - - def test_duplicate_events_deduplicated(self): - """Test that duplicate events for the same file are deduplicated.""" - pytest.skip("Requires incremental indexer fixture") - - def test_event_ordering_preserved(self): - """Test that events are processed in the correct order.""" - pytest.skip("Requires incremental indexer fixture") - - def test_mixed_event_types_batch(self): - """Test handling a batch with mixed event types.""" - pytest.skip("Requires incremental indexer fixture") - - -class TestErrorHandling: - """Test error handling in file event processing.""" - - def test_unreadable_file_skipped(self): - """Test that unreadable files are handled gracefully.""" - pytest.skip("Requires incremental indexer fixture") - - def test_corrupted_event_continues(self): - """Test that processing continues after a corrupted event.""" - pytest.skip("Requires incremental indexer fixture") - - def test_indexer_error_recovery(self): - """Test recovery from indexer errors during event processing.""" - pytest.skip("Requires incremental indexer fixture") - - -# TODO: Implement actual tests using pytest fixtures and the incremental indexer -# The test infrastructure needs: -# - 
IncrementalIndexer fixture with mock filesystem watcher -# - Temporary directory fixtures for test files -# - Mock event queue for controlled event injection diff --git a/codex-lens/tests/test_incremental_indexing.py b/codex-lens/tests/test_incremental_indexing.py deleted file mode 100644 index dceffb76..00000000 --- a/codex-lens/tests/test_incremental_indexing.py +++ /dev/null @@ -1,512 +0,0 @@ -"""Tests for incremental indexing with mtime tracking (P2). - -Tests mtime-based skip logic, deleted file cleanup, and incremental update workflows. -""" - -import os -import sqlite3 -import tempfile -import time -from datetime import datetime, timedelta -from pathlib import Path - -import pytest - -from codexlens.storage.dir_index import DirIndexStore - -# Check if pytest-benchmark is available -try: - import pytest_benchmark - BENCHMARK_AVAILABLE = True -except ImportError: - BENCHMARK_AVAILABLE = False - - -class TestMtimeTracking: - """Tests for mtime-based file change detection.""" - - @pytest.fixture - def temp_db(self): - """Create temporary database.""" - with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: - db_path = Path(f.name) - yield db_path - if db_path.exists(): - db_path.unlink() - - @pytest.fixture - def temp_dir(self): - """Create temporary directory with test files.""" - with tempfile.TemporaryDirectory() as tmpdir: - temp_path = Path(tmpdir) - - # Create test files - (temp_path / "file1.py").write_text("def function1(): pass") - (temp_path / "file2.py").write_text("def function2(): pass") - (temp_path / "file3.js").write_text("function test() {}") - - yield temp_path - - @pytest.fixture - def index_store(self, temp_db): - """Create DirIndexStore instance.""" - store = DirIndexStore(temp_db) - store.initialize() - yield store - store.close() - - def test_files_table_has_mtime_column(self, index_store): - """Test files table includes mtime column for tracking.""" - with index_store._get_connection() as conn: - cursor = conn.execute("PRAGMA 
table_info(files)") - columns = {row[1]: row[2] for row in cursor.fetchall()} - assert "mtime" in columns or "indexed_at" in columns, \ - "Should have mtime or indexed_at for change detection" - - def test_needs_reindex_new_file(self, index_store, temp_dir): - """Test needs_reindex returns True for new files.""" - file_path = temp_dir / "file1.py" - file_mtime = file_path.stat().st_mtime - - # New file should need indexing - needs_update = self._check_needs_reindex(index_store, str(file_path), file_mtime) - assert needs_update is True, "New file should need indexing" - - def test_needs_reindex_unchanged_file(self, index_store, temp_dir): - """Test needs_reindex returns False for unchanged files.""" - file_path = temp_dir / "file1.py" - file_mtime = file_path.stat().st_mtime - content = file_path.read_text() - - # Index the file - with index_store._get_connection() as conn: - name = file_path.name - conn.execute( - """INSERT INTO files (name, full_path, content, language, mtime) - VALUES (?, ?, ?, ?, ?)""", - (name, str(file_path), content, "python", file_mtime) - ) - conn.commit() - - # Unchanged file should not need reindexing - needs_update = self._check_needs_reindex(index_store, str(file_path), file_mtime) - assert needs_update is False, "Unchanged file should not need reindexing" - - def test_needs_reindex_modified_file(self, index_store, temp_dir): - """Test needs_reindex returns True for modified files.""" - file_path = temp_dir / "file1.py" - original_mtime = file_path.stat().st_mtime - content = file_path.read_text() - - # Index the file - with index_store._get_connection() as conn: - name = file_path.name - conn.execute( - """INSERT INTO files (name, full_path, content, language, mtime) - VALUES (?, ?, ?, ?, ?)""", - (name, str(file_path), content, "python", original_mtime) - ) - conn.commit() - - # Modify the file (update mtime) - time.sleep(0.1) # Ensure mtime changes - file_path.write_text("def modified_function(): pass") - new_mtime = 
file_path.stat().st_mtime - - # Modified file should need reindexing - needs_update = self._check_needs_reindex(index_store, str(file_path), new_mtime) - assert needs_update is True, "Modified file should need reindexing" - assert new_mtime > original_mtime, "Mtime should have increased" - - def _check_needs_reindex(self, index_store, file_path: str, file_mtime: float) -> bool: - """Helper to check if file needs reindexing.""" - with index_store._get_connection() as conn: - cursor = conn.execute( - "SELECT mtime FROM files WHERE full_path = ?", - (file_path,) - ) - result = cursor.fetchone() - - if result is None: - return True # New file - - stored_mtime = result[0] - return file_mtime > stored_mtime - - -class TestIncrementalUpdate: - """Tests for incremental update workflows.""" - - @pytest.fixture - def temp_db(self): - """Create temporary database.""" - with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: - db_path = Path(f.name) - yield db_path - if db_path.exists(): - db_path.unlink() - - @pytest.fixture - def temp_dir(self): - """Create temporary directory with test files.""" - with tempfile.TemporaryDirectory() as tmpdir: - temp_path = Path(tmpdir) - - # Create initial files - for i in range(10): - (temp_path / f"file{i}.py").write_text(f"def function{i}(): pass") - - yield temp_path - - @pytest.fixture - def index_store(self, temp_db): - """Create DirIndexStore instance.""" - store = DirIndexStore(temp_db) - store.initialize() - yield store - store.close() - - def test_incremental_skip_rate(self, index_store, temp_dir): - """Test incremental indexing achieves ≥90% skip rate on unchanged files.""" - # First indexing pass - index all files - files_indexed_first = self._index_directory(index_store, temp_dir) - assert files_indexed_first == 10, "Should index all 10 files initially" - - # Second pass without modifications - should skip most files - files_indexed_second = self._index_directory(index_store, temp_dir) - skip_rate = 1.0 - 
(files_indexed_second / files_indexed_first) - assert skip_rate >= 0.9, f"Skip rate should be ≥90%, got {skip_rate:.1%}" - - def test_incremental_indexes_modified_files(self, index_store, temp_dir): - """Test incremental indexing detects and updates modified files.""" - # Initial indexing - self._index_directory(index_store, temp_dir) - - # Modify 2 files - modified_files = ["file3.py", "file7.py"] - time.sleep(0.1) - for fname in modified_files: - (temp_dir / fname).write_text("def modified(): pass") - - # Re-index - files_indexed = self._index_directory(index_store, temp_dir) - - # Should re-index only modified files - assert files_indexed == len(modified_files), \ - f"Should re-index {len(modified_files)} modified files, got {files_indexed}" - - def test_incremental_indexes_new_files(self, index_store, temp_dir): - """Test incremental indexing detects and indexes new files.""" - # Initial indexing - self._index_directory(index_store, temp_dir) - - # Add new files - new_files = ["new1.py", "new2.py", "new3.py"] - time.sleep(0.1) - for fname in new_files: - (temp_dir / fname).write_text("def new_function(): pass") - - # Re-index - files_indexed = self._index_directory(index_store, temp_dir) - - # Should index new files - assert files_indexed == len(new_files), \ - f"Should index {len(new_files)} new files, got {files_indexed}" - - def _index_directory(self, index_store, directory: Path) -> int: - """Helper to index directory and return count of files indexed.""" - indexed_count = 0 - - for file_path in directory.glob("*.py"): - file_mtime = file_path.stat().st_mtime - content = file_path.read_text() - - # Check if needs indexing - with index_store._get_connection() as conn: - cursor = conn.execute( - "SELECT mtime FROM files WHERE full_path = ?", - (str(file_path),) - ) - result = cursor.fetchone() - - needs_index = (result is None) or (file_mtime > result[0]) - - if needs_index: - # Insert or update - name = file_path.name - conn.execute( - """INSERT OR REPLACE 
INTO files (name, full_path, content, language, mtime) - VALUES (?, ?, ?, ?, ?)""", - (name, str(file_path), content, "python", file_mtime) - ) - conn.commit() - indexed_count += 1 - - return indexed_count - - -class TestDeletedFileCleanup: - """Tests for cleanup of deleted files from index.""" - - @pytest.fixture - def temp_db(self): - """Create temporary database.""" - with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: - db_path = Path(f.name) - yield db_path - if db_path.exists(): - db_path.unlink() - - @pytest.fixture - def index_store(self, temp_db): - """Create DirIndexStore instance.""" - store = DirIndexStore(temp_db) - store.initialize() - yield store - store.close() - - def test_cleanup_deleted_files(self, index_store): - """Test cleanup removes deleted file entries.""" - # Index files that no longer exist - deleted_files = [ - "/deleted/file1.py", - "/deleted/file2.js", - "/deleted/file3.ts" - ] - - with index_store._get_connection() as conn: - for path in deleted_files: - name = path.split('/')[-1] - conn.execute( - """INSERT INTO files (name, full_path, content, language, mtime) - VALUES (?, ?, ?, ?, ?)""", - (name, path, "content", "python", time.time()) - ) - conn.commit() - - # Verify files are in index - cursor = conn.execute("SELECT COUNT(*) FROM files") - assert cursor.fetchone()[0] == len(deleted_files) - - # Run cleanup (manually since files don't exist) - deleted_count = self._cleanup_nonexistent_files(index_store, deleted_files) - - assert deleted_count == len(deleted_files), \ - f"Should remove {len(deleted_files)} deleted files" - - # Verify cleanup worked - with index_store._get_connection() as conn: - cursor = conn.execute("SELECT COUNT(*) FROM files WHERE full_path IN (?, ?, ?)", deleted_files) - assert cursor.fetchone()[0] == 0, "Deleted files should be removed from index" - - def test_cleanup_preserves_existing_files(self, index_store): - """Test cleanup preserves entries for existing files.""" - # Create temporary 
files - with tempfile.TemporaryDirectory() as tmpdir: - temp_path = Path(tmpdir) - existing_files = [ - temp_path / "exists1.py", - temp_path / "exists2.py" - ] - - for fpath in existing_files: - fpath.write_text("content") - - # Index existing and deleted files - all_files = [str(f) for f in existing_files] + ["/deleted/file.py"] - - with index_store._get_connection() as conn: - for path in all_files: - name = path.split('/')[-1] - conn.execute( - """INSERT INTO files (name, full_path, content, language, mtime) - VALUES (?, ?, ?, ?, ?)""", - (name, path, "content", "python", time.time()) - ) - conn.commit() - - # Run cleanup - self._cleanup_nonexistent_files(index_store, ["/deleted/file.py"]) - - # Verify existing files preserved - with index_store._get_connection() as conn: - cursor = conn.execute( - "SELECT COUNT(*) FROM files WHERE full_path IN (?, ?)", - [str(f) for f in existing_files] - ) - assert cursor.fetchone()[0] == len(existing_files), \ - "Existing files should be preserved" - - def _cleanup_nonexistent_files(self, index_store, paths_to_check: list) -> int: - """Helper to cleanup nonexistent files.""" - deleted_count = 0 - - with index_store._get_connection() as conn: - for path in paths_to_check: - if not Path(path).exists(): - conn.execute("DELETE FROM files WHERE full_path = ?", (path,)) - deleted_count += 1 - conn.commit() - - return deleted_count - - -class TestMtimeEdgeCases: - """Tests for edge cases in mtime handling.""" - - @pytest.fixture - def temp_db(self): - """Create temporary database.""" - with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: - db_path = Path(f.name) - yield db_path - if db_path.exists(): - db_path.unlink() - - @pytest.fixture - def index_store(self, temp_db): - """Create DirIndexStore instance.""" - store = DirIndexStore(temp_db) - store.initialize() - yield store - store.close() - - def test_mtime_precision(self, index_store): - """Test mtime comparison handles floating-point precision.""" - file_path = 
"/test/file.py" - mtime1 = time.time() - mtime2 = mtime1 + 1e-6 # Microsecond difference - - with index_store._get_connection() as conn: - name = file_path.split('/')[-1] - conn.execute( - """INSERT INTO files (name, full_path, content, language, mtime) - VALUES (?, ?, ?, ?, ?)""", - (name, file_path, "content", "python", mtime1) - ) - conn.commit() - - # Check if mtime2 is considered newer - cursor = conn.execute("SELECT mtime FROM files WHERE full_path = ?", (file_path,)) - stored_mtime = cursor.fetchone()[0] - - # Should handle precision correctly - assert isinstance(stored_mtime, (int, float)) - - def test_mtime_null_handling(self, index_store): - """Test handling of NULL mtime values (legacy data).""" - file_path = "/test/legacy.py" - - with index_store._get_connection() as conn: - # Insert file without mtime (legacy) - use NULL - name = file_path.split('/')[-1] - conn.execute( - """INSERT INTO files (name, full_path, content, language, mtime) - VALUES (?, ?, ?, ?, NULL)""", - (name, file_path, "content", "python") - ) - conn.commit() - - # Query should handle NULL mtime gracefully - cursor = conn.execute("SELECT mtime FROM files WHERE full_path = ?", (file_path,)) - result = cursor.fetchone() - # mtime should be NULL or have default value - assert result is not None - - def test_future_mtime_handling(self, index_store): - """Test handling of files with future mtime (clock skew).""" - file_path = "/test/future.py" - future_mtime = time.time() + 86400 # 1 day in future - - with index_store._get_connection() as conn: - name = file_path.split('/')[-1] - conn.execute( - """INSERT INTO files (name, full_path, content, language, mtime) - VALUES (?, ?, ?, ?, ?)""", - (name, file_path, "content", "python", future_mtime) - ) - conn.commit() - - # Should store future mtime without errors - cursor = conn.execute("SELECT mtime FROM files WHERE full_path = ?", (file_path,)) - stored_mtime = cursor.fetchone()[0] - assert stored_mtime == future_mtime - - 
-@pytest.mark.benchmark -class TestIncrementalPerformance: - """Performance benchmarks for incremental indexing.""" - - @pytest.fixture - def large_indexed_db(self): - """Create database with many indexed files.""" - with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: - db_path = Path(f.name) - - store = DirIndexStore(db_path) - store.initialize() - - # Index 1000 files - with store._get_connection() as conn: - current_time = time.time() - for i in range(1000): - conn.execute( - """INSERT INTO files (name, full_path, content, language, mtime) - VALUES (?, ?, ?, ?, ?)""", - (f"file{i}.py", f"/test/file{i}.py", f"def func{i}(): pass", "python", current_time) - ) - conn.commit() - - yield db_path - store.close() - - if db_path.exists(): - db_path.unlink() - - def test_skip_rate_benchmark(self, large_indexed_db): - """Benchmark skip rate on large dataset.""" - store = DirIndexStore(large_indexed_db) - store.initialize() - - try: - # Simulate incremental pass - skipped = 0 - total = 1000 - current_time = time.time() - - with store._get_connection() as conn: - for i in range(total): - cursor = conn.execute( - "SELECT mtime FROM files WHERE full_path = ?", - (f"/test/file{i}.py",) - ) - result = cursor.fetchone() - - if result and current_time <= result[0] + 1.0: - skipped += 1 - - skip_rate = skipped / total - assert skip_rate >= 0.9, f"Skip rate should be ≥90%, got {skip_rate:.1%}" - finally: - store.close() - - @pytest.mark.skipif(not BENCHMARK_AVAILABLE, reason="pytest-benchmark not installed") - def test_cleanup_performance(self, large_indexed_db, benchmark): - """Benchmark cleanup of deleted files on large dataset.""" - store = DirIndexStore(large_indexed_db) - store.initialize() - - try: - def cleanup_batch(): - with store._get_connection() as conn: - # Delete 100 files - paths = [f"/test/file{i}.py" for i in range(100)] - placeholders = ",".join("?" 
* len(paths)) - conn.execute(f"DELETE FROM files WHERE full_path IN ({placeholders})", paths) - conn.commit() - - # Should complete in reasonable time - result = benchmark(cleanup_batch) - assert result < 1.0 # Should take <1 second for 100 deletions - finally: - store.close() diff --git a/codex-lens/tests/test_index_status_cli_contract.py b/codex-lens/tests/test_index_status_cli_contract.py deleted file mode 100644 index cac0549c..00000000 --- a/codex-lens/tests/test_index_status_cli_contract.py +++ /dev/null @@ -1,674 +0,0 @@ -import json - -from typer.testing import CliRunner - -import codexlens.cli.commands as commands -from codexlens.cli.commands import app -import codexlens.cli.embedding_manager as embedding_manager -from codexlens.config import Config -from codexlens.entities import SearchResult -from codexlens.search.chain_search import ChainSearchResult, SearchStats - - -def test_index_status_json_preserves_legacy_embeddings_contract( - monkeypatch, - tmp_path, -) -> None: - workspace = tmp_path / "workspace" - workspace.mkdir() - (workspace / "_index.db").touch() - - legacy_summary = { - "total_indexes": 3, - "indexes_with_embeddings": 1, - "total_chunks": 42, - "indexes": [ - { - "project": "child", - "path": str(workspace / "child" / "_index.db"), - "has_embeddings": True, - "total_chunks": 42, - "total_files": 1, - "coverage_percent": 100.0, - } - ], - } - root_status = { - "total_indexes": 3, - "total_files": 2, - "files_with_embeddings": 0, - "files_without_embeddings": 2, - "total_chunks": 0, - "coverage_percent": 0.0, - "indexes_with_embeddings": 1, - "indexes_without_embeddings": 2, - "model_info": None, - "root": { - "index_path": str(workspace / "_index.db"), - "exists": False, - "total_files": 2, - "files_with_embeddings": 0, - "files_without_embeddings": 2, - "total_chunks": 0, - "coverage_percent": 0.0, - "has_embeddings": False, - "storage_mode": "none", - }, - "subtree": { - "total_indexes": 3, - "total_files": 3, - "files_with_embeddings": 
1, - "files_without_embeddings": 2, - "total_chunks": 42, - "coverage_percent": 33.3, - "indexes_with_embeddings": 1, - "indexes_without_embeddings": 2, - }, - "centralized": { - "dense_index_exists": False, - "binary_index_exists": False, - "dense_ready": False, - "binary_ready": False, - "usable": False, - "chunk_metadata_rows": 0, - "binary_vector_rows": 0, - "files_with_embeddings": 0, - }, - } - - monkeypatch.setattr( - embedding_manager, - "get_embeddings_status", - lambda _index_root: {"success": True, "result": root_status}, - ) - monkeypatch.setattr( - embedding_manager, - "get_embedding_stats_summary", - lambda _index_root: {"success": True, "result": legacy_summary}, - ) - monkeypatch.setattr( - commands, - "RegistryStore", - type( - "FakeRegistryStore", - (), - { - "initialize": lambda self: None, - "close": lambda self: None, - }, - ), - ) - monkeypatch.setattr( - commands, - "PathMapper", - type( - "FakePathMapper", - (), - { - "source_to_index_db": lambda self, _target_path: workspace / "_index.db", - }, - ), - ) - - runner = CliRunner() - result = runner.invoke(app, ["index", "status", str(workspace), "--json"]) - - assert result.exit_code == 0, result.output - payload = json.loads(result.stdout) - body = payload["result"] - assert body["embeddings"] == legacy_summary - assert body["embeddings_error"] is None - assert body["embeddings_status"] == root_status - assert body["embeddings_status_error"] is None - assert body["embeddings_summary"] == legacy_summary - - -def test_search_json_preserves_dense_rerank_method_label( - monkeypatch, - tmp_path, -) -> None: - workspace = tmp_path / "workspace" - workspace.mkdir() - - search_result = ChainSearchResult( - query="greet function", - results=[ - SearchResult( - path=str(workspace / "src" / "app.py"), - score=0.97, - excerpt="def greet(name):", - content="def greet(name):\n return f'hello {name}'\n", - ) - ], - symbols=[], - stats=SearchStats(dirs_searched=2, files_matched=1, time_ms=12.5), - ) - 
captured: dict[str, object] = {} - - monkeypatch.setattr(commands.Config, "load", staticmethod(lambda: Config(data_dir=tmp_path / "data"))) - monkeypatch.setattr( - commands, - "RegistryStore", - type( - "FakeRegistryStore", - (), - { - "initialize": lambda self: None, - "close": lambda self: None, - }, - ), - ) - monkeypatch.setattr( - commands, - "PathMapper", - type( - "FakePathMapper", - (), - {}, - ), - ) - - class FakeChainSearchEngine: - def __init__(self, registry, mapper, config=None): - captured["registry"] = registry - captured["mapper"] = mapper - captured["config"] = config - - def search(self, *_args, **_kwargs): - raise AssertionError("dense_rerank should dispatch via cascade_search") - - def cascade_search(self, query, source_path, k=10, options=None, strategy=None): - captured["query"] = query - captured["source_path"] = source_path - captured["limit"] = k - captured["options"] = options - captured["strategy"] = strategy - return search_result - - monkeypatch.setattr(commands, "ChainSearchEngine", FakeChainSearchEngine) - - runner = CliRunner() - result = runner.invoke( - app, - ["search", "greet function", "--path", str(workspace), "--method", "dense_rerank", "--json"], - ) - - assert result.exit_code == 0, result.output - payload = json.loads(result.stdout) - body = payload["result"] - assert body["method"] == "dense_rerank" - assert body["count"] == 1 - assert body["results"][0]["path"] == str(workspace / "src" / "app.py") - assert captured["strategy"] == "dense_rerank" - assert captured["limit"] == 20 - - -def test_search_json_auto_routes_keyword_queries_to_fts( - monkeypatch, - tmp_path, -) -> None: - workspace = tmp_path / "workspace" - workspace.mkdir() - - search_result = ChainSearchResult( - query="windowsHide", - results=[ - SearchResult( - path=str(workspace / "src" / "spawn.ts"), - score=0.91, - excerpt="windowsHide: true", - content="spawn('node', [], { windowsHide: true })", - ) - ], - symbols=[], - stats=SearchStats(dirs_searched=2, 
files_matched=1, time_ms=8.0), - ) - captured: dict[str, object] = {} - - monkeypatch.setattr(commands.Config, "load", staticmethod(lambda: Config(data_dir=tmp_path / "data"))) - monkeypatch.setattr( - commands, - "RegistryStore", - type("FakeRegistryStore", (), {"initialize": lambda self: None, "close": lambda self: None}), - ) - monkeypatch.setattr(commands, "PathMapper", type("FakePathMapper", (), {})) - - class FakeChainSearchEngine: - def __init__(self, registry, mapper, config=None): - captured["config"] = config - - def search(self, query, source_path, options=None): - captured["query"] = query - captured["source_path"] = source_path - captured["options"] = options - return search_result - - def cascade_search(self, *_args, **_kwargs): - raise AssertionError("auto keyword queries should not dispatch to cascade_search") - - monkeypatch.setattr(commands, "ChainSearchEngine", FakeChainSearchEngine) - - runner = CliRunner() - result = runner.invoke( - app, - ["search", "windowsHide", "--path", str(workspace), "--json"], - ) - - assert result.exit_code == 0, result.output - body = json.loads(result.stdout)["result"] - assert body["method"] == "fts" - assert captured["options"].enable_vector is False - assert captured["options"].hybrid_mode is False - - -def test_search_json_auto_routes_mixed_queries_to_hybrid( - monkeypatch, - tmp_path, -) -> None: - workspace = tmp_path / "workspace" - workspace.mkdir() - - search_result = ChainSearchResult( - query="how does my_function work", - results=[ - SearchResult( - path=str(workspace / "src" / "app.py"), - score=0.81, - excerpt="def my_function():", - content="def my_function():\n return 1\n", - ) - ], - symbols=[], - stats=SearchStats(dirs_searched=2, files_matched=1, time_ms=10.0), - ) - captured: dict[str, object] = {} - - monkeypatch.setattr(commands.Config, "load", staticmethod(lambda: Config(data_dir=tmp_path / "data"))) - monkeypatch.setattr( - commands, - "RegistryStore", - type("FakeRegistryStore", (), 
{"initialize": lambda self: None, "close": lambda self: None}), - ) - monkeypatch.setattr(commands, "PathMapper", type("FakePathMapper", (), {})) - - class FakeChainSearchEngine: - def __init__(self, registry, mapper, config=None): - captured["config"] = config - - def search(self, query, source_path, options=None): - captured["query"] = query - captured["source_path"] = source_path - captured["options"] = options - return search_result - - def cascade_search(self, *_args, **_kwargs): - raise AssertionError("mixed auto queries should not dispatch to cascade_search") - - monkeypatch.setattr(commands, "ChainSearchEngine", FakeChainSearchEngine) - - runner = CliRunner() - result = runner.invoke( - app, - ["search", "how does my_function work", "--path", str(workspace), "--json"], - ) - - assert result.exit_code == 0, result.output - body = json.loads(result.stdout)["result"] - assert body["method"] == "hybrid" - assert captured["options"].enable_vector is True - assert captured["options"].hybrid_mode is True - assert captured["options"].enable_cascade is False - - -def test_search_json_auto_routes_generated_artifact_queries_to_fts( - monkeypatch, - tmp_path, -) -> None: - workspace = tmp_path / "workspace" - workspace.mkdir() - - search_result = ChainSearchResult( - query="dist bundle output", - results=[ - SearchResult( - path=str(workspace / "dist" / "bundle.js"), - score=0.77, - excerpt="bundle output", - content="console.log('bundle')", - ) - ], - symbols=[], - stats=SearchStats(dirs_searched=2, files_matched=1, time_ms=9.0), - ) - captured: dict[str, object] = {} - - monkeypatch.setattr(commands.Config, "load", staticmethod(lambda: Config(data_dir=tmp_path / "data"))) - monkeypatch.setattr( - commands, - "RegistryStore", - type("FakeRegistryStore", (), {"initialize": lambda self: None, "close": lambda self: None}), - ) - monkeypatch.setattr(commands, "PathMapper", type("FakePathMapper", (), {})) - - class FakeChainSearchEngine: - def __init__(self, registry, 
mapper, config=None): - captured["config"] = config - - def search(self, query, source_path, options=None): - captured["query"] = query - captured["source_path"] = source_path - captured["options"] = options - return search_result - - def cascade_search(self, *_args, **_kwargs): - raise AssertionError("generated artifact auto queries should not dispatch to cascade_search") - - monkeypatch.setattr(commands, "ChainSearchEngine", FakeChainSearchEngine) - - runner = CliRunner() - result = runner.invoke( - app, - ["search", "dist bundle output", "--path", str(workspace), "--json"], - ) - - assert result.exit_code == 0, result.output - body = json.loads(result.stdout)["result"] - assert body["method"] == "fts" - assert captured["options"].enable_vector is False - assert captured["options"].hybrid_mode is False - - -def test_auto_select_search_method_prefers_fts_for_lexical_config_queries() -> None: - assert commands._auto_select_search_method("embedding backend fastembed local litellm api config") == "fts" - assert commands._auto_select_search_method("get_reranker factory onnx backend selection") == "fts" - assert commands._auto_select_search_method("how to authenticate users safely?") == "dense_rerank" - - -def test_search_json_fts_zero_results_uses_filesystem_fallback( - monkeypatch, - tmp_path, -) -> None: - workspace = tmp_path / "workspace" - workspace.mkdir() - - indexed_result = ChainSearchResult( - query="find_descendant_project_roots", - results=[], - symbols=[], - stats=SearchStats(dirs_searched=3, files_matched=0, time_ms=7.5), - ) - fallback_result = SearchResult( - path=str(workspace / "src" / "registry.py"), - score=1.0, - excerpt="def find_descendant_project_roots(...):", - content=None, - metadata={ - "filesystem_fallback": True, - "backend": "ripgrep-fallback", - "stale_index_suspected": True, - }, - start_line=12, - end_line=12, - ) - captured: dict[str, object] = {"fallback_calls": 0} - - monkeypatch.setattr(commands.Config, "load", 
staticmethod(lambda: Config(data_dir=tmp_path / "data"))) - monkeypatch.setattr( - commands, - "RegistryStore", - type("FakeRegistryStore", (), {"initialize": lambda self: None, "close": lambda self: None}), - ) - monkeypatch.setattr(commands, "PathMapper", type("FakePathMapper", (), {})) - - class FakeChainSearchEngine: - def __init__(self, registry, mapper, config=None): - captured["config"] = config - - def search(self, query, source_path, options=None): - captured["query"] = query - captured["source_path"] = source_path - captured["options"] = options - return indexed_result - - def cascade_search(self, *_args, **_kwargs): - raise AssertionError("fts zero-result queries should not dispatch to cascade_search") - - def fake_fallback(query, source_path, *, limit, config, code_only=False, exclude_extensions=None): - captured["fallback_calls"] = int(captured["fallback_calls"]) + 1 - captured["fallback_query"] = query - captured["fallback_path"] = source_path - captured["fallback_limit"] = limit - captured["fallback_code_only"] = code_only - captured["fallback_exclude_extensions"] = exclude_extensions - return { - "results": [fallback_result], - "time_ms": 2.5, - "fallback": { - "backend": "ripgrep-fallback", - "stale_index_suspected": True, - "reason": "Indexed FTS search returned no results; filesystem fallback used.", - }, - } - - monkeypatch.setattr(commands, "ChainSearchEngine", FakeChainSearchEngine) - monkeypatch.setattr(commands, "_filesystem_fallback_search", fake_fallback) - - runner = CliRunner() - result = runner.invoke( - app, - ["search", "find_descendant_project_roots", "--method", "fts", "--path", str(workspace), "--json"], - ) - - assert result.exit_code == 0, result.output - body = json.loads(result.stdout)["result"] - assert body["method"] == "fts" - assert body["count"] == 1 - assert body["results"][0]["path"] == str(workspace / "src" / "registry.py") - assert body["results"][0]["excerpt"] == "def find_descendant_project_roots(...):" - assert 
body["stats"]["files_matched"] == 1 - assert body["stats"]["time_ms"] == 10.0 - assert body["fallback"] == { - "backend": "ripgrep-fallback", - "stale_index_suspected": True, - "reason": "Indexed FTS search returned no results; filesystem fallback used.", - } - assert captured["fallback_calls"] == 1 - assert captured["fallback_query"] == "find_descendant_project_roots" - assert captured["fallback_path"] == workspace - assert captured["fallback_limit"] == 20 - assert captured["options"].enable_vector is False - assert captured["options"].hybrid_mode is False - - -def test_search_json_hybrid_zero_results_does_not_use_filesystem_fallback( - monkeypatch, - tmp_path, -) -> None: - workspace = tmp_path / "workspace" - workspace.mkdir() - - indexed_result = ChainSearchResult( - query="how does my_function work", - results=[], - symbols=[], - stats=SearchStats(dirs_searched=4, files_matched=0, time_ms=11.0), - ) - captured: dict[str, object] = {"fallback_calls": 0} - - monkeypatch.setattr(commands.Config, "load", staticmethod(lambda: Config(data_dir=tmp_path / "data"))) - monkeypatch.setattr( - commands, - "RegistryStore", - type("FakeRegistryStore", (), {"initialize": lambda self: None, "close": lambda self: None}), - ) - monkeypatch.setattr(commands, "PathMapper", type("FakePathMapper", (), {})) - - class FakeChainSearchEngine: - def __init__(self, registry, mapper, config=None): - captured["config"] = config - - def search(self, query, source_path, options=None): - captured["query"] = query - captured["source_path"] = source_path - captured["options"] = options - return indexed_result - - def cascade_search(self, *_args, **_kwargs): - raise AssertionError("hybrid queries should not dispatch to cascade_search") - - def fake_fallback(*_args, **_kwargs): - captured["fallback_calls"] = int(captured["fallback_calls"]) + 1 - return None - - monkeypatch.setattr(commands, "ChainSearchEngine", FakeChainSearchEngine) - monkeypatch.setattr(commands, "_filesystem_fallback_search", 
fake_fallback) - - runner = CliRunner() - result = runner.invoke( - app, - ["search", "how does my_function work", "--path", str(workspace), "--json"], - ) - - assert result.exit_code == 0, result.output - body = json.loads(result.stdout)["result"] - assert body["method"] == "hybrid" - assert body["count"] == 0 - assert "fallback" not in body - assert body["stats"]["files_matched"] == 0 - assert body["stats"]["time_ms"] == 11.0 - assert captured["fallback_calls"] == 0 - assert captured["options"].enable_vector is True - assert captured["options"].hybrid_mode is True - - -def test_filesystem_fallback_search_prefers_source_definitions_for_keyword_queries( - monkeypatch, - tmp_path, -) -> None: - workspace = tmp_path / "workspace" - workspace.mkdir() - - source_path = workspace / "src" / "registry.py" - test_path = workspace / "tests" / "test_registry.py" - ref_path = workspace / "src" / "chain_search.py" - - match_lines = [ - { - "type": "match", - "data": { - "path": {"text": str(test_path)}, - "lines": {"text": "def test_find_descendant_project_roots_returns_nested_project_roots():\n"}, - "line_number": 12, - }, - }, - { - "type": "match", - "data": { - "path": {"text": str(source_path)}, - "lines": {"text": "def find_descendant_project_roots(self, source_root: Path) -> List[DirMapping]:\n"}, - "line_number": 48, - }, - }, - { - "type": "match", - "data": { - "path": {"text": str(ref_path)}, - "lines": {"text": "descendant_roots = self.registry.find_descendant_project_roots(source_root)\n"}, - "line_number": 91, - }, - }, - ] - - monkeypatch.setattr(commands.shutil, "which", lambda _name: "rg") - monkeypatch.setattr( - commands.subprocess, - "run", - lambda *_args, **_kwargs: type( - "FakeCompletedProcess", - (), - { - "returncode": 0, - "stdout": "\n".join(json.dumps(line) for line in match_lines), - "stderr": "", - }, - )(), - ) - - fallback = commands._filesystem_fallback_search( - "find_descendant_project_roots", - workspace, - limit=5, - 
config=Config(data_dir=tmp_path / "data"), - ) - - assert fallback is not None - assert fallback["fallback"]["backend"] == "ripgrep-fallback" - assert fallback["results"][0].path == str(source_path) - assert fallback["results"][1].path == str(ref_path) - assert fallback["results"][2].path == str(test_path) - assert fallback["results"][0].score > fallback["results"][1].score > fallback["results"][2].score - - -def test_clean_json_reports_partial_success_when_locked_files_remain( - monkeypatch, - tmp_path, -) -> None: - workspace = tmp_path / "workspace" - project_index = tmp_path / "indexes" / "workspace" - project_index.mkdir(parents=True) - (project_index / "_index.db").write_text("db", encoding="utf-8") - locked_path = project_index / "nested" / "_index.db" - locked_path.parent.mkdir(parents=True) - locked_path.write_text("locked", encoding="utf-8") - - captured: dict[str, object] = {} - - class FakePathMapper: - def __init__(self): - self.index_root = tmp_path / "indexes" - - def source_to_index_dir(self, source_path): - captured["mapped_source"] = source_path - return project_index - - class FakeRegistryStore: - def initialize(self): - captured["registry_initialized"] = True - - def unregister_project(self, source_path): - captured["unregistered_project"] = source_path - return True - - def close(self): - captured["registry_closed"] = True - - def fake_remove_tree(target): - captured["removed_target"] = target - return { - "removed": False, - "partial": True, - "locked_paths": [str(locked_path)], - "remaining_path": str(project_index), - "errors": [], - } - - monkeypatch.setattr(commands, "PathMapper", FakePathMapper) - monkeypatch.setattr(commands, "RegistryStore", FakeRegistryStore) - monkeypatch.setattr(commands, "_remove_tree_best_effort", fake_remove_tree) - - runner = CliRunner() - result = runner.invoke(app, ["clean", str(workspace), "--json"]) - - assert result.exit_code == 0, result.output - payload = json.loads(result.stdout) - body = 
payload["result"] - assert payload["success"] is True - assert body["cleaned"] == str(workspace.resolve()) - assert body["index_path"] == str(project_index) - assert body["partial"] is True - assert body["locked_paths"] == [str(locked_path)] - assert body["remaining_path"] == str(project_index) - assert captured["registry_initialized"] is True - assert captured["registry_closed"] is True - assert captured["unregistered_project"] == workspace.resolve() - assert captured["removed_target"] == project_index diff --git a/codex-lens/tests/test_index_tree_ignore_dirs.py b/codex-lens/tests/test_index_tree_ignore_dirs.py deleted file mode 100644 index f9c51773..00000000 --- a/codex-lens/tests/test_index_tree_ignore_dirs.py +++ /dev/null @@ -1,295 +0,0 @@ -from __future__ import annotations - -import json -from pathlib import Path -from unittest.mock import MagicMock - -from codexlens.config import Config -from codexlens.storage.dir_index import DirIndexStore -from codexlens.storage.index_tree import DirBuildResult, IndexTreeBuilder -from codexlens.storage.path_mapper import PathMapper -from codexlens.storage.registry import RegistryStore - - -def _relative_dirs(source_root: Path, dirs_by_depth: dict[int, list[Path]]) -> set[str]: - return { - path.relative_to(source_root).as_posix() - for paths in dirs_by_depth.values() - for path in paths - if path != source_root - } - - -def test_collect_dirs_by_depth_skips_common_build_artifact_dirs(tmp_path: Path) -> None: - src_dir = tmp_path / "src" - src_dir.mkdir() - (src_dir / "app.py").write_text("print('ok')\n", encoding="utf-8") - - for artifact_dir in ["dist", "build", "coverage", ".next", "out", ".turbo", ".parcel-cache", "target"]: - target_dir = tmp_path / artifact_dir - target_dir.mkdir(parents=True, exist_ok=True) - (target_dir / "generated.py").write_text("print('artifact')\n", encoding="utf-8") - - builder = IndexTreeBuilder( - registry=MagicMock(), - mapper=MagicMock(), - config=Config(data_dir=tmp_path / "data"), - 
incremental=False, - ) - - dirs_by_depth = builder._collect_dirs_by_depth(tmp_path) - discovered_dirs = _relative_dirs(tmp_path, dirs_by_depth) - - assert "src" in discovered_dirs - assert "dist" not in discovered_dirs - assert "build" not in discovered_dirs - assert "coverage" not in discovered_dirs - assert ".next" not in discovered_dirs - assert "out" not in discovered_dirs - assert ".turbo" not in discovered_dirs - assert ".parcel-cache" not in discovered_dirs - assert "target" not in discovered_dirs - - -def test_should_index_dir_ignores_transitive_build_only_subtrees(tmp_path: Path) -> None: - package_dir = tmp_path / "package" - dist_dir = package_dir / "dist" - dist_dir.mkdir(parents=True) - (dist_dir / "bundle.py").write_text("print('compiled')\n", encoding="utf-8") - - builder = IndexTreeBuilder( - registry=MagicMock(), - mapper=MagicMock(), - config=Config(data_dir=tmp_path / "data"), - incremental=False, - ) - - assert builder._should_index_dir(package_dir) is False - - -def test_collect_dirs_by_depth_respects_relative_ignore_patterns_from_config(tmp_path: Path) -> None: - src_dir = tmp_path / "frontend" / "src" - src_dir.mkdir(parents=True) - (src_dir / "app.ts").write_text("export const app = 1\n", encoding="utf-8") - - dist_dir = tmp_path / "frontend" / "dist" - dist_dir.mkdir(parents=True) - (dist_dir / "bundle.ts").write_text("export const bundle = 1\n", encoding="utf-8") - - builder = IndexTreeBuilder( - registry=MagicMock(), - mapper=MagicMock(), - config=Config(data_dir=tmp_path / "data", ignore_patterns=["frontend/dist"]), - incremental=False, - ) - - dirs_by_depth = builder._collect_dirs_by_depth(tmp_path) - discovered_dirs = _relative_dirs(tmp_path, dirs_by_depth) - - assert "frontend/src" in discovered_dirs - assert "frontend/dist" not in discovered_dirs - - -def test_iter_source_files_respects_extension_filters_and_relative_patterns(tmp_path: Path) -> None: - frontend_dir = tmp_path / "frontend" - frontend_dir.mkdir() - (frontend_dir / 
"app.ts").write_text("export const app = 1\n", encoding="utf-8") - (frontend_dir / "bundle.min.js").write_text("export const bundle = 1\n", encoding="utf-8") - (frontend_dir / "skip.ts").write_text("export const skip = 1\n", encoding="utf-8") - - builder = IndexTreeBuilder( - registry=MagicMock(), - mapper=MagicMock(), - config=Config( - data_dir=tmp_path / "data", - extension_filters=["*.min.js", "frontend/skip.ts"], - ), - incremental=False, - ) - - source_files = builder._iter_source_files(frontend_dir, source_root=tmp_path) - - assert [path.name for path in source_files] == ["app.ts"] - assert builder._should_index_dir(frontend_dir, source_root=tmp_path) is True - - -def test_builder_loads_saved_ignore_and_extension_filters_by_default(tmp_path: Path, monkeypatch) -> None: - codexlens_home = tmp_path / "codexlens-home" - codexlens_home.mkdir() - (codexlens_home / "settings.json").write_text( - json.dumps( - { - "ignore_patterns": ["frontend/dist"], - "extension_filters": ["*.min.js"], - } - ), - encoding="utf-8", - ) - monkeypatch.setenv("CODEXLENS_DATA_DIR", str(codexlens_home)) - - frontend_dir = tmp_path / "frontend" - frontend_dir.mkdir() - dist_dir = frontend_dir / "dist" - dist_dir.mkdir() - (frontend_dir / "app.ts").write_text("export const app = 1\n", encoding="utf-8") - (frontend_dir / "bundle.min.js").write_text("export const bundle = 1\n", encoding="utf-8") - (dist_dir / "compiled.ts").write_text("export const compiled = 1\n", encoding="utf-8") - - builder = IndexTreeBuilder( - registry=MagicMock(), - mapper=MagicMock(), - config=None, - incremental=False, - ) - - source_files = builder._iter_source_files(frontend_dir, source_root=tmp_path) - dirs_by_depth = builder._collect_dirs_by_depth(tmp_path) - discovered_dirs = _relative_dirs(tmp_path, dirs_by_depth) - - assert [path.name for path in source_files] == ["app.ts"] - assert "frontend/dist" not in discovered_dirs - - -def test_prune_stale_project_dirs_removes_ignored_artifact_mappings(tmp_path: 
Path) -> None: - workspace = tmp_path / "workspace" - src_dir = workspace / "src" - dist_dir = workspace / "dist" - src_dir.mkdir(parents=True) - dist_dir.mkdir(parents=True) - (src_dir / "app.py").write_text("print('ok')\n", encoding="utf-8") - (dist_dir / "bundle.py").write_text("print('artifact')\n", encoding="utf-8") - - mapper = PathMapper(index_root=tmp_path / "indexes") - registry = RegistryStore(db_path=tmp_path / "registry.db") - registry.initialize() - project = registry.register_project(workspace, mapper.source_to_index_dir(workspace)) - registry.register_dir(project.id, workspace, mapper.source_to_index_db(workspace), depth=0) - registry.register_dir(project.id, src_dir, mapper.source_to_index_db(src_dir), depth=1) - registry.register_dir(project.id, dist_dir, mapper.source_to_index_db(dist_dir), depth=1) - - builder = IndexTreeBuilder( - registry=registry, - mapper=mapper, - config=Config(data_dir=tmp_path / "data"), - incremental=False, - ) - - dirs_by_depth = builder._collect_dirs_by_depth(workspace) - pruned = builder._prune_stale_project_dirs( - project_id=project.id, - source_root=workspace, - dirs_by_depth=dirs_by_depth, - ) - - remaining = {mapping.source_path.resolve() for mapping in registry.get_project_dirs(project.id)} - registry.close() - - assert dist_dir.resolve() in pruned - assert workspace.resolve() in remaining - assert src_dir.resolve() in remaining - assert dist_dir.resolve() not in remaining - - -def test_force_full_build_prunes_stale_ignored_mappings(tmp_path: Path) -> None: - workspace = tmp_path / "workspace" - src_dir = workspace / "src" - dist_dir = workspace / "dist" - src_dir.mkdir(parents=True) - dist_dir.mkdir(parents=True) - (src_dir / "app.py").write_text("print('ok')\n", encoding="utf-8") - (dist_dir / "bundle.py").write_text("print('artifact')\n", encoding="utf-8") - - mapper = PathMapper(index_root=tmp_path / "indexes") - registry = RegistryStore(db_path=tmp_path / "registry.db") - registry.initialize() - project = 
registry.register_project(workspace, mapper.source_to_index_dir(workspace)) - registry.register_dir(project.id, workspace, mapper.source_to_index_db(workspace), depth=0) - registry.register_dir(project.id, dist_dir, mapper.source_to_index_db(dist_dir), depth=1) - - builder = IndexTreeBuilder( - registry=registry, - mapper=mapper, - config=Config( - data_dir=tmp_path / "data", - global_symbol_index_enabled=False, - ), - incremental=False, - ) - - def fake_build_level_parallel( - dirs: list[Path], - languages, - workers, - *, - source_root: Path, - project_id: int, - global_index_db_path: Path, - ) -> list[DirBuildResult]: - return [ - DirBuildResult( - source_path=dir_path, - index_path=mapper.source_to_index_db(dir_path), - files_count=1 if dir_path == src_dir else 0, - symbols_count=0, - subdirs=[], - ) - for dir_path in dirs - ] - - builder._build_level_parallel = fake_build_level_parallel # type: ignore[method-assign] - builder._link_children_to_parent = MagicMock() - - build_result = builder.build(workspace, force_full=True, workers=1) - - remaining = {mapping.source_path.resolve() for mapping in registry.get_project_dirs(project.id)} - registry.close() - - assert build_result.total_dirs == 2 - assert workspace.resolve() in remaining - assert src_dir.resolve() in remaining - assert dist_dir.resolve() not in remaining - - -def test_force_full_build_rewrites_directory_db_and_drops_stale_ignored_subdirs( - tmp_path: Path, -) -> None: - project_root = tmp_path / "project" - src_dir = project_root / "src" - build_dir = project_root / "build" - src_dir.mkdir(parents=True) - build_dir.mkdir(parents=True) - (src_dir / "app.py").write_text("print('ok')\n", encoding="utf-8") - (build_dir / "generated.py").write_text("print('artifact')\n", encoding="utf-8") - - mapper = PathMapper(index_root=tmp_path / "indexes") - registry = RegistryStore(db_path=tmp_path / "registry.db") - registry.initialize() - config = Config( - data_dir=tmp_path / "data", - 
global_symbol_index_enabled=False, - ) - - root_index_db = mapper.source_to_index_db(project_root) - with DirIndexStore(root_index_db, config=config) as store: - store.register_subdir( - name="build", - index_path=mapper.source_to_index_db(build_dir), - files_count=1, - ) - - builder = IndexTreeBuilder( - registry=registry, - mapper=mapper, - config=config, - incremental=False, - ) - - build_result = builder.build(project_root, force_full=True, workers=1) - - with DirIndexStore(root_index_db, config=config) as store: - subdir_names = [link.name for link in store.get_subdirs()] - - registry.close() - - assert build_result.total_dirs == 2 - assert subdir_names == ["src"] diff --git a/codex-lens/tests/test_litellm_reranker.py b/codex-lens/tests/test_litellm_reranker.py deleted file mode 100644 index 60c843d8..00000000 --- a/codex-lens/tests/test_litellm_reranker.py +++ /dev/null @@ -1,85 +0,0 @@ -"""Tests for LiteLLMReranker (LLM-based reranking).""" - -from __future__ import annotations - -import sys -import types -from dataclasses import dataclass - -import pytest - -from codexlens.semantic.reranker.litellm_reranker import LiteLLMReranker - - -def _install_dummy_ccw_litellm( - monkeypatch: pytest.MonkeyPatch, *, responses: list[str] -) -> None: - @dataclass(frozen=True, slots=True) - class ChatMessage: - role: str - content: str - - class LiteLLMClient: - def __init__(self, model: str = "default", **kwargs) -> None: - self.model = model - self.kwargs = kwargs - self._responses = list(responses) - self.calls: list[list[ChatMessage]] = [] - - def chat(self, messages, **kwargs): - self.calls.append(list(messages)) - content = self._responses.pop(0) if self._responses else "" - return types.SimpleNamespace(content=content) - - dummy = types.ModuleType("ccw_litellm") - dummy.ChatMessage = ChatMessage - dummy.LiteLLMClient = LiteLLMClient - monkeypatch.setitem(sys.modules, "ccw_litellm", dummy) - - -def test_score_pairs_parses_numbers_and_normalizes_scales( - monkeypatch: 
pytest.MonkeyPatch, -) -> None: - _install_dummy_ccw_litellm(monkeypatch, responses=["0.73", "7", "80"]) - - reranker = LiteLLMReranker(model="dummy") - scores = reranker.score_pairs([("q", "d1"), ("q", "d2"), ("q", "d3")]) - assert scores == pytest.approx([0.73, 0.7, 0.8]) - - -def test_score_pairs_parses_json_score_field(monkeypatch: pytest.MonkeyPatch) -> None: - _install_dummy_ccw_litellm(monkeypatch, responses=['{"score": 0.42}']) - - reranker = LiteLLMReranker(model="dummy") - scores = reranker.score_pairs([("q", "d")]) - assert scores == pytest.approx([0.42]) - - -def test_score_pairs_uses_default_score_on_parse_failure( - monkeypatch: pytest.MonkeyPatch, -) -> None: - _install_dummy_ccw_litellm(monkeypatch, responses=["N/A"]) - - reranker = LiteLLMReranker(model="dummy", default_score=0.123) - scores = reranker.score_pairs([("q", "d")]) - assert scores == pytest.approx([0.123]) - - -def test_rate_limiting_sleeps_between_requests(monkeypatch: pytest.MonkeyPatch) -> None: - _install_dummy_ccw_litellm(monkeypatch, responses=["0.1", "0.2"]) - - reranker = LiteLLMReranker(model="dummy", min_interval_seconds=1.0) - - import codexlens.semantic.reranker.litellm_reranker as litellm_reranker_module - - sleeps: list[float] = [] - times = iter([100.0, 100.0, 100.1, 100.1]) - - monkeypatch.setattr(litellm_reranker_module.time, "monotonic", lambda: next(times)) - monkeypatch.setattr( - litellm_reranker_module.time, "sleep", lambda seconds: sleeps.append(seconds) - ) - - _ = reranker.score_pairs([("q", "d1"), ("q", "d2")]) - assert sleeps == pytest.approx([0.9]) - diff --git a/codex-lens/tests/test_lsp_graph_builder_depth.py b/codex-lens/tests/test_lsp_graph_builder_depth.py deleted file mode 100644 index ab70b770..00000000 --- a/codex-lens/tests/test_lsp_graph_builder_depth.py +++ /dev/null @@ -1,36 +0,0 @@ -from __future__ import annotations - -import asyncio -from unittest.mock import AsyncMock - -import pytest - -from codexlens.hybrid_search.data_structures import 
CodeAssociationGraph, CodeSymbolNode, Range -from codexlens.lsp.lsp_graph_builder import LspGraphBuilder - - -@pytest.mark.asyncio -async def test_lsp_graph_builder_does_not_expand_at_max_depth() -> None: - """Depth semantics: max_depth is the number of hops from seeds.""" - builder = LspGraphBuilder(max_depth=1, max_nodes=10, max_concurrent=1, resolve_symbols=False) - - bridge = AsyncMock() - bridge.get_references.side_effect = RuntimeError("should not call references") - bridge.get_call_hierarchy.side_effect = RuntimeError("should not call call hierarchy") - - node = CodeSymbolNode( - id="x.py:foo:1", - name="foo", - kind="function", - file_path="x.py", - range=Range(start_line=1, start_character=1, end_line=1, end_character=1), - ) - graph = CodeAssociationGraph() - visited: set[str] = set() - sem = asyncio.Semaphore(1) - - # Seeds are depth=0. A node at depth==max_depth should not be expanded. - new_nodes = await builder._expand_node(node, 1, graph, bridge, visited, sem) # type: ignore[attr-defined] - assert new_nodes == [] - assert node.id in visited - diff --git a/codex-lens/tests/test_merkle_detection.py b/codex-lens/tests/test_merkle_detection.py deleted file mode 100644 index e4afdccd..00000000 --- a/codex-lens/tests/test_merkle_detection.py +++ /dev/null @@ -1,100 +0,0 @@ -import time -from pathlib import Path - -from codexlens.config import Config -from codexlens.storage.dir_index import DirIndexStore - - -def _make_merkle_config(tmp_path: Path) -> Config: - data_dir = tmp_path / "data" - return Config( - data_dir=data_dir, - venv_path=data_dir / "venv", - enable_merkle_detection=True, - ) - - -class TestMerkleDetection: - def test_needs_reindex_touch_updates_mtime(self, tmp_path: Path) -> None: - config = _make_merkle_config(tmp_path) - source_dir = tmp_path / "src" - source_dir.mkdir(parents=True, exist_ok=True) - - file_path = source_dir / "a.py" - file_path.write_text("print('hi')\n", encoding="utf-8") - original_content = 
file_path.read_text(encoding="utf-8") - - index_db = tmp_path / "_index.db" - with DirIndexStore(index_db, config=config) as store: - store.add_file( - name=file_path.name, - full_path=file_path, - content=original_content, - language="python", - symbols=[], - ) - - stored_mtime_before = store.get_file_mtime(file_path) - assert stored_mtime_before is not None - - # Touch file without changing content - time.sleep(0.02) - file_path.write_text(original_content, encoding="utf-8") - - assert store.needs_reindex(file_path) is False - - stored_mtime_after = store.get_file_mtime(file_path) - assert stored_mtime_after is not None - assert stored_mtime_after != stored_mtime_before - - current_mtime = file_path.stat().st_mtime - assert abs(stored_mtime_after - current_mtime) <= 0.001 - - def test_parent_root_changes_when_child_changes(self, tmp_path: Path) -> None: - config = _make_merkle_config(tmp_path) - - source_root = tmp_path / "project" - child_dir = source_root / "child" - child_dir.mkdir(parents=True, exist_ok=True) - - child_file = child_dir / "child.py" - child_file.write_text("x = 1\n", encoding="utf-8") - - child_db = tmp_path / "child_index.db" - parent_db = tmp_path / "parent_index.db" - - with DirIndexStore(child_db, config=config) as child_store: - child_store.add_file( - name=child_file.name, - full_path=child_file, - content=child_file.read_text(encoding="utf-8"), - language="python", - symbols=[], - ) - child_root_1 = child_store.update_merkle_root() - assert child_root_1 - - with DirIndexStore(parent_db, config=config) as parent_store: - parent_store.register_subdir(name="child", index_path=child_db, files_count=1) - parent_root_1 = parent_store.update_merkle_root() - assert parent_root_1 - - time.sleep(0.02) - child_file.write_text("x = 2\n", encoding="utf-8") - - with DirIndexStore(child_db, config=config) as child_store: - child_store.add_file( - name=child_file.name, - full_path=child_file, - content=child_file.read_text(encoding="utf-8"), - 
language="python", - symbols=[], - ) - child_root_2 = child_store.update_merkle_root() - assert child_root_2 - assert child_root_2 != child_root_1 - - with DirIndexStore(parent_db, config=config) as parent_store: - parent_root_2 = parent_store.update_merkle_root() - assert parent_root_2 - assert parent_root_2 != parent_root_1 diff --git a/codex-lens/tests/test_migrations.py b/codex-lens/tests/test_migrations.py deleted file mode 100644 index abf5c2c2..00000000 --- a/codex-lens/tests/test_migrations.py +++ /dev/null @@ -1,114 +0,0 @@ -"""Database Migration Tests. - -This module tests the database migration system for the codex-lens index, -ensuring that forward and backward compatibility is maintained across schema versions. - -Test Coverage: -- Forward migrations: Old schema to new schema -- Backward compatibility: New code can read old schemas -- Migration rollback capabilities -- Data integrity during migrations -- Edge cases (empty databases, corrupted data, etc.) -""" - -import pytest -import sqlite3 -from pathlib import Path -import tempfile -import json - - -class TestForwardMigrations: - """Test upgrading from older schema versions to newer ones.""" - - def test_v0_to_v1_migration(self): - """Test migration from schema v0 to v1.""" - pytest.skip("Requires migration infrastructure setup") - - def test_v1_to_v2_migration(self): - """Test migration from schema v1 to v2.""" - pytest.skip("Requires migration infrastructure setup") - - def test_migration_preserves_data(self): - """Test that migration preserves existing data.""" - pytest.skip("Requires migration infrastructure setup") - - def test_migration_adds_new_columns(self): - """Test that new columns are added with correct defaults.""" - pytest.skip("Requires migration infrastructure setup") - - -class TestBackwardCompatibility: - """Test that newer code can read and work with older database schemas.""" - - def test_new_code_reads_old_schema(self): - """Test that current code can read old schema 
databases.""" - pytest.skip("Requires old schema fixture") - - def test_new_code_writes_to_old_schema(self): - """Test that current code handles writes to old schema gracefully.""" - pytest.skip("Requires old schema fixture") - - def test_old_code_rejects_new_schema(self): - """Test that old code fails appropriately on new schemas.""" - pytest.skip("Requires old code fixture") - - -class TestMigrationRollback: - """Test rollback capabilities for failed migrations.""" - - def test_failed_migration_rolls_back(self): - """Test that failed migrations are rolled back completely.""" - pytest.skip("Requires migration infrastructure setup") - - def test_partial_migration_recovery(self): - """Test recovery from partially completed migrations.""" - pytest.skip("Requires migration infrastructure setup") - - def test_rollback_preserves_original_data(self): - """Test that rollback restores original state.""" - pytest.skip("Requires migration infrastructure setup") - - -class TestMigrationEdgeCases: - """Test migration behavior in edge cases.""" - - def test_empty_database_migration(self): - """Test migration of an empty database.""" - pytest.skip("Requires migration infrastructure setup") - - def test_large_database_migration(self): - """Test migration of a large database.""" - pytest.skip("Requires migration infrastructure setup") - - def test_corrupted_database_handling(self): - """Test handling of corrupted databases during migration.""" - pytest.skip("Requires migration infrastructure setup") - - def test_concurrent_migration_protection(self): - """Test that concurrent migrations are prevented.""" - pytest.skip("Requires migration infrastructure setup") - - -class TestSchemaVersionTracking: - """Test schema version tracking and detection.""" - - def test_version_table_exists(self): - """Test that version tracking table exists and is populated.""" - pytest.skip("Requires migration infrastructure setup") - - def test_version_auto_detection(self): - """Test that schema version 
is auto-detected from database.""" - pytest.skip("Requires migration infrastructure setup") - - def test_version_update_after_migration(self): - """Test that version is updated correctly after migration.""" - pytest.skip("Requires migration infrastructure setup") - - -# TODO: Implement actual tests using pytest fixtures -# The test infrastructure needs: -# - Migration runner fixture that can apply and rollback migrations -# - Old schema fixtures (pre-built databases with known schemas) -# - Temporary database fixtures for isolated testing -# - Mock data generators for various schema versions diff --git a/codex-lens/tests/test_parser_integration.py b/codex-lens/tests/test_parser_integration.py deleted file mode 100644 index f94d4162..00000000 --- a/codex-lens/tests/test_parser_integration.py +++ /dev/null @@ -1,281 +0,0 @@ -"""Integration tests for multi-level parser system. - -Verifies: -1. Tree-sitter primary, regex fallback -2. Tiktoken integration with character count fallback -3. >99% symbol extraction accuracy -4. 
Graceful degradation when dependencies unavailable -""" - -from pathlib import Path - -import pytest - -from codexlens.parsers.factory import SimpleRegexParser -from codexlens.parsers.tokenizer import Tokenizer, TIKTOKEN_AVAILABLE -from codexlens.parsers.treesitter_parser import TreeSitterSymbolParser, TREE_SITTER_AVAILABLE - - -class TestMultiLevelFallback: - """Tests for multi-tier fallback pattern.""" - - def test_treesitter_available_uses_ast(self): - """Verify tree-sitter is used when available.""" - parser = TreeSitterSymbolParser("python") - assert parser.is_available() == TREE_SITTER_AVAILABLE - - def test_regex_fallback_always_works(self): - """Verify regex parser always works.""" - parser = SimpleRegexParser("python") - code = "def hello():\n pass" - result = parser.parse(code, Path("test.py")) - - assert result is not None - assert len(result.symbols) == 1 - assert result.symbols[0].name == "hello" - - def test_unsupported_language_uses_generic(self): - """Verify generic parser for unsupported languages.""" - parser = SimpleRegexParser("rust") - code = "fn main() {}" - result = parser.parse(code, Path("test.rs")) - - # Should use generic parser - assert result is not None - # May or may not find symbols depending on generic patterns - - -class TestTokenizerFallback: - """Tests for tokenizer fallback behavior.""" - - def test_character_fallback_when_tiktoken_unavailable(self): - """Verify character counting works without tiktoken.""" - # Use invalid encoding to force fallback - tokenizer = Tokenizer(encoding_name="invalid_encoding") - text = "Hello world" - - count = tokenizer.count_tokens(text) - assert count == max(1, len(text) // 4) - assert not tokenizer.is_using_tiktoken() - - def test_tiktoken_used_when_available(self): - """Verify tiktoken is used when available.""" - tokenizer = Tokenizer() - # Should match TIKTOKEN_AVAILABLE - assert tokenizer.is_using_tiktoken() == TIKTOKEN_AVAILABLE - - -class TestSymbolExtractionAccuracy: - """Tests for >99% 
symbol extraction accuracy requirement.""" - - @pytest.mark.skipif(not TREE_SITTER_AVAILABLE, reason="tree-sitter not installed") - def test_python_comprehensive_accuracy(self): - """Test comprehensive Python symbol extraction.""" - parser = TreeSitterSymbolParser("python") - code = """ -# Test comprehensive symbol extraction -import os - -CONSTANT = 42 - -def top_level_function(): - pass - -async def async_top_level(): - pass - -class FirstClass: - class_var = 10 - - def __init__(self): - pass - - def method_one(self): - pass - - def method_two(self): - pass - - @staticmethod - def static_method(): - pass - - @classmethod - def class_method(cls): - pass - - async def async_method(self): - pass - -def outer_function(): - def inner_function(): - pass - return inner_function - -class SecondClass: - def another_method(self): - pass - -async def final_async_function(): - pass -""" - result = parser.parse(code, Path("test.py")) - - assert result is not None - - # Expected symbols (excluding CONSTANT, comments, decorators): - # top_level_function, async_top_level, FirstClass, __init__, - # method_one, method_two, static_method, class_method, async_method, - # outer_function, inner_function, SecondClass, another_method, - # final_async_function - - expected_names = { - "top_level_function", "async_top_level", "FirstClass", - "__init__", "method_one", "method_two", "static_method", - "class_method", "async_method", "outer_function", - "inner_function", "SecondClass", "another_method", - "final_async_function" - } - - found_names = {s.name for s in result.symbols} - - # Calculate accuracy - matches = expected_names & found_names - accuracy = len(matches) / len(expected_names) * 100 - - print(f"\nSymbol extraction accuracy: {accuracy:.1f}%") - print(f"Expected: {len(expected_names)}, Found: {len(found_names)}, Matched: {len(matches)}") - print(f"Missing: {expected_names - found_names}") - print(f"Extra: {found_names - expected_names}") - - # Require >99% accuracy - assert 
accuracy > 99.0, f"Accuracy {accuracy:.1f}% below 99% threshold" - - @pytest.mark.skipif(not TREE_SITTER_AVAILABLE, reason="tree-sitter not installed") - def test_javascript_comprehensive_accuracy(self): - """Test comprehensive JavaScript symbol extraction.""" - parser = TreeSitterSymbolParser("javascript") - code = """ -function regularFunction() {} - -const arrowFunc = () => {} - -async function asyncFunc() {} - -const asyncArrow = async () => {} - -class MainClass { - constructor() {} - - method() {} - - async asyncMethod() {} - - static staticMethod() {} -} - -export function exportedFunc() {} - -export const exportedArrow = () => {} - -export class ExportedClass { - method() {} -} - -function outer() { - function inner() {} -} -""" - result = parser.parse(code, Path("test.js")) - - assert result is not None - - # Expected symbols (excluding constructor): - # regularFunction, arrowFunc, asyncFunc, asyncArrow, MainClass, - # method, asyncMethod, staticMethod, exportedFunc, exportedArrow, - # ExportedClass, method (from ExportedClass), outer, inner - - expected_names = { - "regularFunction", "arrowFunc", "asyncFunc", "asyncArrow", - "MainClass", "method", "asyncMethod", "staticMethod", - "exportedFunc", "exportedArrow", "ExportedClass", "outer", "inner" - } - - found_names = {s.name for s in result.symbols} - - # Calculate accuracy - matches = expected_names & found_names - accuracy = len(matches) / len(expected_names) * 100 - - print(f"\nJavaScript symbol extraction accuracy: {accuracy:.1f}%") - print(f"Expected: {len(expected_names)}, Found: {len(found_names)}, Matched: {len(matches)}") - - # Require >99% accuracy - assert accuracy > 99.0, f"Accuracy {accuracy:.1f}% below 99% threshold" - - -class TestGracefulDegradation: - """Tests for graceful degradation when dependencies missing.""" - - def test_system_functional_without_tiktoken(self): - """Verify system works without tiktoken.""" - # Force fallback - tokenizer = Tokenizer(encoding_name="invalid") - assert 
not tokenizer.is_using_tiktoken() - - # Should still work - count = tokenizer.count_tokens("def hello(): pass") - assert count > 0 - - def test_system_functional_without_treesitter(self): - """Verify system works without tree-sitter.""" - # Use regex parser directly - parser = SimpleRegexParser("python") - code = "def hello():\n pass" - result = parser.parse(code, Path("test.py")) - - assert result is not None - assert len(result.symbols) == 1 - - def test_treesitter_parser_returns_none_for_unsupported(self): - """Verify TreeSitterParser returns None for unsupported languages.""" - parser = TreeSitterSymbolParser("rust") # Not supported - assert not parser.is_available() - - result = parser.parse("fn main() {}", Path("test.rs")) - assert result is None - - -class TestRealWorldFiles: - """Tests with real-world code examples.""" - - @pytest.mark.skipif(not TREE_SITTER_AVAILABLE, reason="tree-sitter not installed") - def test_parser_on_own_source(self): - """Test parser on its own source code.""" - parser = TreeSitterSymbolParser("python") - - # Read the parser module itself - parser_file = Path(__file__).parent.parent / "src" / "codexlens" / "parsers" / "treesitter_parser.py" - if parser_file.exists(): - code = parser_file.read_text(encoding="utf-8") - result = parser.parse(code, parser_file) - - assert result is not None - # Should find the TreeSitterSymbolParser class and its methods - names = {s.name for s in result.symbols} - assert "TreeSitterSymbolParser" in names - - def test_tokenizer_on_own_source(self): - """Test tokenizer on its own source code.""" - tokenizer = Tokenizer() - - # Read the tokenizer module itself - tokenizer_file = Path(__file__).parent.parent / "src" / "codexlens" / "parsers" / "tokenizer.py" - if tokenizer_file.exists(): - code = tokenizer_file.read_text(encoding="utf-8") - count = tokenizer.count_tokens(code) - - # Should get reasonable token count - assert count > 0 - # File is several hundred characters, should be 50+ tokens - assert 
count > 50 diff --git a/codex-lens/tests/test_parsers.py b/codex-lens/tests/test_parsers.py deleted file mode 100644 index 9651fddc..00000000 --- a/codex-lens/tests/test_parsers.py +++ /dev/null @@ -1,462 +0,0 @@ -"""Tests for CodexLens parsers.""" - -import tempfile -from pathlib import Path - -import pytest - -from codexlens.config import Config -from codexlens.parsers.factory import ( - ParserFactory, - SimpleRegexParser, - _parse_go_symbols, - _parse_java_symbols, - _parse_js_ts_symbols, - _parse_python_symbols, - _parse_generic_symbols, -) - - -TREE_SITTER_JS_AVAILABLE = True -try: - import tree_sitter_javascript # type: ignore[import-not-found] # noqa: F401 -except Exception: - TREE_SITTER_JS_AVAILABLE = False - - -class TestPythonParser: - """Tests for Python symbol parsing.""" - - def test_parse_function(self): - code = "def hello():\n pass" - symbols = _parse_python_symbols(code) - assert len(symbols) == 1 - assert symbols[0].name == "hello" - assert symbols[0].kind == "function" - - def test_parse_async_function(self): - code = "async def fetch_data():\n pass" - symbols = _parse_python_symbols(code) - assert len(symbols) == 1 - assert symbols[0].name == "fetch_data" - assert symbols[0].kind == "function" - - def test_parse_class(self): - code = "class MyClass:\n pass" - symbols = _parse_python_symbols(code) - assert len(symbols) == 1 - assert symbols[0].name == "MyClass" - assert symbols[0].kind == "class" - - def test_parse_method(self): - code = "class MyClass:\n def method(self):\n pass" - symbols = _parse_python_symbols(code) - assert len(symbols) == 2 - assert symbols[0].name == "MyClass" - assert symbols[0].kind == "class" - assert symbols[1].name == "method" - assert symbols[1].kind == "method" - - def test_parse_async_method(self): - code = "class MyClass:\n async def async_method(self):\n pass" - symbols = _parse_python_symbols(code) - assert len(symbols) == 2 - assert symbols[1].name == "async_method" - assert symbols[1].kind == "method" - - 
-class TestJavaScriptParser: - """Tests for JavaScript/TypeScript symbol parsing.""" - - def test_parse_function(self): - code = "function hello() {}" - symbols = _parse_js_ts_symbols(code) - assert len(symbols) == 1 - assert symbols[0].name == "hello" - assert symbols[0].kind == "function" - - def test_parse_async_function(self): - code = "async function fetchData() {}" - symbols = _parse_js_ts_symbols(code) - assert len(symbols) == 1 - assert symbols[0].name == "fetchData" - assert symbols[0].kind == "function" - - def test_parse_arrow_function(self): - code = "const hello = () => {}" - symbols = _parse_js_ts_symbols(code) - assert len(symbols) == 1 - assert symbols[0].name == "hello" - assert symbols[0].kind == "function" - - def test_parse_async_arrow_function(self): - code = "const fetchData = async () => {}" - symbols = _parse_js_ts_symbols(code) - assert len(symbols) == 1 - assert symbols[0].name == "fetchData" - assert symbols[0].kind == "function" - - def test_parse_class(self): - code = "class MyClass {}" - symbols = _parse_js_ts_symbols(code) - assert len(symbols) == 1 - assert symbols[0].name == "MyClass" - assert symbols[0].kind == "class" - - def test_parse_export_function(self): - code = "export function hello() {}" - symbols = _parse_js_ts_symbols(code) - assert len(symbols) == 1 - assert symbols[0].name == "hello" - assert symbols[0].kind == "function" - - def test_parse_export_class(self): - code = "export class MyClass {}" - symbols = _parse_js_ts_symbols(code) - assert len(symbols) == 1 - assert symbols[0].name == "MyClass" - assert symbols[0].kind == "class" - - def test_parse_export_arrow_function(self): - code = "export const hello = () => {}" - symbols = _parse_js_ts_symbols(code) - assert len(symbols) == 1 - assert symbols[0].name == "hello" - assert symbols[0].kind == "function" - - @pytest.mark.skipif(not TREE_SITTER_JS_AVAILABLE, reason="tree-sitter-javascript not installed") - def test_parse_class_methods(self): - code = ( - "class 
MyClass {\n" - " method() {}\n" - " async asyncMethod() {}\n" - " static staticMethod() {}\n" - " constructor() {}\n" - "}" - ) - symbols = _parse_js_ts_symbols(code) - names_kinds = [(s.name, s.kind) for s in symbols] - assert ("MyClass", "class") in names_kinds - assert ("method", "method") in names_kinds - assert ("asyncMethod", "method") in names_kinds - assert ("staticMethod", "method") in names_kinds - assert all(name != "constructor" for name, _ in names_kinds) - - -class TestJavaParser: - """Tests for Java symbol parsing.""" - - def test_parse_class(self): - code = "public class MyClass {\n}" - symbols = _parse_java_symbols(code) - assert len(symbols) == 1 - assert symbols[0].name == "MyClass" - assert symbols[0].kind == "class" - - def test_parse_class_without_public(self): - code = "class InternalClass {\n}" - symbols = _parse_java_symbols(code) - assert len(symbols) == 1 - assert symbols[0].name == "InternalClass" - - def test_parse_method(self): - code = "public class Test {\n public void doSomething() {}\n}" - symbols = _parse_java_symbols(code) - assert len(symbols) == 2 - assert symbols[0].name == "Test" - assert symbols[0].kind == "class" - assert symbols[1].name == "doSomething" - assert symbols[1].kind == "method" - - def test_parse_static_method(self): - code = "public class Test {\n public static void main(String[] args) {}\n}" - symbols = _parse_java_symbols(code) - method_names = [s.name for s in symbols if s.kind == "method"] - assert "main" in method_names - - def test_parse_private_method(self): - code = "public class Test {\n private int calculate() { return 0; }\n}" - symbols = _parse_java_symbols(code) - method_names = [s.name for s in symbols if s.kind == "method"] - assert "calculate" in method_names - - def test_parse_generic_return_type(self): - code = "public class Test {\n public List getItems() { return null; }\n}" - symbols = _parse_java_symbols(code) - method_names = [s.name for s in symbols if s.kind == "method"] - assert 
"getItems" in method_names - - -class TestGoParser: - """Tests for Go symbol parsing.""" - - def test_parse_function(self): - code = "func hello() {\n}" - symbols = _parse_go_symbols(code) - assert len(symbols) == 1 - assert symbols[0].name == "hello" - assert symbols[0].kind == "function" - - def test_parse_function_with_params(self): - code = "func greet(name string) string {\n return name\n}" - symbols = _parse_go_symbols(code) - assert len(symbols) == 1 - assert symbols[0].name == "greet" - - def test_parse_method(self): - code = "func (s *Server) Start() error {\n return nil\n}" - symbols = _parse_go_symbols(code) - assert len(symbols) == 1 - assert symbols[0].name == "Start" - assert symbols[0].kind == "function" - - def test_parse_struct(self): - code = "type User struct {\n Name string\n}" - symbols = _parse_go_symbols(code) - assert len(symbols) == 1 - assert symbols[0].name == "User" - assert symbols[0].kind == "class" - - def test_parse_interface(self): - code = "type Reader interface {\n Read(p []byte) (n int, err error)\n}" - symbols = _parse_go_symbols(code) - assert len(symbols) == 1 - assert symbols[0].name == "Reader" - assert symbols[0].kind == "class" - - def test_parse_multiple_symbols(self): - code = """type Config struct { - Port int -} - -func NewConfig() *Config { - return &Config{} -} - -func (c *Config) Validate() error { - return nil -} -""" - symbols = _parse_go_symbols(code) - names = [s.name for s in symbols] - assert "Config" in names - assert "NewConfig" in names - assert "Validate" in names - - -class TestGenericParser: - """Tests for generic symbol parsing.""" - - def test_parse_def_keyword(self): - code = "def something():\n pass" - symbols = _parse_generic_symbols(code) - assert len(symbols) == 1 - assert symbols[0].name == "something" - assert symbols[0].kind == "function" - - def test_parse_function_keyword(self): - code = "function doIt() {}" - symbols = _parse_generic_symbols(code) - assert len(symbols) == 1 - assert 
symbols[0].name == "doIt" - - def test_parse_func_keyword(self): - code = "func test() {}" - symbols = _parse_generic_symbols(code) - assert len(symbols) == 1 - assert symbols[0].name == "test" - - def test_parse_class_keyword(self): - code = "class MyClass {}" - symbols = _parse_generic_symbols(code) - assert len(symbols) == 1 - assert symbols[0].name == "MyClass" - assert symbols[0].kind == "class" - - def test_parse_struct_keyword(self): - code = "struct Point { x: i32, y: i32 }" - symbols = _parse_generic_symbols(code) - assert len(symbols) == 1 - assert symbols[0].name == "Point" - assert symbols[0].kind == "class" - - def test_parse_interface_keyword(self): - code = "interface Drawable {}" - symbols = _parse_generic_symbols(code) - assert len(symbols) == 1 - assert symbols[0].name == "Drawable" - assert symbols[0].kind == "class" - - -class TestParserInterface: - """High-level interface tests.""" - - def test_simple_parser_parse(self): - parser = SimpleRegexParser("python") - indexed = parser.parse("def hello():\n pass", Path("test.py")) - assert indexed.language == "python" - assert len(indexed.symbols) == 1 - assert indexed.symbols[0].name == "hello" - - def test_simple_parser_javascript(self): - parser = SimpleRegexParser("javascript") - indexed = parser.parse("function test() {}", Path("test.js")) - assert indexed.language == "javascript" - assert len(indexed.symbols) == 1 - - def test_simple_parser_typescript(self): - parser = SimpleRegexParser("typescript") - indexed = parser.parse("export class Service {}", Path("test.ts")) - assert indexed.language == "typescript" - assert len(indexed.symbols) == 1 - - def test_simple_parser_java(self): - parser = SimpleRegexParser("java") - indexed = parser.parse("public class Main {}", Path("Main.java")) - assert indexed.language == "java" - assert len(indexed.symbols) == 1 - - def test_simple_parser_go(self): - parser = SimpleRegexParser("go") - indexed = parser.parse("func main() {}", Path("main.go")) - assert 
indexed.language == "go" - assert len(indexed.symbols) == 1 - - def test_simple_parser_unknown_language(self): - parser = SimpleRegexParser("zig") - indexed = parser.parse("fn main() void {}", Path("main.zig")) - assert indexed.language == "zig" - # Uses generic parser - assert indexed.chunks == [] - - def test_indexed_file_path_resolved(self): - parser = SimpleRegexParser("python") - indexed = parser.parse("def test(): pass", Path("./test.py")) - # Path should be resolved to absolute - assert Path(indexed.path).is_absolute() - - -class TestParserFactory: - """Tests for ParserFactory.""" - - def test_factory_creates_parser(self): - with tempfile.TemporaryDirectory() as tmpdir: - import os - os.environ["CODEXLENS_DATA_DIR"] = tmpdir - try: - config = Config() - factory = ParserFactory(config) - parser = factory.get_parser("python") - assert parser is not None - finally: - del os.environ["CODEXLENS_DATA_DIR"] - - def test_factory_caches_parsers(self): - with tempfile.TemporaryDirectory() as tmpdir: - import os - os.environ["CODEXLENS_DATA_DIR"] = tmpdir - try: - config = Config() - factory = ParserFactory(config) - parser1 = factory.get_parser("python") - parser2 = factory.get_parser("python") - assert parser1 is parser2 - finally: - del os.environ["CODEXLENS_DATA_DIR"] - - def test_factory_different_languages(self): - with tempfile.TemporaryDirectory() as tmpdir: - import os - os.environ["CODEXLENS_DATA_DIR"] = tmpdir - try: - config = Config() - factory = ParserFactory(config) - py_parser = factory.get_parser("python") - js_parser = factory.get_parser("javascript") - assert py_parser is not js_parser - finally: - del os.environ["CODEXLENS_DATA_DIR"] - - def test_factory_passes_config_to_treesitter(self, monkeypatch: pytest.MonkeyPatch) -> None: - """Ensure ParserFactory config is forwarded into TreeSitterSymbolParser.""" - from codexlens.entities import IndexedFile - - captured: dict = {} - - class FakeTreeSitterSymbolParser: - def __init__(self, language_id, 
path=None, config=None) -> None: - captured["config"] = config - self.language_id = language_id - - def is_available(self) -> bool: - return True - - def parse(self, text: str, path: Path) -> IndexedFile: - return IndexedFile( - path=str(path.resolve()), - language=self.language_id, - symbols=[], - chunks=[], - relationships=[], - ) - - monkeypatch.setattr( - "codexlens.parsers.factory.TreeSitterSymbolParser", - FakeTreeSitterSymbolParser, - ) - - config = Config() - config.use_astgrep = True - - factory = ParserFactory(config) - parser = factory.get_parser("python") - parser.parse("def hello():\n pass\n", Path("test.py")) - - assert captured.get("config") is config - - -class TestParserEdgeCases: - """Edge case tests for parsers.""" - - def test_empty_code(self): - symbols = _parse_python_symbols("") - assert len(symbols) == 0 - - def test_only_comments(self): - code = "# This is a comment\n# Another comment" - symbols = _parse_python_symbols(code) - assert len(symbols) == 0 - - def test_nested_functions(self): - code = """def outer(): - def inner(): - pass - return inner -""" - symbols = _parse_python_symbols(code) - names = [s.name for s in symbols] - assert "outer" in names - assert "inner" in names - - def test_unicode_function_name(self): - code = "def 你好():\n pass" - symbols = _parse_python_symbols(code) - # Regex may not support unicode function names, tree-sitter does - # So we just verify it doesn't crash - assert isinstance(symbols, list) - - def test_long_file(self): - # Generate a file with many functions - lines = [] - for i in range(100): - lines.append(f"def func_{i}():\n pass\n") - code = "\n".join(lines) - symbols = _parse_python_symbols(code) - assert len(symbols) == 100 - - def test_malformed_code(self): - # Parser should handle malformed code gracefully - code = "def broken(\n pass" - # Should not crash - symbols = _parse_python_symbols(code) - # May or may not find symbols depending on regex diff --git 
a/codex-lens/tests/test_path_mapper_windows_drive.py b/codex-lens/tests/test_path_mapper_windows_drive.py deleted file mode 100644 index 21522b8d..00000000 --- a/codex-lens/tests/test_path_mapper_windows_drive.py +++ /dev/null @@ -1,19 +0,0 @@ -from __future__ import annotations - -import platform -from pathlib import Path - -from codexlens.storage.path_mapper import PathMapper - - -def test_denormalize_path_windows_drive_is_absolute() -> None: - if platform.system() != "Windows": - return - - mapper = PathMapper(index_root=Path("C:/tmp/codexlens_indexes")) - mapped = mapper.denormalize_path("D/Claude_dms3/codex-lens/src") - - assert mapped.is_absolute() - assert str(mapped).lower().startswith("d:\\") or str(mapped).lower().startswith("d:/") - assert mapped == Path("D:/Claude_dms3/codex-lens/src") - diff --git a/codex-lens/tests/test_performance_optimizations.py b/codex-lens/tests/test_performance_optimizations.py deleted file mode 100644 index 1026cb59..00000000 --- a/codex-lens/tests/test_performance_optimizations.py +++ /dev/null @@ -1,814 +0,0 @@ -"""Tests for performance optimizations in CodexLens. - -This module tests the following optimizations: -1. Normalized keywords search (migration_001) -2. Optimized path lookup in registry -3. Prefix-mode symbol search -4. Graph expansion neighbor precompute overhead (<20%) -5. 
Cross-encoder reranking latency (<200ms) -""" - -import json -import sqlite3 -import tempfile -import time -from pathlib import Path - -import pytest - -from codexlens.storage.dir_index import DirIndexStore -from codexlens.storage.registry import RegistryStore -from codexlens.storage.migration_manager import MigrationManager -from codexlens.storage.migrations import migration_001_normalize_keywords - - -@pytest.fixture -def temp_index_db(): - """Create a temporary dir index database.""" - with tempfile.TemporaryDirectory() as tmpdir: - db_path = Path(tmpdir) / "test_index.db" - store = DirIndexStore(db_path) - store.initialize() # Initialize schema - yield store - store.close() - - -@pytest.fixture -def temp_registry_db(): - """Create a temporary registry database.""" - with tempfile.TemporaryDirectory() as tmpdir: - db_path = Path(tmpdir) / "test_registry.db" - store = RegistryStore(db_path) - store.initialize() # Initialize schema - yield store - store.close() - - -@pytest.fixture -def populated_index_db(temp_index_db): - """Create an index database with sample data. - - Uses 100 files to provide meaningful performance comparison between - optimized and fallback implementations. 
- """ - from codexlens.entities import Symbol - - store = temp_index_db - - # Add files with symbols and keywords - # Using 100 files to show performance improvements - file_ids = [] - - # Define keyword pools for cycling - keyword_pools = [ - ["auth", "security", "jwt"], - ["database", "sql", "query"], - ["auth", "login", "password"], - ["api", "rest", "endpoint"], - ["cache", "redis", "performance"], - ["auth", "oauth", "token"], - ["test", "unittest", "pytest"], - ["database", "postgres", "migration"], - ["api", "graphql", "resolver"], - ["security", "encryption", "crypto"] - ] - - for i in range(100): - # Create symbols for first 50 files to have more symbol search data - symbols = None - if i < 50: - symbols = [ - Symbol(name=f"get_user_{i}", kind="function", range=(1, 10)), - Symbol(name=f"create_user_{i}", kind="function", range=(11, 20)), - Symbol(name=f"UserClass_{i}", kind="class", range=(21, 40)), - ] - - file_id = store.add_file( - name=f"file_{i}.py", - full_path=Path(f"/test/path/file_{i}.py"), - content=f"def function_{i}(): pass\n" * 10, - language="python", - symbols=symbols - ) - file_ids.append(file_id) - - # Add semantic metadata with keywords (cycle through keyword pools) - keywords = keyword_pools[i % len(keyword_pools)] - store.add_semantic_metadata( - file_id=file_id, - summary=f"Test file {file_id}", - keywords=keywords, - purpose="Testing", - llm_tool="gemini" - ) - - return store - - -class TestKeywordNormalization: - """Test normalized keywords functionality.""" - - def test_migration_creates_tables(self, temp_index_db): - """Test that migration creates keywords and file_keywords tables.""" - conn = temp_index_db._get_connection() - - # Verify tables exist (created by _create_schema) - tables = conn.execute(""" - SELECT name FROM sqlite_master - WHERE type='table' AND name IN ('keywords', 'file_keywords') - """).fetchall() - - assert len(tables) == 2 - - def test_migration_creates_indexes(self, temp_index_db): - """Test that migration 
creates necessary indexes.""" - conn = temp_index_db._get_connection() - - # Check for indexes - indexes = conn.execute(""" - SELECT name FROM sqlite_master - WHERE type='index' AND name IN ( - 'idx_keywords_keyword', - 'idx_file_keywords_file_id', - 'idx_file_keywords_keyword_id' - ) - """).fetchall() - - assert len(indexes) == 3 - - def test_add_semantic_metadata_populates_normalized_tables(self, temp_index_db): - """Test that adding metadata populates the normalized keyword tables.""" - # Add a file - file_id = temp_index_db.add_file( - name="test.py", - full_path=Path("/test/test.py"), - language="python", - content="test" - ) - - # Add semantic metadata - keywords = ["auth", "security", "jwt"] - temp_index_db.add_semantic_metadata( - file_id=file_id, - summary="Test summary", - keywords=keywords, - purpose="Testing", - llm_tool="gemini" - ) - - conn = temp_index_db._get_connection() - - # Check semantic_metadata table (without keywords column in current schema) - row = conn.execute( - "SELECT summary, purpose, llm_tool FROM semantic_metadata WHERE file_id=?", - (file_id,) - ).fetchone() - assert row is not None - assert row["summary"] == "Test summary" - assert row["purpose"] == "Testing" - assert row["llm_tool"] == "gemini" - - # Check normalized keywords table - keyword_rows = conn.execute(""" - SELECT k.keyword - FROM file_keywords fk - JOIN keywords k ON fk.keyword_id = k.id - WHERE fk.file_id = ? 
- """, (file_id,)).fetchall() - - assert len(keyword_rows) == 3 - normalized_keywords = [row["keyword"] for row in keyword_rows] - assert set(normalized_keywords) == set(keywords) - - def test_search_semantic_keywords_normalized(self, populated_index_db): - """Test optimized keyword search using normalized tables.""" - results = populated_index_db.search_semantic_keywords("auth", use_normalized=True) - - # Should find 3 files with "auth" keyword - assert len(results) >= 3 - - # Verify results structure - for file_entry, keywords in results: - assert file_entry.name.startswith("file_") - assert isinstance(keywords, list) - assert any("auth" in k.lower() for k in keywords) - - def test_search_semantic_keywords_fallback(self, populated_index_db): - """Test that fallback search still works.""" - results = populated_index_db.search_semantic_keywords("auth", use_normalized=False) - - # Should find files with "auth" keyword - assert len(results) >= 3 - - for file_entry, keywords in results: - assert isinstance(keywords, list) - - -class TestPathLookupOptimization: - """Test optimized path lookup in registry.""" - - def test_find_nearest_index_shallow(self, temp_registry_db): - """Test path lookup with shallow directory structure.""" - # Register a project first - project = temp_registry_db.register_project( - source_root=Path("/test"), - index_root=Path("/tmp") - ) - - # Register directory mapping - temp_registry_db.register_dir( - project_id=project.id, - source_path=Path("/test"), - index_path=Path("/tmp/index.db"), - depth=0, - files_count=0 - ) - - # Search for subdirectory - result = temp_registry_db.find_nearest_index(Path("/test/subdir/file.py")) - - assert result is not None - # Compare as strings for cross-platform compatibility - assert "/test" in str(result.source_path) or "\\test" in str(result.source_path) - - def test_find_nearest_index_deep(self, temp_registry_db): - """Test path lookup with deep directory structure.""" - # Register a project - project = 
temp_registry_db.register_project( - source_root=Path("/a"), - index_root=Path("/tmp") - ) - - # Add directory mappings at different levels - temp_registry_db.register_dir( - project_id=project.id, - source_path=Path("/a"), - index_path=Path("/tmp/index_a.db"), - depth=0, - files_count=0 - ) - temp_registry_db.register_dir( - project_id=project.id, - source_path=Path("/a/b/c"), - index_path=Path("/tmp/index_abc.db"), - depth=2, - files_count=0 - ) - - # Should find nearest (longest) match - result = temp_registry_db.find_nearest_index(Path("/a/b/c/d/e/f/file.py")) - - assert result is not None - # Check that path contains the key parts - result_path = str(result.source_path) - assert "a" in result_path and "b" in result_path and "c" in result_path - - def test_find_nearest_index_not_found(self, temp_registry_db): - """Test path lookup when no mapping exists.""" - result = temp_registry_db.find_nearest_index(Path("/nonexistent/path")) - assert result is None - - def test_find_nearest_index_performance(self, temp_registry_db): - """Basic performance test for path lookup.""" - # Register a project - project = temp_registry_db.register_project( - source_root=Path("/root"), - index_root=Path("/tmp") - ) - - # Add mapping at root - temp_registry_db.register_dir( - project_id=project.id, - source_path=Path("/root"), - index_path=Path("/tmp/index.db"), - depth=0, - files_count=0 - ) - - # Test with very deep path (10 levels) - deep_path = Path("/root/a/b/c/d/e/f/g/h/i/j/file.py") - - start = time.perf_counter() - result = temp_registry_db.find_nearest_index(deep_path) - elapsed = time.perf_counter() - start - - # Should complete quickly (< 50ms even on slow systems) - assert elapsed < 0.05 - assert result is not None - - -class TestSymbolSearchOptimization: - """Test optimized symbol search.""" - - def test_symbol_search_prefix_mode(self, populated_index_db): - """Test symbol search with prefix mode.""" - results = populated_index_db.search_symbols("get", prefix_mode=True) 
- - # Should find symbols starting with "get" - assert len(results) > 0 - for symbol in results: - assert symbol.name.startswith("get") - - def test_symbol_search_substring_mode(self, populated_index_db): - """Test symbol search with substring mode.""" - results = populated_index_db.search_symbols("user", prefix_mode=False) - - # Should find symbols containing "user" - assert len(results) > 0 - for symbol in results: - assert "user" in symbol.name.lower() - - def test_symbol_search_with_kind_filter(self, populated_index_db): - """Test symbol search with kind filter.""" - results = populated_index_db.search_symbols( - "UserClass", - kind="class", - prefix_mode=True - ) - - # Should find only class symbols - assert len(results) > 0 - for symbol in results: - assert symbol.kind == "class" - - def test_symbol_search_limit(self, populated_index_db): - """Test symbol search respects limit.""" - results = populated_index_db.search_symbols("", prefix_mode=True, limit=5) - - # Should return at most 5 results - assert len(results) <= 5 - - -class TestMigrationManager: - """Test migration manager functionality.""" - - def test_migration_manager_tracks_version(self, temp_index_db): - """Test that migration manager tracks schema version.""" - conn = temp_index_db._get_connection() - manager = MigrationManager(conn) - - current_version = manager.get_current_version() - assert current_version >= 0 - - def test_migration_001_can_run(self, temp_index_db): - """Test that migration_001 is idempotent on current schema. - - Note: Current schema already has normalized keywords tables created - during initialize(), so migration_001 should be a no-op but not fail. - The original migration was designed to migrate from semantic_metadata.keywords - to normalized tables, but new databases use normalized tables directly. 
- """ - conn = temp_index_db._get_connection() - - # Add some test data using the current normalized schema - conn.execute(""" - INSERT INTO files(id, name, full_path, language, content, mtime, line_count) - VALUES(100, 'test.py', '/test_migration.py', 'python', 'def test(): pass', 0, 10) - """) - - # Insert directly into normalized tables (current schema) - conn.execute("INSERT OR IGNORE INTO keywords(keyword) VALUES(?)", ("test",)) - conn.execute("INSERT OR IGNORE INTO keywords(keyword) VALUES(?)", ("keyword",)) - - kw1_id = conn.execute("SELECT id FROM keywords WHERE keyword=?", ("test",)).fetchone()[0] - kw2_id = conn.execute("SELECT id FROM keywords WHERE keyword=?", ("keyword",)).fetchone()[0] - - conn.execute("INSERT OR IGNORE INTO file_keywords(file_id, keyword_id) VALUES(?, ?)", (100, kw1_id)) - conn.execute("INSERT OR IGNORE INTO file_keywords(file_id, keyword_id) VALUES(?, ?)", (100, kw2_id)) - conn.commit() - - # Run migration (should be idempotent - tables already exist) - try: - migration_001_normalize_keywords.upgrade(conn) - success = True - except Exception as e: - success = False - print(f"Migration failed: {e}") - - assert success - - # Verify data still exists - keyword_count = conn.execute(""" - SELECT COUNT(*) as c FROM file_keywords WHERE file_id=100 - """).fetchone()["c"] - - assert keyword_count == 2 # "test" and "keyword" - - -class TestPerformanceComparison: - """Compare performance of old vs new implementations.""" - - def test_keyword_search_performance(self, populated_index_db): - """Compare keyword search performance. - - IMPORTANT: The normalized query optimization is designed for large datasets - (1000+ files). On small datasets (< 1000 files), the overhead of JOINs and - GROUP BY operations can make the normalized query slower than the simple - LIKE query on JSON fields. This is expected behavior. 
- - Performance benefits appear when: - - Dataset size > 1000 files - - Full-table scans on JSON LIKE become the bottleneck - - Index-based lookups provide O(log N) complexity advantage - """ - # Normalized search - start = time.perf_counter() - normalized_results = populated_index_db.search_semantic_keywords( - "auth", - use_normalized=True - ) - normalized_time = time.perf_counter() - start - - # Fallback search - start = time.perf_counter() - fallback_results = populated_index_db.search_semantic_keywords( - "auth", - use_normalized=False - ) - fallback_time = time.perf_counter() - start - - # Verify correctness: both queries should return identical results - assert len(normalized_results) == len(fallback_results) - - # Verify result content matches - normalized_files = {entry.id for entry, _ in normalized_results} - fallback_files = {entry.id for entry, _ in fallback_results} - assert normalized_files == fallback_files, "Both queries must return same files" - - # Document performance characteristics (no strict assertion) - # On datasets < 1000 files, normalized may be slower due to JOIN overhead - print(f"\nKeyword search performance (100 files):") - print(f" Normalized: {normalized_time*1000:.3f}ms") - print(f" Fallback: {fallback_time*1000:.3f}ms") - print(f" Ratio: {normalized_time/fallback_time:.2f}x") - print(f" Note: Performance benefits appear with 1000+ files") - - def test_prefix_vs_substring_symbol_search(self, populated_index_db): - """Compare prefix vs substring symbol search performance. - - IMPORTANT: Prefix search optimization (LIKE 'prefix%') benefits from B-tree - indexes, but on small datasets (< 1000 symbols), the performance difference - may not be measurable or may even be slower due to query planner overhead. 
- - Performance benefits appear when: - - Symbol count > 1000 - - Index-based prefix search provides O(log N) advantage - - Full table scans with LIKE '%substring%' become bottleneck - """ - # Prefix search (optimized) - start = time.perf_counter() - prefix_results = populated_index_db.search_symbols("get", prefix_mode=True) - prefix_time = time.perf_counter() - start - - # Substring search (fallback) - start = time.perf_counter() - substring_results = populated_index_db.search_symbols("get", prefix_mode=False) - substring_time = time.perf_counter() - start - - # Verify correctness: prefix results should be subset of substring results - prefix_names = {s.name for s in prefix_results} - substring_names = {s.name for s in substring_results} - assert prefix_names.issubset(substring_names), "Prefix must be subset of substring" - - # Verify all prefix results actually start with search term - for symbol in prefix_results: - assert symbol.name.startswith("get"), f"Symbol {symbol.name} should start with 'get'" - - # Document performance characteristics (no strict assertion) - # On datasets < 1000 symbols, performance difference is negligible - print(f"\nSymbol search performance (150 symbols):") - print(f" Prefix: {prefix_time*1000:.3f}ms ({len(prefix_results)} results)") - print(f" Substring: {substring_time*1000:.3f}ms ({len(substring_results)} results)") - print(f" Ratio: {prefix_time/substring_time:.2f}x") - print(f" Note: Performance benefits appear with 1000+ symbols") - - -class TestPerformanceBenchmarks: - """Benchmark-style assertions for key performance requirements.""" - - def test_graph_expansion_indexing_overhead_under_20_percent(self, temp_index_db, tmp_path): - """Graph neighbor precompute adds <20% overhead versus indexing baseline.""" - from codexlens.entities import CodeRelationship, RelationshipType, Symbol - from codexlens.storage.index_tree import _compute_graph_neighbors - - store = temp_index_db - - file_count = 60 - symbols_per_file = 8 - - start = 
time.perf_counter() - for file_idx in range(file_count): - file_path = tmp_path / f"graph_{file_idx}.py" - lines = [] - for sym_idx in range(symbols_per_file): - lines.append(f"def func_{file_idx}_{sym_idx}():") - lines.append(f" return {sym_idx}") - lines.append("") - content = "\n".join(lines) - - symbols = [ - Symbol( - name=f"func_{file_idx}_{sym_idx}", - kind="function", - range=(sym_idx * 3 + 1, sym_idx * 3 + 2), - file=str(file_path), - ) - for sym_idx in range(symbols_per_file) - ] - - relationships = [ - CodeRelationship( - source_symbol=f"func_{file_idx}_{sym_idx}", - target_symbol=f"func_{file_idx}_{sym_idx + 1}", - relationship_type=RelationshipType.CALL, - source_file=str(file_path), - target_file=None, - source_line=sym_idx * 3 + 2, - ) - for sym_idx in range(symbols_per_file - 1) - ] - - store.add_file( - name=file_path.name, - full_path=file_path, - content=content, - language="python", - symbols=symbols, - relationships=relationships, - ) - baseline_time = time.perf_counter() - start - - durations = [] - for _ in range(3): - start = time.perf_counter() - _compute_graph_neighbors(store) - durations.append(time.perf_counter() - start) - graph_time = min(durations) - - # Sanity-check that the benchmark exercised graph neighbor generation. - conn = store._get_connection() - neighbor_count = conn.execute( - "SELECT COUNT(*) as c FROM graph_neighbors" - ).fetchone()["c"] - assert neighbor_count > 0 - - assert baseline_time > 0.0 - overhead_ratio = graph_time / baseline_time - assert overhead_ratio < 0.2, ( - f"Graph neighbor precompute overhead too high: {overhead_ratio:.2%} " - f"(baseline={baseline_time:.3f}s, graph={graph_time:.3f}s)" - ) - - def test_stage2_expansion_precomputed_vs_static_global_graph_benchmark(self, tmp_path): - """Benchmark Stage-2 expansion: precomputed graph_neighbors vs static global graph. - - This test is informational (prints timings) and asserts only correctness - and that both expanders return some related results. 
- """ - from codexlens.entities import CodeRelationship, RelationshipType, SearchResult, Symbol - from codexlens.search.graph_expander import GraphExpander - from codexlens.search.global_graph_expander import GlobalGraphExpander - from codexlens.storage.dir_index import DirIndexStore - from codexlens.storage.global_index import GlobalSymbolIndex - from codexlens.storage.index_tree import _compute_graph_neighbors - from codexlens.storage.path_mapper import PathMapper - - # Source + index roots - source_dir = tmp_path / "proj" / "src" - source_dir.mkdir(parents=True, exist_ok=True) - mapper = PathMapper(index_root=tmp_path / "indexes") - - index_db_path = mapper.source_to_index_db(source_dir) - index_db_path.parent.mkdir(parents=True, exist_ok=True) - - store = DirIndexStore(index_db_path) - store.initialize() - - file_count = 30 - per_file_symbols = 2 - file_paths = [] - per_file_symbols_list = [] - per_file_relationships_list = [] - - for i in range(file_count): - file_path = source_dir / f"m{i}.py" - file_paths.append(file_path) - file_path.write_text("pass\n", encoding="utf-8") - - symbols = [ - Symbol( - name=f"func_{i}_{j}", - kind="function", - range=(j + 1, j + 1), - file=str(file_path.resolve()), - ) - for j in range(per_file_symbols) - ] - per_file_symbols_list.append(symbols) - - relationships: list[CodeRelationship] = [] - # Intra-file edge: func_i_0 -> func_i_1 - relationships.append( - CodeRelationship( - source_symbol=f"func_{i}_0", - target_symbol=f"func_{i}_1", - relationship_type=RelationshipType.CALL, - source_file=str(file_path.resolve()), - target_file=str(file_path.resolve()), - source_line=1, - ) - ) - # Cross-file edge: func_i_0 -> func_(i+1)_0 (name-unique across dir) - j = (i + 1) % file_count - relationships.append( - CodeRelationship( - source_symbol=f"func_{i}_0", - target_symbol=f"func_{j}_0", - relationship_type=RelationshipType.CALL, - source_file=str(file_path.resolve()), - target_file=str((source_dir / f"m{j}.py").resolve()), - 
source_line=1, - ) - ) - per_file_relationships_list.append(relationships) - - store.add_file( - name=file_path.name, - full_path=file_path, - content="pass\n", - language="python", - symbols=symbols, - relationships=relationships, - ) - - # Precompute graph_neighbors for GraphExpander (precomputed Stage-2 build) - start = time.perf_counter() - _compute_graph_neighbors(store) - graph_build_ms = (time.perf_counter() - start) * 1000.0 - store.close() - - # Build global symbol index + relationships for GlobalGraphExpander - global_db_path = index_db_path.parent / GlobalSymbolIndex.DEFAULT_DB_NAME - global_index = GlobalSymbolIndex(global_db_path, project_id=1) - global_index.initialize() - try: - index_path_str = str(index_db_path.resolve()) - start = time.perf_counter() - for file_path, symbols in zip(file_paths, per_file_symbols_list): - file_path_str = str(file_path.resolve()) - global_index.update_file_symbols( - file_path_str, - symbols, - index_path=index_path_str, - ) - global_symbols_ms = (time.perf_counter() - start) * 1000.0 - - start = time.perf_counter() - for file_path, relationships in zip(file_paths, per_file_relationships_list): - file_path_str = str(file_path.resolve()) - global_index.update_file_relationships(file_path_str, relationships) - global_relationships_ms = (time.perf_counter() - start) * 1000.0 - - base_results = [ - SearchResult( - path=str(file_paths[i].resolve()), - score=1.0, - excerpt=None, - content=None, - start_line=1, - end_line=1, - symbol_name=f"func_{i}_0", - symbol_kind="function", - ) - for i in range(min(10, file_count)) - ] - - pre_expander = GraphExpander(mapper) - static_expander = GlobalGraphExpander(global_index) - - start = time.perf_counter() - pre_related = pre_expander.expand( - base_results, - depth=2, - max_expand=10, - max_related=50, - ) - pre_ms = (time.perf_counter() - start) * 1000.0 - - start = time.perf_counter() - static_related = static_expander.expand( - base_results, - top_n=10, - max_related=50, - ) - 
static_ms = (time.perf_counter() - start) * 1000.0 - - assert pre_related, "Expected precomputed graph expansion to return related results" - assert static_related, "Expected static global graph expansion to return related results" - - print("\nStage-2 build benchmark (30 files, 2 symbols/file):") - print(f" graph_neighbors precompute: {graph_build_ms:.2f}ms") - print(f" global_symbols write: {global_symbols_ms:.2f}ms") - print(f" global_relationships write: {global_relationships_ms:.2f}ms") - - print("\nStage-2 expansion benchmark (30 files, 2 symbols/file):") - print(f" precomputed (graph_neighbors): {pre_ms:.2f}ms, related={len(pre_related)}") - print(f" static_global_graph: {static_ms:.2f}ms, related={len(static_related)}") - finally: - global_index.close() - - def test_relationship_extraction_astgrep_vs_treesitter_benchmark(self, tmp_path): - """Informational benchmark: relationship extraction via ast-grep vs tree-sitter. - - Skips when optional parser dependencies are unavailable. 
- """ - import textwrap - - from codexlens.config import Config - from codexlens.parsers.astgrep_processor import is_astgrep_processor_available - from codexlens.parsers.treesitter_parser import TreeSitterSymbolParser - - if not is_astgrep_processor_available(): - pytest.skip("ast-grep processor unavailable (optional dependency)") - - code = textwrap.dedent( - """ - import os - from typing import List - - class Base: - pass - - class Child(Base): - def method(self) -> List[str]: - return [os.path.join("a", "b")] - """ - ).lstrip() - - file_path = tmp_path / "sample.py" - file_path.write_text(code, encoding="utf-8") - - cfg_ts = Config(data_dir=tmp_path / "cfg_ts") - cfg_ts.use_astgrep = False - ts_parser = TreeSitterSymbolParser("python", file_path, config=cfg_ts) - if not ts_parser.is_available(): - pytest.skip("tree-sitter python binding unavailable") - - cfg_ag = Config(data_dir=tmp_path / "cfg_ag") - cfg_ag.use_astgrep = True - ag_parser = TreeSitterSymbolParser("python", file_path, config=cfg_ag) - if getattr(ag_parser, "_astgrep_processor", None) is None: - pytest.skip("ast-grep processor failed to initialize") - - def _bench(parser: TreeSitterSymbolParser) -> tuple[float, int]: - durations = [] - rel_counts = [] - for _ in range(3): - start = time.perf_counter() - indexed = parser.parse(code, file_path) - durations.append(time.perf_counter() - start) - rel_counts.append(0 if indexed is None else len(indexed.relationships)) - return min(durations) * 1000.0, max(rel_counts) - - ts_ms, ts_rels = _bench(ts_parser) - ag_ms, ag_rels = _bench(ag_parser) - - assert ts_rels > 0, "Expected relationships extracted via tree-sitter" - assert ag_rels > 0, "Expected relationships extracted via ast-grep" - - print("\nRelationship extraction benchmark (python, 1 file):") - print(f" tree-sitter: {ts_ms:.2f}ms, rels={ts_rels}") - print(f" ast-grep: {ag_ms:.2f}ms, rels={ag_rels}") - - def test_cross_encoder_reranking_latency_under_200ms(self): - """Cross-encoder rerank step 
completes under 200ms (excluding model load).""" - from codexlens.entities import SearchResult - from codexlens.search.ranking import cross_encoder_rerank - - query = "find function" - results = [ - SearchResult( - path=f"file_{idx}.py", - score=1.0 / (idx + 1), - excerpt=f"def func_{idx}():\n return {idx}", - symbol_name=f"func_{idx}", - symbol_kind="function", - ) - for idx in range(50) - ] - - class DummyReranker: - def score_pairs(self, pairs, batch_size=32): - _ = batch_size - # Return deterministic pseudo-logits to exercise sigmoid normalization. - return [float(i) for i in range(len(pairs))] - - reranker = DummyReranker() - - start = time.perf_counter() - reranked = cross_encoder_rerank(query, results, reranker, top_k=50, batch_size=32) - elapsed_ms = (time.perf_counter() - start) * 1000.0 - - assert len(reranked) == len(results) - assert any(r.metadata.get("cross_encoder_reranked") for r in reranked[:50]) - assert elapsed_ms < 200.0, f"Cross-encoder rerank too slow: {elapsed_ms:.1f}ms" diff --git a/codex-lens/tests/test_pure_vector_search.py b/codex-lens/tests/test_pure_vector_search.py deleted file mode 100644 index 3ba820fa..00000000 --- a/codex-lens/tests/test_pure_vector_search.py +++ /dev/null @@ -1,345 +0,0 @@ -"""Tests for pure vector search functionality.""" - -import pytest -import sqlite3 -import tempfile -import time -from pathlib import Path - -from codexlens.search.hybrid_search import HybridSearchEngine -from codexlens.storage.dir_index import DirIndexStore - -# Check if semantic dependencies are available -try: - from codexlens.semantic import SEMANTIC_AVAILABLE - SEMANTIC_DEPS_AVAILABLE = SEMANTIC_AVAILABLE -except ImportError: - SEMANTIC_DEPS_AVAILABLE = False - - -def _safe_unlink(path: Path, retries: int = 5, delay_s: float = 0.05) -> None: - """Best-effort unlink for Windows where SQLite can keep files locked briefly.""" - for attempt in range(retries): - try: - path.unlink() - return - except FileNotFoundError: - return - except 
PermissionError: - time.sleep(delay_s * (attempt + 1)) - try: - path.unlink(missing_ok=True) - except (PermissionError, OSError): - pass - - -class TestPureVectorSearch: - """Tests for pure vector search mode.""" - - @pytest.fixture - def sample_db(self): - """Create sample database with files.""" - with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: - db_path = Path(f.name) - - store = DirIndexStore(db_path) - store.initialize() - - # Add sample files - files = { - "auth.py": "def authenticate_user(username, password): pass", - "login.py": "def login_handler(credentials): pass", - "user.py": "class User: pass", - } - - with store._get_connection() as conn: - for path, content in files.items(): - conn.execute( - """INSERT INTO files (name, full_path, content, language, mtime) - VALUES (?, ?, ?, ?, ?)""", - (path, path, content, "python", 0.0) - ) - conn.commit() - - yield db_path - store.close() - - if db_path.exists(): - _safe_unlink(db_path) - - def test_pure_vector_without_embeddings(self, sample_db): - """Test pure_vector mode returns empty when no embeddings exist.""" - engine = HybridSearchEngine() - - results = engine.search( - sample_db, - "authentication", - limit=10, - enable_vector=True, - pure_vector=True, - ) - - # Should return empty list because no embeddings exist - assert isinstance(results, list) - assert len(results) == 0, \ - "Pure vector search should return empty when no embeddings exist" - - def test_vector_with_fallback(self, sample_db): - """Test vector mode (with fallback) returns FTS results when no embeddings.""" - engine = HybridSearchEngine() - - results = engine.search( - sample_db, - "authenticate", - limit=10, - enable_vector=True, - pure_vector=False, # Allow FTS fallback - ) - - # Should return FTS results even without embeddings - assert isinstance(results, list) - assert len(results) > 0, \ - "Vector mode with fallback should return FTS results" - - # Verify results come from exact FTS - paths = [r.path for r in 
results] - assert "auth.py" in paths, "Should find auth.py via FTS" - - def test_pure_vector_invalid_config(self, sample_db): - """Test pure_vector=True but enable_vector=False logs warning.""" - engine = HybridSearchEngine() - - # Invalid: pure_vector=True but enable_vector=False - results = engine.search( - sample_db, - "test", - limit=10, - enable_vector=False, - pure_vector=True, - ) - - # Should fallback to exact search - assert isinstance(results, list) - - def test_hybrid_mode_ignores_pure_vector(self, sample_db): - """Test hybrid mode works normally (ignores pure_vector).""" - engine = HybridSearchEngine() - - results = engine.search( - sample_db, - "authenticate", - limit=10, - enable_fuzzy=True, - enable_vector=False, - pure_vector=False, # Should be ignored in hybrid - ) - - # Should return results from exact + fuzzy - assert isinstance(results, list) - assert len(results) > 0 - - -@pytest.mark.skipif(not SEMANTIC_DEPS_AVAILABLE, reason="Semantic dependencies not available") -class TestPureVectorWithEmbeddings: - """Tests for pure vector search with actual embeddings.""" - - @pytest.fixture - def db_with_embeddings(self): - """Create database with embeddings.""" - with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: - db_path = Path(f.name) - - store = DirIndexStore(db_path) - store.initialize() - - # Add sample files - files = { - "auth/authentication.py": """ -def authenticate_user(username: str, password: str) -> bool: - '''Verify user credentials against database.''' - return check_password(username, password) - -def check_password(user: str, pwd: str) -> bool: - '''Check if password matches stored hash.''' - return True -""", - "auth/login.py": """ -def login_handler(credentials: dict) -> bool: - '''Handle user login request.''' - username = credentials.get('username') - password = credentials.get('password') - return authenticate_user(username, password) -""", - } - - with store._get_connection() as conn: - for path, content in 
files.items(): - name = path.split('/')[-1] - conn.execute( - """INSERT INTO files (name, full_path, content, language, mtime) - VALUES (?, ?, ?, ?, ?)""", - (name, path, content, "python", 0.0) - ) - conn.commit() - - # Generate embeddings - vector_store = None - try: - from codexlens.semantic.embedder import Embedder - from codexlens.semantic.vector_store import VectorStore - from codexlens.semantic.chunker import Chunker, ChunkConfig - - embedder = Embedder(profile="fast") # Use fast model for testing - vector_store = VectorStore(db_path) - chunker = Chunker(config=ChunkConfig(max_chunk_size=1000)) - - with sqlite3.connect(db_path) as conn: - conn.row_factory = sqlite3.Row - rows = conn.execute("SELECT full_path, content FROM files").fetchall() - - for row in rows: - chunks = chunker.chunk_sliding_window( - row["content"], - file_path=row["full_path"], - language="python" - ) - for chunk in chunks: - chunk.embedding = embedder.embed_single(chunk.content) - if chunks: - vector_store.add_chunks(chunks, row["full_path"]) - - except Exception as exc: - pytest.skip(f"Failed to generate embeddings: {exc}") - finally: - if vector_store is not None: - vector_store.close() - - yield db_path - store.close() - - if db_path.exists(): - _safe_unlink(db_path) - - def test_pure_vector_with_embeddings(self, db_with_embeddings): - """Test pure vector search returns results when embeddings exist.""" - engine = HybridSearchEngine() - - results = engine.search( - db_with_embeddings, - "how to verify user credentials", # Natural language query - limit=10, - enable_vector=True, - pure_vector=True, - ) - - # Should return results from vector search only - assert isinstance(results, list) - assert len(results) > 0, "Pure vector search should return results" - - # Results should have semantic relevance - for result in results: - assert result.score > 0 - assert result.path is not None - - def test_compare_pure_vs_hybrid(self, db_with_embeddings): - """Compare pure vector vs hybrid 
search results.""" - engine = HybridSearchEngine() - - # Pure vector search - pure_results = engine.search( - db_with_embeddings, - "verify credentials", - limit=10, - enable_vector=True, - pure_vector=True, - ) - - # Hybrid search - hybrid_results = engine.search( - db_with_embeddings, - "verify credentials", - limit=10, - enable_fuzzy=True, - enable_vector=True, - pure_vector=False, - ) - - # Both should return results - assert len(pure_results) > 0, "Pure vector should find results" - assert len(hybrid_results) > 0, "Hybrid should find results" - - # Hybrid may have more results (FTS + vector) - # But pure should still be useful for semantic queries - - -class TestSearchModeComparison: - """Compare different search modes.""" - - @pytest.fixture - def comparison_db(self): - """Create database for mode comparison.""" - with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: - db_path = Path(f.name) - - store = DirIndexStore(db_path) - store.initialize() - - files = { - "auth.py": "def authenticate(): pass", - "login.py": "def login(): pass", - } - - with store._get_connection() as conn: - for path, content in files.items(): - conn.execute( - """INSERT INTO files (name, full_path, content, language, mtime) - VALUES (?, ?, ?, ?, ?)""", - (path, path, content, "python", 0.0) - ) - conn.commit() - - yield db_path - store.close() - - if db_path.exists(): - _safe_unlink(db_path) - - def test_mode_comparison_without_embeddings(self, comparison_db): - """Compare all search modes without embeddings.""" - engine = HybridSearchEngine() - query = "authenticate" - - # Test each mode - modes = [ - ("exact", False, False, False), - ("fuzzy", True, False, False), - ("vector", False, True, False), # With fallback - ("pure_vector", False, True, True), # No fallback - ] - - results = {} - for mode_name, fuzzy, vector, pure in modes: - result = engine.search( - comparison_db, - query, - limit=10, - enable_fuzzy=fuzzy, - enable_vector=vector, - pure_vector=pure, - ) - 
results[mode_name] = len(result) - - # Assertions - assert results["exact"] > 0, "Exact should find results" - assert results["fuzzy"] >= results["exact"], "Fuzzy should find at least as many" - assert results["vector"] > 0, "Vector with fallback should find results (from FTS)" - assert results["pure_vector"] == 0, "Pure vector should return empty (no embeddings)" - - # Log comparison - print("\nMode comparison (without embeddings):") - for mode, count in results.items(): - print(f" {mode}: {count} results") - - -if __name__ == "__main__": - pytest.main([__file__, "-v", "-s"]) diff --git a/codex-lens/tests/test_query_parser.py b/codex-lens/tests/test_query_parser.py deleted file mode 100644 index 0809538d..00000000 --- a/codex-lens/tests/test_query_parser.py +++ /dev/null @@ -1,485 +0,0 @@ -"""Tests for query preprocessing and expansion (P1). - -Tests identifier splitting (CamelCase, snake_case, kebab-case), OR expansion, -and FTS5 operator preservation. -""" - -import pytest - -from codexlens.search.query_parser import QueryParser, preprocess_query - - -class TestQueryParserBasics: - """Basic tests for QueryParser class.""" - - def test_parser_initialization(self): - """Test QueryParser initializes with default settings.""" - parser = QueryParser() - assert parser.enable is True - assert parser.min_token_length == 2 - - def test_parser_disabled(self): - """Test parser with enable=False returns original query.""" - parser = QueryParser(enable=False) - result = parser.preprocess_query("UserAuth") - assert result == "UserAuth" - - def test_empty_query(self): - """Test empty query returns empty string.""" - parser = QueryParser() - assert parser.preprocess_query("") == "" - assert parser.preprocess_query(" ") == "" - - -class TestCamelCaseSplitting: - """Tests for CamelCase identifier splitting.""" - - def test_simple_camelcase(self): - """Test simple CamelCase splitting.""" - parser = QueryParser() - result = parser.preprocess_query("UserAuth") - # Should expand to: 
UserAuth OR User OR Auth - assert "UserAuth" in result - assert "User" in result - assert "Auth" in result - assert "OR" in result - - def test_lowercase_camelcase(self): - """Test lowerCamelCase splitting.""" - parser = QueryParser() - result = parser.preprocess_query("getUserData") - # Should expand: getUserData OR get OR User OR Data - assert "getUserData" in result - assert "get" in result - assert "User" in result - assert "Data" in result - - def test_all_caps_acronym(self): - """Test all-caps acronyms are not split.""" - parser = QueryParser() - result = parser.preprocess_query("HTTP") - # Should not split HTTP - assert "HTTP" in result - assert "OR" not in result or result == "HTTP" - - def test_mixed_acronym_camelcase(self): - """Test mixed acronym and CamelCase.""" - parser = QueryParser() - result = parser.preprocess_query("HTTPServer") - # Should handle mixed case - assert "HTTPServer" in result or "HTTP" in result - - -class TestSnakeCaseSplitting: - """Tests for snake_case identifier splitting.""" - - def test_simple_snake_case(self): - """Test simple snake_case splitting.""" - parser = QueryParser() - result = parser.preprocess_query("user_auth") - # Should expand: user_auth OR user OR auth - assert "user_auth" in result - assert "user" in result - assert "auth" in result - assert "OR" in result - - def test_multiple_underscores(self): - """Test splitting with multiple underscores.""" - parser = QueryParser() - result = parser.preprocess_query("get_user_data") - # Should expand: get_user_data OR get OR user OR data - assert "get_user_data" in result - assert "get" in result - assert "user" in result - assert "data" in result - - def test_leading_trailing_underscores(self): - """Test underscores at start/end.""" - parser = QueryParser() - result = parser.preprocess_query("_private_method_") - # Should handle gracefully - assert "private" in result - assert "method" in result - - -class TestKebabCaseSplitting: - """Tests for kebab-case identifier 
splitting.""" - - def test_simple_kebab_case(self): - """Test simple kebab-case splitting.""" - parser = QueryParser() - result = parser.preprocess_query("user-auth") - # Should expand: user-auth OR user OR auth - assert "user-auth" in result or "user" in result - assert "OR" in result - - def test_multiple_hyphens(self): - """Test splitting with multiple hyphens.""" - parser = QueryParser() - result = parser.preprocess_query("get-user-data") - # Should expand similar to snake_case - assert "get" in result - assert "user" in result - assert "data" in result - - -class TestQueryExpansion: - """Tests for OR query expansion.""" - - def test_expansion_includes_original(self): - """Test expansion always includes original query.""" - parser = QueryParser() - result = parser.preprocess_query("UserAuth") - # Original should be first - tokens = result.split(" OR ") - assert tokens[0] == "UserAuth" - - def test_expansion_or_operator(self): - """Test expansion uses OR operator.""" - parser = QueryParser() - result = parser.preprocess_query("getUserData") - assert " OR " in result - - def test_min_token_length_filtering(self): - """Test short tokens are filtered out.""" - parser = QueryParser(min_token_length=3) - result = parser.preprocess_query("getX") - # "X" should be filtered (len < 3) - assert "X" not in result or "getX" in result - assert "get" in result # "get" has len=3 - - def test_no_expansion_for_simple_word(self): - """Test simple words with no splitting return as-is.""" - parser = QueryParser() - result = parser.preprocess_query("function") - # No splitting needed, but may still have OR if single token - assert "function" in result - - def test_deduplication(self): - """Test duplicate tokens are deduplicated.""" - parser = QueryParser() - # Query that might produce duplicates after splitting - result = parser.preprocess_query("user_user") - tokens = result.split(" OR ") - # Should deduplicate "user" - user_count = tokens.count("user") - assert user_count == 1 - - 
-class TestFTS5OperatorPreservation: - """Tests for FTS5 operator preservation.""" - - def test_quoted_phrase_not_expanded(self): - """Test quoted phrases are not expanded.""" - parser = QueryParser() - result = parser.preprocess_query('"UserAuth"') - # Should preserve quoted phrase without expansion - assert result == '"UserAuth"' or '"UserAuth"' in result - - def test_or_operator_not_expanded(self): - """Test existing OR operator preserves query.""" - parser = QueryParser() - result = parser.preprocess_query("user OR auth") - # Should not double-expand - assert result == "user OR auth" - - def test_and_operator_not_expanded(self): - """Test AND operator preserves query.""" - parser = QueryParser() - result = parser.preprocess_query("user AND auth") - assert result == "user AND auth" - - def test_not_operator_not_expanded(self): - """Test NOT operator preserves query.""" - parser = QueryParser() - result = parser.preprocess_query("user NOT test") - assert result == "user NOT test" - - def test_near_operator_not_expanded(self): - """Test NEAR operator preserves query.""" - parser = QueryParser() - result = parser.preprocess_query("user NEAR auth") - assert result == "user NEAR auth" - - def test_wildcard_not_expanded(self): - """Test wildcard queries are not expanded.""" - parser = QueryParser() - result = parser.preprocess_query("auth*") - assert result == "auth*" - - def test_prefix_operator_not_expanded(self): - """Test prefix operator (^) preserves query.""" - parser = QueryParser() - result = parser.preprocess_query("^auth") - assert result == "^auth" - - -class TestMultiWordQueries: - """Tests for multi-word query expansion.""" - - def test_two_words(self): - """Test expansion of two-word query.""" - parser = QueryParser() - result = parser.preprocess_query("UserAuth DataModel") - # Should expand each word - assert "UserAuth" in result - assert "DataModel" in result - assert "User" in result - assert "Auth" in result - assert "Data" in result - assert "Model" 
in result - - def test_whitespace_separated_identifiers(self): - """Test whitespace-separated identifiers are expanded.""" - parser = QueryParser() - result = parser.preprocess_query("get_user create_token") - # Each word should be expanded - assert "get" in result - assert "user" in result - assert "create" in result - assert "token" in result - - -class TestConvenienceFunction: - """Tests for preprocess_query convenience function.""" - - def test_convenience_function_default(self): - """Test convenience function with default settings.""" - result = preprocess_query("UserAuth") - assert "UserAuth" in result - assert "OR" in result - - def test_convenience_function_disabled(self): - """Test convenience function with enable=False.""" - result = preprocess_query("UserAuth", enable=False) - assert result == "UserAuth" - - -@pytest.mark.parametrize("query,expected_tokens", [ - ("UserAuth", ["UserAuth", "User", "Auth"]), - ("user_auth", ["user_auth", "user", "auth"]), - ("get-user-data", ["get", "user", "data"]), - ("HTTPServer", ["HTTPServer", "HTTP", "Server"]), - ("getUserData", ["getUserData", "get", "User", "Data"]), -]) -class TestParameterizedSplitting: - """Parameterized tests for various identifier formats.""" - - def test_identifier_splitting(self, query, expected_tokens): - """Test identifier splitting produces expected tokens.""" - parser = QueryParser() - result = parser.preprocess_query(query) - - # Check all expected tokens are present - for token in expected_tokens: - assert token in result, f"Token '{token}' should be in result: {result}" - - -class TestEdgeCases: - """Edge case tests for query parsing.""" - - def test_single_character_word(self): - """Test single character words are filtered.""" - parser = QueryParser(min_token_length=2) - result = parser.preprocess_query("a") - # Single char should be filtered if below min_token_length - assert result == "a" or len(result) == 0 or result.strip() == "" - - def test_numbers_in_identifiers(self): - 
"""Test identifiers with numbers.""" - parser = QueryParser() - result = parser.preprocess_query("user123Auth") - # Should handle numbers gracefully - assert "user123Auth" in result - - def test_special_characters(self): - """Test identifiers with special characters.""" - parser = QueryParser() - result = parser.preprocess_query("user$auth") - # Should handle special chars - assert isinstance(result, str) - - def test_unicode_identifiers(self): - """Test Unicode identifiers.""" - parser = QueryParser() - result = parser.preprocess_query("用户认证") - # Should handle Unicode without errors - assert isinstance(result, str) - assert "用户认证" in result - - def test_very_long_identifier(self): - """Test very long identifier names.""" - parser = QueryParser() - long_name = "VeryLongCamelCaseIdentifierNameThatExceedsNormalLength" - result = parser.preprocess_query(long_name) - # Should handle long names - assert long_name in result - - def test_mixed_case_styles(self): - """Test mixed CamelCase and snake_case.""" - parser = QueryParser() - result = parser.preprocess_query("User_Auth") - # Should handle mixed styles - assert "User_Auth" in result or "User" in result - assert "Auth" in result - - -class TestTokenExtractionLogic: - """Tests for internal token extraction logic.""" - - def test_extract_tokens_from_camelcase(self): - """Test _split_camel_case method.""" - parser = QueryParser() - tokens = parser._split_camel_case("getUserData") - # Should split into: get, User, Data - assert "get" in tokens - assert "User" in tokens - assert "Data" in tokens - - def test_extract_tokens_from_snake_case(self): - """Test _split_snake_case method.""" - parser = QueryParser() - tokens = parser._split_snake_case("get_user_data") - # Should split into: get, user, data - assert "get" in tokens - assert "user" in tokens - assert "data" in tokens - - def test_extract_tokens_from_kebab_case(self): - """Test _split_kebab_case method.""" - parser = QueryParser() - tokens = 
parser._split_kebab_case("get-user-data") - # Should split into: get, user, data - assert "get" in tokens - assert "user" in tokens - assert "data" in tokens - - def test_extract_tokens_combines_strategies(self): - """Test _extract_tokens uses all splitting strategies.""" - parser = QueryParser() - # Mix of styles - tokens = parser._extract_tokens("getUserData_v2") - # Should extract: getUserData_v2, get, User, Data, v2 - assert "getUserData_v2" in tokens - assert "get" in tokens or "User" in tokens - - -class TestQueryParserIntegration: - """Integration tests for query parser.""" - - def test_real_world_query_examples(self): - """Test real-world query examples.""" - parser = QueryParser() - - queries = [ - "AuthenticationService", - "get_user_by_id", - "create-new-user", - "HTTPRequest", - "parseJSONData", - ] - - for query in queries: - result = parser.preprocess_query(query) - # Should produce valid expanded query - assert isinstance(result, str) - assert len(result) > 0 - assert query in result # Original should be included - - def test_parser_performance(self): - """Test parser performance with many queries.""" - parser = QueryParser() - - # Process 1000 queries - for i in range(1000): - query = f"getUserData{i}" - result = parser.preprocess_query(query) - assert isinstance(result, str) - - -class TestMinTokenLength: - """Tests for min_token_length parameter.""" - - def test_custom_min_token_length(self): - """Test custom min_token_length filters tokens.""" - parser = QueryParser(min_token_length=4) - result = parser.preprocess_query("getUserData") - # Tokens with len < 4 should be filtered - assert "get" not in result or "getUserData" in result # "get" has len=3 - assert "User" in result # "User" has len=4 - assert "Data" in result # "Data" has len=4 - - def test_min_token_length_zero(self): - """Test min_token_length=0 includes all tokens.""" - parser = QueryParser(min_token_length=0) - result = parser.preprocess_query("getX") - # All tokens should be 
included - assert "get" in result - assert "X" in result or "getX" in result - - def test_min_token_length_one(self): - """Test min_token_length=1 includes single char tokens.""" - parser = QueryParser(min_token_length=1) - result = parser.preprocess_query("aB") - # Should include "a" and "B" - assert "a" in result or "aB" in result - assert "B" in result or "aB" in result - - - - -class TestComplexBooleanQueries: - """Tests for complex boolean query parsing.""" - - @pytest.fixture - def parser(self): - return QueryParser() - - def test_nested_boolean_and_or(self, parser): - """Test parser preserves nested boolean logic: (A OR B) AND C.""" - query = "(login OR logout) AND user" - expanded = parser.preprocess_query(query) - - # Should preserve parentheses and boolean operators - assert "(" in expanded - assert ")" in expanded - assert "AND" in expanded - assert "OR" in expanded - - def test_mixed_operators_with_expansion(self, parser): - """Test CamelCase expansion doesn't break boolean operators.""" - query = "UserAuth AND (login OR logout)" - expanded = parser.preprocess_query(query) - - # Should expand UserAuth but preserve operators - assert "User" in expanded or "Auth" in expanded - assert "AND" in expanded - assert "OR" in expanded - assert "(" in expanded - - def test_quoted_phrases_with_boolean(self, parser): - """Test quoted phrases preserved with boolean operators.""" - query = '"user authentication" AND login' - expanded = parser.preprocess_query(query) - - # Quoted phrase should remain intact - assert '"user authentication"' in expanded or '"' in expanded - assert "AND" in expanded - - def test_not_operator_preservation(self, parser): - """Test NOT operator is preserved correctly.""" - query = "login NOT logout" - expanded = parser.preprocess_query(query) - - assert "NOT" in expanded - assert "login" in expanded - assert "logout" in expanded - - def test_complex_nested_three_levels(self, parser): - """Test deeply nested boolean logic: ((A OR B) AND C) OR 
D.""" - query = "((UserAuth OR login) AND session) OR token" - expanded = parser.preprocess_query(query) - - # Should handle multiple nesting levels - assert expanded.count("(") >= 2 # At least 2 opening parens - assert expanded.count(")") >= 2 # At least 2 closing parens diff --git a/codex-lens/tests/test_ranking.py b/codex-lens/tests/test_ranking.py deleted file mode 100644 index a082d22e..00000000 --- a/codex-lens/tests/test_ranking.py +++ /dev/null @@ -1,782 +0,0 @@ -"""Unit tests for ranking.py - RRF weights, intent detection, score fusion, and filtering. - -Tests cover: -- detect_query_intent: CamelCase/underscore -> KEYWORD, natural language -> SEMANTIC, mixed -- adjust_weights_by_intent: Weight adjustments per intent type -- get_rrf_weights: Composite of detect + adjust -- reciprocal_rank_fusion: Single/multi source, empty, weight normalization -- simple_weighted_fusion: Basic fusion and empty input -- apply_symbol_boost: Symbol match boost and no-match scenario -- filter_results_by_category: KEYWORD -> code only, SEMANTIC -> docs priority -- group_similar_results: Group results by score proximity -- normalize_weights: All-zero weights edge case -""" - -from __future__ import annotations - -import math -from typing import Dict, List -from unittest.mock import MagicMock - -import pytest - -from codexlens.entities import SearchResult -from codexlens.search.ranking import ( - DEFAULT_WEIGHTS, - QueryIntent, - apply_path_penalties, - extract_explicit_path_hints, - cross_encoder_rerank, - adjust_weights_by_intent, - apply_symbol_boost, - detect_query_intent, - filter_results_by_category, - get_rrf_weights, - group_similar_results, - is_auxiliary_reference_path, - is_generated_artifact_path, - is_test_file, - normalize_weights, - query_prefers_lexical_search, - query_targets_auxiliary_files, - query_targets_generated_files, - query_targets_test_files, - rebalance_noisy_results, - reciprocal_rank_fusion, - simple_weighted_fusion, -) - - -# 
============================================================================= -# Helpers -# ============================================================================= - - -def _make_result( - path: str = "a.py", - score: float = 0.5, - excerpt: str = "def foo():", - symbol_name: str | None = None, - symbol_kind: str | None = None, - start_line: int | None = None, - end_line: int | None = None, -) -> SearchResult: - """Create a SearchResult with sensible defaults.""" - return SearchResult( - path=path, - score=score, - excerpt=excerpt, - symbol_name=symbol_name, - symbol_kind=symbol_kind, - start_line=start_line, - end_line=end_line, - ) - - -# ============================================================================= -# Tests: detect_query_intent -# ============================================================================= - - -class TestDetectQueryIntent: - """Tests for detect_query_intent().""" - - def test_detect_keyword_intent(self): - """CamelCase/underscore queries should be detected as KEYWORD.""" - assert detect_query_intent("MyClassName") == QueryIntent.KEYWORD - assert detect_query_intent("windowsHide") == QueryIntent.KEYWORD - assert detect_query_intent("my_function_name") == QueryIntent.KEYWORD - assert detect_query_intent("foo::bar") == QueryIntent.KEYWORD - - def test_detect_semantic_intent(self): - """Natural language queries should be detected as SEMANTIC.""" - assert detect_query_intent("how to authenticate users safely?") == QueryIntent.SEMANTIC - assert detect_query_intent("explain the login process") == QueryIntent.SEMANTIC - - def test_detect_mixed_intent(self): - """Queries with both code and NL signals should be MIXED.""" - # Has code signal (underscore identifier) and NL signal ("how") - assert detect_query_intent("how does my_function work") == QueryIntent.MIXED - - def test_detect_empty_query(self): - """Empty string should return MIXED (safe default).""" - assert detect_query_intent("") == QueryIntent.MIXED - assert 
detect_query_intent(" ") == QueryIntent.MIXED - - def test_query_targets_test_files(self): - """Queries explicitly mentioning tests should skip test penalties.""" - assert query_targets_test_files("how do tests cover auth flow?") - assert query_targets_test_files("spec fixtures for parser") - assert not query_targets_test_files("windowsHide") - - def test_query_targets_generated_files(self): - """Queries explicitly mentioning build artifacts should skip that penalty.""" - assert query_targets_generated_files("inspect dist bundle output") - assert query_targets_generated_files("generated artifacts under build") - assert not query_targets_generated_files("cache invalidation strategy") - - def test_query_prefers_lexical_search(self): - """Config/env/factory queries should prefer lexical-first routing.""" - assert query_prefers_lexical_search("embedding backend fastembed local litellm api config") - assert query_prefers_lexical_search("get_reranker factory onnx backend selection") - assert query_prefers_lexical_search("EMBEDDING_BACKEND and RERANKER_BACKEND environment variables") - assert not query_prefers_lexical_search("how does smart search route keyword queries") - - -# ============================================================================= -# Tests: adjust_weights_by_intent -# ============================================================================= - - -class TestAdjustWeightsByIntent: - """Tests for adjust_weights_by_intent().""" - - def test_adjust_keyword_weights(self): - """KEYWORD intent should boost exact and reduce vector.""" - base = {"exact": 0.3, "fuzzy": 0.1, "vector": 0.6} - adjusted = adjust_weights_by_intent(QueryIntent.KEYWORD, base) - # Expected target: exact:0.5, fuzzy:0.1, vector:0.4 - assert adjusted["exact"] == pytest.approx(0.5, abs=0.01) - assert adjusted["fuzzy"] == pytest.approx(0.1, abs=0.01) - assert adjusted["vector"] == pytest.approx(0.4, abs=0.01) - - def test_adjust_semantic_weights(self): - """SEMANTIC intent should boost 
vector and reduce exact.""" - base = {"exact": 0.3, "fuzzy": 0.1, "vector": 0.6} - adjusted = adjust_weights_by_intent(QueryIntent.SEMANTIC, base) - # Expected target: exact:0.2, fuzzy:0.1, vector:0.7 - assert adjusted["exact"] == pytest.approx(0.2, abs=0.01) - assert adjusted["fuzzy"] == pytest.approx(0.1, abs=0.01) - assert adjusted["vector"] == pytest.approx(0.7, abs=0.01) - - def test_adjust_mixed_weights(self): - """MIXED intent should return normalized base_weights.""" - base = {"exact": 0.3, "fuzzy": 0.1, "vector": 0.6} - adjusted = adjust_weights_by_intent(QueryIntent.MIXED, base) - # MIXED returns normalized base_weights - total = sum(adjusted.values()) - assert total == pytest.approx(1.0, abs=0.01) - # Proportions should be preserved - assert adjusted["exact"] == pytest.approx(0.3, abs=0.01) - - -class TestPathPenalties: - """Tests for lightweight path-based ranking penalties.""" - - def test_is_test_file(self): - assert is_test_file("/repo/tests/test_auth.py") - assert is_test_file("D:\\repo\\src\\auth.spec.ts") - assert is_test_file("/repo/frontend/src/pages/discoverypage.test.tsx") - assert is_test_file("/repo/frontend/src/pages/discoverypage.spec.jsx") - assert not is_test_file("/repo/src/auth.py") - - def test_is_generated_artifact_path(self): - assert is_generated_artifact_path("/repo/dist/app.js") - assert is_generated_artifact_path("/repo/src/generated/client.ts") - assert is_generated_artifact_path("D:\\repo\\frontend\\.next\\server.js") - assert not is_generated_artifact_path("/repo/src/auth.py") - - def test_is_auxiliary_reference_path(self): - assert is_auxiliary_reference_path("/repo/examples/auth_demo.py") - assert is_auxiliary_reference_path("/repo/benchmarks/search_eval.py") - assert is_auxiliary_reference_path("/repo/tools/debug_search.py") - assert not is_auxiliary_reference_path("/repo/src/auth.py") - - def test_query_targets_auxiliary_files(self): - assert query_targets_auxiliary_files("show smart search examples") - assert 
query_targets_auxiliary_files("benchmark smart search") - assert not query_targets_auxiliary_files("smart search routing") - - def test_apply_path_penalties_demotes_test_files(self): - results = [ - _make_result(path="/repo/tests/test_auth.py", score=10.0), - _make_result(path="/repo/src/auth.py", score=9.0), - ] - - penalized = apply_path_penalties( - results, - "authenticate user", - test_file_penalty=0.15, - ) - - assert penalized[0].path == "/repo/src/auth.py" - assert penalized[1].metadata["path_penalty_reasons"] == ["test_file"] - - def test_apply_path_penalties_more_aggressively_demotes_tests_for_keyword_queries(self): - results = [ - _make_result(path="/repo/tests/test_auth.py", score=5.0), - _make_result(path="/repo/src/auth.py", score=4.0), - ] - - penalized = apply_path_penalties( - results, - "find_descendant_project_roots", - test_file_penalty=0.15, - ) - - assert penalized[0].path == "/repo/src/auth.py" - assert penalized[1].metadata["path_penalty_reasons"] == ["test_file"] - assert penalized[1].metadata["path_penalty_multiplier"] == pytest.approx(0.55) - assert penalized[1].metadata["path_rank_multiplier"] == pytest.approx(0.55) - - def test_apply_path_penalties_more_aggressively_demotes_tests_for_semantic_queries(self): - results = [ - _make_result(path="/repo/tests/test_auth.py", score=5.0), - _make_result(path="/repo/src/auth.py", score=4.1), - ] - - penalized = apply_path_penalties( - results, - "how does auth routing work", - test_file_penalty=0.15, - ) - - assert penalized[0].path == "/repo/src/auth.py" - assert penalized[1].metadata["path_penalty_reasons"] == ["test_file"] - assert penalized[1].metadata["path_penalty_multiplier"] == pytest.approx(0.75) - - def test_apply_path_penalties_boosts_source_definitions_for_identifier_queries(self): - results = [ - _make_result( - path="/repo/tests/test_registry.py", - score=4.2, - excerpt='query="find_descendant_project_roots"', - ), - _make_result( - path="/repo/src/registry.py", - score=3.0, - 
excerpt="def find_descendant_project_roots(self, source_root: Path) -> list[str]:", - ), - ] - - penalized = apply_path_penalties( - results, - "find_descendant_project_roots", - test_file_penalty=0.15, - ) - - assert penalized[0].path == "/repo/src/registry.py" - assert penalized[0].metadata["path_boost_reasons"] == ["source_definition"] - assert penalized[0].metadata["path_boost_multiplier"] == pytest.approx(2.0) - assert penalized[0].metadata["path_rank_multiplier"] == pytest.approx(2.0) - assert penalized[1].metadata["path_penalty_reasons"] == ["test_file"] - - def test_apply_path_penalties_boosts_source_paths_for_semantic_feature_queries(self): - results = [ - _make_result( - path="/repo/tests/smart-search-intent.test.js", - score=0.832, - excerpt="describes how smart search routes keyword queries", - ), - _make_result( - path="/repo/src/tools/smart-search.ts", - score=0.555, - excerpt="smart search keyword routing logic", - ), - ] - - penalized = apply_path_penalties( - results, - "how does smart search route keyword queries", - test_file_penalty=0.15, - ) - - assert penalized[0].path == "/repo/src/tools/smart-search.ts" - assert penalized[0].metadata["path_boost_reasons"] == ["source_path_topic_overlap"] - assert penalized[0].metadata["path_boost_multiplier"] == pytest.approx(1.35) - assert penalized[0].metadata["path_boost_overlap_tokens"] == ["smart", "search"] - assert penalized[1].metadata["path_penalty_reasons"] == ["test_file"] - - def test_apply_path_penalties_strongly_boosts_keyword_basename_overlap(self): - results = [ - _make_result( - path="/repo/src/tools/core-memory.ts", - score=0.04032417772512223, - excerpt="memory listing helpers", - ), - _make_result( - path="/repo/src/tools/smart-search.ts", - score=0.009836065573770493, - excerpt="smart search keyword routing logic", - ), - ] - - penalized = apply_path_penalties( - results, - "executeHybridMode dense_rerank semantic smart_search", - test_file_penalty=0.15, - ) - - assert penalized[0].path 
== "/repo/src/tools/smart-search.ts" - assert penalized[0].metadata["path_boost_reasons"] == ["source_path_topic_overlap"] - assert penalized[0].metadata["path_boost_multiplier"] == pytest.approx(4.5) - assert penalized[0].metadata["path_boost_overlap_tokens"] == ["smart", "search"] - - def test_extract_explicit_path_hints_ignores_generic_platform_terms(self): - assert extract_explicit_path_hints( - "parse CodexLens JSON output strip ANSI smart_search", - ) == [["smart", "search"]] - - def test_apply_path_penalties_prefers_explicit_feature_hint_over_platform_terms(self): - results = [ - _make_result( - path="/repo/src/tools/codex-lens-lsp.ts", - score=0.045, - excerpt="CodexLens LSP bridge", - ), - _make_result( - path="/repo/src/tools/smart-search.ts", - score=0.03, - excerpt="parse JSON output and strip ANSI for plain-text fallback", - ), - ] - - penalized = apply_path_penalties( - results, - "parse CodexLens JSON output strip ANSI smart_search", - test_file_penalty=0.15, - ) - - assert penalized[0].path == "/repo/src/tools/smart-search.ts" - assert penalized[0].metadata["path_boost_reasons"] == ["source_path_topic_overlap"] - assert penalized[0].metadata["path_boost_overlap_tokens"] == ["smart", "search"] - - def test_apply_path_penalties_strongly_boosts_lexical_config_modules(self): - results = [ - _make_result( - path="/repo/src/tools/smart-search.ts", - score=22.07, - excerpt="embedding backend local api config routing", - ), - _make_result( - path="/repo/src/codexlens/config.py", - score=4.88, - excerpt="embedding_backend = 'fastembed'", - ), - ] - - penalized = apply_path_penalties( - results, - "embedding backend fastembed local litellm api config", - test_file_penalty=0.15, - ) - - assert penalized[0].path == "/repo/src/codexlens/config.py" - assert penalized[0].metadata["path_boost_reasons"] == ["source_path_topic_overlap"] - assert penalized[0].metadata["path_boost_multiplier"] == pytest.approx(5.0) - assert 
penalized[0].metadata["path_boost_overlap_tokens"] == ["config"] - - def test_apply_path_penalties_more_aggressively_demotes_tests_for_explicit_feature_queries(self): - results = [ - _make_result( - path="/repo/tests/smart-search-intent.test.js", - score=1.0, - excerpt="smart search intent coverage", - ), - _make_result( - path="/repo/src/tools/smart-search.ts", - score=0.58, - excerpt="plain-text JSON fallback for smart search", - ), - ] - - penalized = apply_path_penalties( - results, - "parse CodexLens JSON output strip ANSI smart_search", - test_file_penalty=0.15, - ) - - assert penalized[0].path == "/repo/src/tools/smart-search.ts" - assert penalized[1].metadata["path_penalty_reasons"] == ["test_file"] - assert penalized[1].metadata["path_penalty_multiplier"] == pytest.approx(0.55) - - def test_apply_path_penalties_demotes_generated_artifacts(self): - results = [ - _make_result(path="/repo/dist/auth.js", score=10.0), - _make_result(path="/repo/src/auth.ts", score=9.0), - ] - - penalized = apply_path_penalties( - results, - "authenticate user", - generated_file_penalty=0.35, - ) - - assert penalized[0].path == "/repo/src/auth.ts" - assert penalized[1].metadata["path_penalty_reasons"] == ["generated_artifact"] - - def test_apply_path_penalties_more_aggressively_demotes_generated_artifacts_for_explicit_feature_queries(self): - results = [ - _make_result( - path="/repo/dist/tools/smart-search.js", - score=1.0, - excerpt="built smart search output", - ), - _make_result( - path="/repo/src/tools/smart-search.ts", - score=0.45, - excerpt="plain-text JSON fallback for smart search", - ), - ] - - penalized = apply_path_penalties( - results, - "parse CodexLens JSON output strip ANSI smart_search", - generated_file_penalty=0.35, - ) - - assert penalized[0].path == "/repo/src/tools/smart-search.ts" - assert penalized[1].metadata["path_penalty_reasons"] == ["generated_artifact"] - assert penalized[1].metadata["path_penalty_multiplier"] == pytest.approx(0.4) - - def 
test_apply_path_penalties_demotes_auxiliary_reference_files(self): - results = [ - _make_result(path="/repo/examples/simple_search_comparison.py", score=10.0), - _make_result(path="/repo/src/search/router.py", score=9.0), - ] - - penalized = apply_path_penalties( - results, - "how does smart search route keyword queries", - test_file_penalty=0.15, - ) - - assert penalized[0].path == "/repo/src/search/router.py" - assert penalized[1].metadata["path_penalty_reasons"] == ["auxiliary_file"] - - def test_apply_path_penalties_more_aggressively_demotes_auxiliary_files_for_explicit_feature_queries(self): - results = [ - _make_result( - path="/repo/benchmarks/smart_search_demo.py", - score=1.0, - excerpt="demo for smart search fallback", - ), - _make_result( - path="/repo/src/tools/smart-search.ts", - score=0.52, - excerpt="plain-text JSON fallback for smart search", - ), - ] - - penalized = apply_path_penalties( - results, - "parse CodexLens JSON output strip ANSI smart_search", - test_file_penalty=0.15, - ) - - assert penalized[0].path == "/repo/src/tools/smart-search.ts" - assert penalized[1].metadata["path_penalty_reasons"] == ["auxiliary_file"] - assert penalized[1].metadata["path_penalty_multiplier"] == pytest.approx(0.5) - - def test_apply_path_penalties_skips_when_query_targets_tests(self): - results = [ - _make_result(path="/repo/tests/test_auth.py", score=10.0), - _make_result(path="/repo/src/auth.py", score=9.0), - ] - - penalized = apply_path_penalties( - results, - "auth tests", - test_file_penalty=0.15, - ) - - assert penalized[0].path == "/repo/tests/test_auth.py" - - def test_apply_path_penalties_skips_generated_penalty_when_query_targets_artifacts(self): - results = [ - _make_result(path="/repo/dist/auth.js", score=10.0), - _make_result(path="/repo/src/auth.ts", score=9.0), - ] - - penalized = apply_path_penalties( - results, - "dist auth bundle", - generated_file_penalty=0.35, - ) - - assert penalized[0].path == "/repo/dist/auth.js" - - def 
test_rebalance_noisy_results_pushes_explicit_feature_query_noise_behind_source_files(self): - results = [ - _make_result(path="/repo/src/tools/smart-search.ts", score=0.9), - _make_result(path="/repo/tests/smart-search-intent.test.tsx", score=0.8), - _make_result(path="/repo/src/core/cli-routes.ts", score=0.7), - _make_result(path="/repo/dist/tools/smart-search.js", score=0.6), - _make_result(path="/repo/benchmarks/smart_search_demo.py", score=0.5), - ] - - rebalanced = rebalance_noisy_results( - results, - "parse CodexLens JSON output strip ANSI smart_search", - ) - - assert [item.path for item in rebalanced[:2]] == [ - "/repo/src/tools/smart-search.ts", - "/repo/src/core/cli-routes.ts", - ] - - def test_rebalance_noisy_results_preserves_tests_when_query_targets_them(self): - results = [ - _make_result(path="/repo/tests/smart-search-intent.test.tsx", score=0.9), - _make_result(path="/repo/src/tools/smart-search.ts", score=0.8), - ] - - rebalanced = rebalance_noisy_results(results, "smart search tests") - - assert [item.path for item in rebalanced] == [ - "/repo/tests/smart-search-intent.test.tsx", - "/repo/src/tools/smart-search.ts", - ] - - def test_apply_path_penalties_skips_auxiliary_penalty_when_query_targets_examples(self): - results = [ - _make_result(path="/repo/examples/simple_search_comparison.py", score=10.0), - _make_result(path="/repo/src/search/router.py", score=9.0), - ] - - penalized = apply_path_penalties( - results, - "smart search examples", - test_file_penalty=0.15, - ) - - assert penalized[0].path == "/repo/examples/simple_search_comparison.py" - - -class TestCrossEncoderRerank: - """Tests for cross-encoder reranking edge cases.""" - - def test_cross_encoder_rerank_preserves_strong_source_candidates_for_semantic_feature_queries(self): - class DummyReranker: - def score_pairs(self, pairs, batch_size=32): - _ = (pairs, batch_size) - return [0.8323705792427063, 1.2463066923373844e-05] - - reranked = cross_encoder_rerank( - "how does smart search 
route keyword queries", - [ - _make_result( - path="/repo/tests/smart-search-intent.test.js", - score=0.5989155769348145, - excerpt="describes how smart search routes keyword queries", - ), - _make_result( - path="/repo/src/tools/smart-search.ts", - score=0.554444432258606, - excerpt="smart search keyword routing logic", - ), - ], - DummyReranker(), - top_k=2, - ) - reranked = apply_path_penalties( - reranked, - "how does smart search route keyword queries", - test_file_penalty=0.15, - ) - - assert reranked[0].path == "/repo/src/tools/smart-search.ts" - assert reranked[0].metadata["cross_encoder_floor_reason"] == "semantic_source_path_overlap" - assert reranked[0].metadata["cross_encoder_floor_overlap_tokens"] == ["smart", "search"] - assert reranked[0].metadata["path_boost_reasons"] == ["source_path_topic_overlap"] - assert reranked[1].metadata["path_penalty_reasons"] == ["test_file"] - -# ============================================================================= -# Tests: get_rrf_weights -# ============================================================================= - - -class TestGetRrfWeights: - """Tests for get_rrf_weights() composite function.""" - - def test_get_rrf_weights_composite(self): - """get_rrf_weights should compose detect_query_intent + adjust_weights_by_intent.""" - base = {"exact": 0.3, "fuzzy": 0.1, "vector": 0.6} - # Keyword-like query - weights = get_rrf_weights("MyClassName", base) - # MyClassName -> KEYWORD -> exact boosted - assert weights["exact"] > weights["fuzzy"] - - -# ============================================================================= -# Tests: reciprocal_rank_fusion -# ============================================================================= - - -class TestReciprocalRankFusion: - """Tests for reciprocal_rank_fusion().""" - - def test_rrf_single_source(self): - """Single source RRF should produce ranked results.""" - results = { - "exact": [ - _make_result(path="a.py", score=10.0), - _make_result(path="b.py", 
score=5.0), - ] - } - fused = reciprocal_rank_fusion(results) - assert len(fused) == 2 - # a.py should rank higher (rank 1) - assert fused[0].path == "a.py" - assert fused[0].score > fused[1].score - - def test_rrf_multi_source(self): - """Multi-source RRF should combine rankings from multiple sources.""" - results = { - "exact": [ - _make_result(path="a.py", score=10.0), - _make_result(path="b.py", score=5.0), - ], - "vector": [ - _make_result(path="b.py", score=0.9), - _make_result(path="c.py", score=0.8), - ], - } - weights = {"exact": 0.5, "vector": 0.5} - fused = reciprocal_rank_fusion(results, weights=weights) - # b.py appears in both sources - should have highest fusion score - assert len(fused) == 3 - assert fused[0].path == "b.py" - assert fused[0].metadata["fusion_method"] == "rrf" - - def test_rrf_empty_results(self): - """Empty results map should return empty list.""" - assert reciprocal_rank_fusion({}) == [] - - def test_rrf_weight_normalization(self): - """Weights not summing to 1.0 should be auto-normalized.""" - results = { - "exact": [_make_result(path="a.py", score=10.0)], - } - weights = {"exact": 2.0} # Does not sum to 1.0 - fused = reciprocal_rank_fusion(results, weights=weights) - assert len(fused) == 1 - # Result should still be valid after weight normalization - assert fused[0].score > 0 - - -# ============================================================================= -# Tests: simple_weighted_fusion -# ============================================================================= - - -class TestSimpleWeightedFusion: - """Tests for simple_weighted_fusion().""" - - def test_weighted_fusion_basic(self): - """Basic weighted fusion should combine scores.""" - results = { - "exact": [_make_result(path="a.py", score=10.0)], - "vector": [_make_result(path="a.py", score=0.8)], - } - weights = {"exact": 0.5, "vector": 0.5} - fused = simple_weighted_fusion(results, weights=weights) - assert len(fused) == 1 - assert fused[0].path == "a.py" - assert 
fused[0].metadata["fusion_method"] == "simple_weighted" - assert fused[0].score > 0 - - def test_weighted_fusion_empty(self): - """Empty input should return empty list.""" - assert simple_weighted_fusion({}) == [] - - -# ============================================================================= -# Tests: apply_symbol_boost -# ============================================================================= - - -class TestApplySymbolBoost: - """Tests for apply_symbol_boost().""" - - def test_symbol_boost_applied(self): - """Results with symbol_name should get boosted by factor.""" - results = [ - _make_result(path="a.py", score=0.5, symbol_name="authenticate"), - _make_result(path="b.py", score=0.6), - ] - boosted = apply_symbol_boost(results, boost_factor=1.5) - # a.py has symbol -> gets 1.5x boost -> 0.75 - a_result = next(r for r in boosted if r.path == "a.py") - assert a_result.score == pytest.approx(0.75, abs=0.01) - assert a_result.metadata.get("boosted") is True - - def test_symbol_boost_no_match(self): - """Results without symbol_name should not be boosted.""" - results = [ - _make_result(path="a.py", score=0.5), - ] - boosted = apply_symbol_boost(results, boost_factor=1.5) - assert boosted[0].score == pytest.approx(0.5, abs=0.01) - assert boosted[0].metadata.get("boosted") is not True - - -# ============================================================================= -# Tests: filter_results_by_category -# ============================================================================= - - -class TestFilterResultsByCategory: - """Tests for filter_results_by_category().""" - - def test_filter_keyword_code_only(self): - """KEYWORD intent should return only code files.""" - results = [ - _make_result(path="main.py", score=0.9), - _make_result(path="README.md", score=0.8), - _make_result(path="utils.ts", score=0.7), - ] - filtered = filter_results_by_category(results, QueryIntent.KEYWORD) - paths = [r.path for r in filtered] - assert "README.md" not in paths - 
assert "main.py" in paths - assert "utils.ts" in paths - - def test_filter_semantic_docs_first(self): - """SEMANTIC intent should put docs before code.""" - results = [ - _make_result(path="main.py", score=0.9), - _make_result(path="README.md", score=0.8), - ] - filtered = filter_results_by_category(results, QueryIntent.SEMANTIC, allow_mixed=True) - # Docs should come first - assert filtered[0].path == "README.md" - - -# ============================================================================= -# Tests: group_similar_results -# ============================================================================= - - -class TestGroupSimilarResults: - """Tests for group_similar_results().""" - - def test_group_similar_results(self): - """Results with same excerpt and close scores should be grouped.""" - results = [ - _make_result(path="a.py", score=0.50, excerpt="def foo():"), - _make_result(path="b.py", score=0.50, excerpt="def foo():"), - _make_result(path="c.py", score=0.30, excerpt="def bar():"), - ] - grouped = group_similar_results(results, score_threshold_abs=0.01) - # a.py and b.py should be grouped (same excerpt, same score) - assert len(grouped) == 2 - # Find the grouped result - grouped_result = next(r for r in grouped if r.path == "a.py") - assert len(grouped_result.additional_locations) == 1 - assert grouped_result.additional_locations[0].path == "b.py" - - -# ============================================================================= -# Tests: normalize_weights -# ============================================================================= - - -class TestNormalizeWeights: - """Tests for normalize_weights().""" - - def test_normalize_weights_zero_total(self): - """All-zero weights should be returned as-is (no division by zero).""" - weights = {"exact": 0.0, "fuzzy": 0.0, "vector": 0.0} - result = normalize_weights(weights) - assert result == {"exact": 0.0, "fuzzy": 0.0, "vector": 0.0} diff --git a/codex-lens/tests/test_recursive_splitting.py 
b/codex-lens/tests/test_recursive_splitting.py deleted file mode 100644 index 5a3297d3..00000000 --- a/codex-lens/tests/test_recursive_splitting.py +++ /dev/null @@ -1,291 +0,0 @@ -"""Tests for recursive splitting of large symbols in chunker.""" - -import pytest -from codexlens.entities import Symbol -from codexlens.semantic.chunker import Chunker, ChunkConfig - - -class TestRecursiveSplitting: - """Test cases for recursive splitting of large symbols.""" - - def test_small_symbol_no_split(self): - """Test that small symbols are not split.""" - config = ChunkConfig(max_chunk_size=1000, overlap=100) - chunker = Chunker(config) - - content = '''def small_function(): - # This is a small function - x = 1 - y = 2 - return x + y -''' - symbols = [Symbol(name='small_function', kind='function', range=(1, 5))] - - chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python') - - assert len(chunks) == 1 - assert chunks[0].metadata['strategy'] == 'symbol' - assert chunks[0].metadata['symbol_name'] == 'small_function' - assert chunks[0].metadata['symbol_kind'] == 'function' - assert 'parent_symbol_range' not in chunks[0].metadata - - def test_large_symbol_splits(self): - """Test that large symbols are recursively split.""" - config = ChunkConfig(max_chunk_size=100, overlap=20) - chunker = Chunker(config) - - content = '''def large_function(): - # Line 1 - # Line 2 - # Line 3 - # Line 4 - # Line 5 - # Line 6 - # Line 7 - # Line 8 - # Line 9 - # Line 10 - # Line 11 - # Line 12 - # Line 13 - # Line 14 - # Line 15 - pass -''' - symbols = [Symbol(name='large_function', kind='function', range=(1, 18))] - - chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python') - - # Should be split into multiple chunks - assert len(chunks) > 1 - - # All chunks should have symbol metadata - for chunk in chunks: - assert chunk.metadata['strategy'] == 'symbol_split' - assert chunk.metadata['symbol_name'] == 'large_function' - assert chunk.metadata['symbol_kind'] == 'function' 
- assert chunk.metadata['parent_symbol_range'] == (1, 18) - - def test_boundary_condition(self): - """Test symbol exactly at max_chunk_size boundary.""" - config = ChunkConfig(max_chunk_size=90, overlap=20) - chunker = Chunker(config) - - content = '''def boundary_function(): - # This function is exactly at boundary - x = 1 - y = 2 - return x + y -''' - symbols = [Symbol(name='boundary_function', kind='function', range=(1, 5))] - - chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python') - - # Content is slightly over 90 chars, should be split - assert len(chunks) >= 1 - assert chunks[0].metadata['strategy'] == 'symbol_split' - - def test_multiple_symbols_mixed_sizes(self): - """Test chunking with multiple symbols of different sizes.""" - config = ChunkConfig(max_chunk_size=150, overlap=30) - chunker = Chunker(config) - - content = '''def small(): - return 1 - -def medium(): - # Medium function - x = 1 - y = 2 - z = 3 - return x + y + z - -def very_large(): - # Line 1 - # Line 2 - # Line 3 - # Line 4 - # Line 5 - # Line 6 - # Line 7 - # Line 8 - # Line 9 - # Line 10 - # Line 11 - # Line 12 - # Line 13 - # Line 14 - # Line 15 - pass -''' - symbols = [ - Symbol(name='small', kind='function', range=(1, 2)), - Symbol(name='medium', kind='function', range=(4, 9)), - Symbol(name='very_large', kind='function', range=(11, 28)), - ] - - chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python') - - # Find chunks for each symbol - small_chunks = [c for c in chunks if c.metadata['symbol_name'] == 'small'] - medium_chunks = [c for c in chunks if c.metadata['symbol_name'] == 'medium'] - large_chunks = [c for c in chunks if c.metadata['symbol_name'] == 'very_large'] - - # Small should be filtered (< min_chunk_size) - assert len(small_chunks) == 0 - - # Medium should not be split - assert len(medium_chunks) == 1 - assert medium_chunks[0].metadata['strategy'] == 'symbol' - - # Large should be split - assert len(large_chunks) > 1 - for chunk in 
large_chunks: - assert chunk.metadata['strategy'] == 'symbol_split' - - def test_line_numbers_preserved(self): - """Test that line numbers are correctly preserved in sub-chunks.""" - config = ChunkConfig(max_chunk_size=100, overlap=20) - chunker = Chunker(config) - - content = '''def large_function(): - # Line 1 with some extra content to make it longer - # Line 2 with some extra content to make it longer - # Line 3 with some extra content to make it longer - # Line 4 with some extra content to make it longer - # Line 5 with some extra content to make it longer - # Line 6 with some extra content to make it longer - # Line 7 with some extra content to make it longer - # Line 8 with some extra content to make it longer - # Line 9 with some extra content to make it longer - # Line 10 with some extra content to make it longer - pass -''' - symbols = [Symbol(name='large_function', kind='function', range=(1, 13))] - - chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python') - - # Verify line numbers are correct and sequential - assert len(chunks) > 1 - assert chunks[0].metadata['start_line'] == 1 - - # Each chunk should have valid line numbers - for chunk in chunks: - assert chunk.metadata['start_line'] >= 1 - assert chunk.metadata['end_line'] <= 13 - assert chunk.metadata['start_line'] <= chunk.metadata['end_line'] - - def test_overlap_in_split_chunks(self): - """Test that overlap is applied when splitting large symbols.""" - config = ChunkConfig(max_chunk_size=100, overlap=30) - chunker = Chunker(config) - - content = '''def large_function(): - # Line 1 - # Line 2 - # Line 3 - # Line 4 - # Line 5 - # Line 6 - # Line 7 - # Line 8 - # Line 9 - # Line 10 - # Line 11 - # Line 12 - pass -''' - symbols = [Symbol(name='large_function', kind='function', range=(1, 14))] - - chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python') - - # With overlap, consecutive chunks should overlap - if len(chunks) > 1: - for i in range(len(chunks) - 1): - # Next 
chunk should start before current chunk ends (overlap) - current_end = chunks[i].metadata['end_line'] - next_start = chunks[i + 1].metadata['start_line'] - # Overlap should exist - assert next_start <= current_end - - def test_empty_symbol_filtered(self): - """Test that symbols smaller than min_chunk_size are filtered.""" - config = ChunkConfig(max_chunk_size=1000, min_chunk_size=50) - chunker = Chunker(config) - - content = '''def tiny(): - pass -''' - symbols = [Symbol(name='tiny', kind='function', range=(1, 2))] - - chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python') - - # Should be filtered due to min_chunk_size - assert len(chunks) == 0 - - def test_class_symbol_splits(self): - """Test that large class symbols are also split correctly.""" - config = ChunkConfig(max_chunk_size=120, overlap=25) - chunker = Chunker(config) - - content = '''class LargeClass: - """A large class with many methods.""" - - def method1(self): - return 1 - - def method2(self): - return 2 - - def method3(self): - return 3 - - def method4(self): - return 4 -''' - symbols = [Symbol(name='LargeClass', kind='class', range=(1, 14))] - - chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python') - - # Should be split - assert len(chunks) > 1 - - # All chunks should preserve class metadata - for chunk in chunks: - assert chunk.metadata['symbol_name'] == 'LargeClass' - assert chunk.metadata['symbol_kind'] == 'class' - assert chunk.metadata['strategy'] == 'symbol_split' - - -class TestLightweightMode: - """Test recursive splitting with lightweight token counting.""" - - def test_large_symbol_splits_lightweight_mode(self): - """Test that large symbols split correctly in lightweight mode.""" - config = ChunkConfig(max_chunk_size=100, overlap=20, skip_token_count=True) - chunker = Chunker(config) - - content = '''def large_function(): - # Line 1 with some extra content to make it longer - # Line 2 with some extra content to make it longer - # Line 3 with some extra 
content to make it longer - # Line 4 with some extra content to make it longer - # Line 5 with some extra content to make it longer - # Line 6 with some extra content to make it longer - # Line 7 with some extra content to make it longer - # Line 8 with some extra content to make it longer - # Line 9 with some extra content to make it longer - # Line 10 with some extra content to make it longer - pass -''' - symbols = [Symbol(name='large_function', kind='function', range=(1, 13))] - - chunks = chunker.chunk_by_symbol(content, symbols, 'test.py', 'python') - - # Should split even in lightweight mode - assert len(chunks) > 1 - - # All chunks should have token_count (estimated) - for chunk in chunks: - assert 'token_count' in chunk.metadata - assert chunk.metadata['token_count'] > 0 diff --git a/codex-lens/tests/test_registry.py b/codex-lens/tests/test_registry.py deleted file mode 100644 index b610140a..00000000 --- a/codex-lens/tests/test_registry.py +++ /dev/null @@ -1,126 +0,0 @@ -"""Tests for RegistryStore path handling.""" - -from __future__ import annotations - -from pathlib import Path - -import pytest - -from codexlens.storage.registry import RegistryStore - - -def _swap_case(path: Path) -> str: - return str(path).swapcase() - - -def test_path_case_normalization_windows(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: - """On Windows, path comparisons should be case-insensitive.""" - import codexlens.storage.registry as registry - - monkeypatch.setattr(registry.platform, "system", lambda: "Windows") - - db_path = tmp_path / "registry.db" - source_root = tmp_path / "MyProject" - index_root = tmp_path / "indexes" - - with RegistryStore(db_path=db_path) as store: - store.register_project(source_root, index_root) - - result = store.find_by_source_path(_swap_case(source_root)) - assert result is not None - assert result["source_root"] == str(source_root.resolve()).lower() - - -def test_path_case_sensitivity_non_windows(tmp_path: Path, monkeypatch: 
pytest.MonkeyPatch) -> None: - """On Unix, path comparisons should remain case-sensitive.""" - import codexlens.storage.registry as registry - - monkeypatch.setattr(registry.platform, "system", lambda: "Linux") - - db_path = tmp_path / "registry.db" - source_root = tmp_path / "MyProject" - index_root = tmp_path / "indexes" - - with RegistryStore(db_path=db_path) as store: - store.register_project(source_root, index_root) - assert store.find_by_source_path(_swap_case(source_root)) is None - - -def test_find_nearest_index(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: - """Nearest ancestor lookup should be case-insensitive on Windows.""" - import codexlens.storage.registry as registry - - monkeypatch.setattr(registry.platform, "system", lambda: "Windows") - - db_path = tmp_path / "registry.db" - source_root = tmp_path / "MyProject" - index_root = tmp_path / "indexes" - index_db = index_root / "_index.db" - - with RegistryStore(db_path=db_path) as store: - project = store.register_project(source_root, index_root) - mapping = store.register_dir(project.id, source_root, index_db, depth=0) - - query_path = Path(_swap_case(source_root)) / "SubDir" / "file.py" - found = store.find_nearest_index(query_path) - - assert found is not None - assert found.id == mapping.id - - -def test_find_descendant_project_roots_returns_nested_project_roots(tmp_path: Path) -> None: - db_path = tmp_path / "registry.db" - workspace_root = tmp_path / "workspace" - child_a = workspace_root / "packages" / "app-a" - child_b = workspace_root / "tools" / "app-b" - outside_root = tmp_path / "external" - - with RegistryStore(db_path=db_path) as store: - workspace_project = store.register_project( - workspace_root, - tmp_path / "indexes" / "workspace", - ) - child_a_project = store.register_project( - child_a, - tmp_path / "indexes" / "workspace" / "packages" / "app-a", - ) - child_b_project = store.register_project( - child_b, - tmp_path / "indexes" / "workspace" / "tools" / "app-b", - ) - 
outside_project = store.register_project( - outside_root, - tmp_path / "indexes" / "external", - ) - - store.register_dir( - workspace_project.id, - workspace_root, - tmp_path / "indexes" / "workspace" / "_index.db", - depth=0, - ) - child_a_mapping = store.register_dir( - child_a_project.id, - child_a, - tmp_path / "indexes" / "workspace" / "packages" / "app-a" / "_index.db", - depth=0, - ) - child_b_mapping = store.register_dir( - child_b_project.id, - child_b, - tmp_path / "indexes" / "workspace" / "tools" / "app-b" / "_index.db", - depth=0, - ) - store.register_dir( - outside_project.id, - outside_root, - tmp_path / "indexes" / "external" / "_index.db", - depth=0, - ) - - descendants = store.find_descendant_project_roots(workspace_root) - - assert [mapping.index_path for mapping in descendants] == [ - child_a_mapping.index_path, - child_b_mapping.index_path, - ] diff --git a/codex-lens/tests/test_reranker_backends.py b/codex-lens/tests/test_reranker_backends.py deleted file mode 100644 index 439631ef..00000000 --- a/codex-lens/tests/test_reranker_backends.py +++ /dev/null @@ -1,115 +0,0 @@ -"""Mocked smoke tests for all reranker backends.""" - -from __future__ import annotations - -import sys -import types -from dataclasses import dataclass - -import pytest - - -def test_reranker_backend_legacy_scores_pairs(monkeypatch: pytest.MonkeyPatch) -> None: - from codexlens.semantic.reranker import legacy as legacy_module - - class DummyCrossEncoder: - def __init__(self, model_name: str, *, device: str | None = None) -> None: - self.model_name = model_name - self.device = device - self.calls: list[dict[str, object]] = [] - - def predict(self, pairs: list[tuple[str, str]], *, batch_size: int = 32) -> list[float]: - self.calls.append({"pairs": list(pairs), "batch_size": int(batch_size)}) - return [0.5 for _ in pairs] - - monkeypatch.setattr(legacy_module, "_CrossEncoder", DummyCrossEncoder) - monkeypatch.setattr(legacy_module, "CROSS_ENCODER_AVAILABLE", True) - 
monkeypatch.setattr(legacy_module, "_import_error", None) - - reranker = legacy_module.CrossEncoderReranker(model_name="dummy-model", device="cpu") - scores = reranker.score_pairs([("q", "d1"), ("q", "d2")], batch_size=0) - assert scores == pytest.approx([0.5, 0.5]) - - -def test_reranker_backend_onnx_availability_check(monkeypatch: pytest.MonkeyPatch) -> None: - from codexlens.semantic.reranker.onnx_reranker import check_onnx_reranker_available - - dummy_numpy = types.ModuleType("numpy") - dummy_onnxruntime = types.ModuleType("onnxruntime") - - dummy_optimum = types.ModuleType("optimum") - dummy_optimum.__path__ = [] # Mark as package for submodule imports. - dummy_optimum_ort = types.ModuleType("optimum.onnxruntime") - dummy_optimum_ort.ORTModelForSequenceClassification = object() - - dummy_transformers = types.ModuleType("transformers") - dummy_transformers.AutoTokenizer = object() - - monkeypatch.setitem(sys.modules, "numpy", dummy_numpy) - monkeypatch.setitem(sys.modules, "onnxruntime", dummy_onnxruntime) - monkeypatch.setitem(sys.modules, "optimum", dummy_optimum) - monkeypatch.setitem(sys.modules, "optimum.onnxruntime", dummy_optimum_ort) - monkeypatch.setitem(sys.modules, "transformers", dummy_transformers) - - ok, err = check_onnx_reranker_available() - assert ok is True - assert err is None - - -def test_reranker_backend_api_constructs_with_dummy_httpx(monkeypatch: pytest.MonkeyPatch) -> None: - from codexlens.semantic.reranker.api_reranker import APIReranker - - created: list[object] = [] - - class DummyClient: - def __init__( - self, - *, - base_url: str | None = None, - headers: dict[str, str] | None = None, - timeout: float | None = None, - ) -> None: - self.base_url = base_url - self.headers = headers or {} - self.timeout = timeout - self.closed = False - created.append(self) - - def close(self) -> None: - self.closed = True - - dummy_httpx = types.ModuleType("httpx") - dummy_httpx.Client = DummyClient - monkeypatch.setitem(sys.modules, "httpx", 
dummy_httpx) - - reranker = APIReranker(api_key="k", provider="siliconflow") - assert reranker.provider == "siliconflow" - assert len(created) == 1 - assert created[0].headers["Authorization"] == "Bearer k" - reranker.close() - assert created[0].closed is True - - -def test_reranker_backend_litellm_scores_pairs(monkeypatch: pytest.MonkeyPatch) -> None: - from codexlens.semantic.reranker.litellm_reranker import LiteLLMReranker - - @dataclass(frozen=True, slots=True) - class ChatMessage: - role: str - content: str - - class DummyLiteLLMClient: - def __init__(self, model: str = "default", **_kwargs: object) -> None: - self.model = model - - def chat(self, _messages: list[ChatMessage]) -> object: - return types.SimpleNamespace(content="0.5") - - dummy_litellm = types.ModuleType("ccw_litellm") - dummy_litellm.ChatMessage = ChatMessage - dummy_litellm.LiteLLMClient = DummyLiteLLMClient - monkeypatch.setitem(sys.modules, "ccw_litellm", dummy_litellm) - - reranker = LiteLLMReranker(model="dummy") - assert reranker.score_pairs([("q", "d")]) == pytest.approx([0.5]) - diff --git a/codex-lens/tests/test_reranker_factory.py b/codex-lens/tests/test_reranker_factory.py deleted file mode 100644 index 62647d1d..00000000 --- a/codex-lens/tests/test_reranker_factory.py +++ /dev/null @@ -1,401 +0,0 @@ -"""Tests for reranker factory and availability checks.""" - -from __future__ import annotations - -import builtins -import math -import sys -import types - -import pytest - -from codexlens.semantic.reranker import ( - BaseReranker, - ONNXReranker, - check_reranker_available, - get_reranker, -) -from codexlens.semantic.reranker import legacy as legacy_module - - -def test_public_imports_work() -> None: - from codexlens.semantic.reranker import BaseReranker as ImportedBaseReranker - from codexlens.semantic.reranker import get_reranker as imported_get_reranker - - assert ImportedBaseReranker is BaseReranker - assert imported_get_reranker is get_reranker - - -def 
test_base_reranker_is_abstract() -> None: - with pytest.raises(TypeError): - BaseReranker() # type: ignore[abstract] - - -def test_check_reranker_available_invalid_backend() -> None: - ok, err = check_reranker_available("nope") - assert ok is False - assert "Invalid reranker backend" in (err or "") - - -def test_get_reranker_invalid_backend_raises_value_error() -> None: - with pytest.raises(ValueError, match="Unknown backend"): - get_reranker("nope") - - -def test_get_reranker_legacy_missing_dependency_raises_import_error( - monkeypatch: pytest.MonkeyPatch, -) -> None: - monkeypatch.setattr(legacy_module, "CROSS_ENCODER_AVAILABLE", False) - monkeypatch.setattr(legacy_module, "_import_error", "missing sentence-transformers") - - with pytest.raises(ImportError, match="missing sentence-transformers"): - get_reranker(backend="legacy", model_name="dummy-model") - - -def test_get_reranker_legacy_returns_cross_encoder_reranker( - monkeypatch: pytest.MonkeyPatch, -) -> None: - class DummyCrossEncoder: - def __init__(self, model_name: str, *, device: str | None = None) -> None: - self.model_name = model_name - self.device = device - self.last_batch_size: int | None = None - - def predict(self, pairs: list[tuple[str, str]], *, batch_size: int = 32) -> list[float]: - self.last_batch_size = int(batch_size) - return [0.5 for _ in pairs] - - monkeypatch.setattr(legacy_module, "_CrossEncoder", DummyCrossEncoder) - monkeypatch.setattr(legacy_module, "CROSS_ENCODER_AVAILABLE", True) - monkeypatch.setattr(legacy_module, "_import_error", None) - - reranker = get_reranker(backend=" LEGACY ", model_name="dummy-model", device="cpu") - assert isinstance(reranker, legacy_module.CrossEncoderReranker) - - assert reranker.score_pairs([]) == [] - - scores = reranker.score_pairs([("q", "d1"), ("q", "d2")], batch_size=0) - assert scores == pytest.approx([0.5, 0.5]) - assert reranker._model is not None - assert reranker._model.last_batch_size == 32 - - -def 
test_check_reranker_available_onnx_missing_deps(monkeypatch: pytest.MonkeyPatch) -> None: - real_import = builtins.__import__ - - def fake_import(name: str, globals=None, locals=None, fromlist=(), level: int = 0): - if name == "onnxruntime": - raise ImportError("no onnxruntime") - return real_import(name, globals, locals, fromlist, level) - - monkeypatch.setattr(builtins, "__import__", fake_import) - - ok, err = check_reranker_available("onnx") - assert ok is False - assert "onnxruntime not available" in (err or "") - - -def test_check_reranker_available_onnx_deps_present(monkeypatch: pytest.MonkeyPatch) -> None: - dummy_onnxruntime = types.ModuleType("onnxruntime") - dummy_optimum = types.ModuleType("optimum") - dummy_optimum.__path__ = [] # Mark as package for submodule imports. - dummy_optimum_ort = types.ModuleType("optimum.onnxruntime") - dummy_optimum_ort.ORTModelForSequenceClassification = object() - - dummy_transformers = types.ModuleType("transformers") - dummy_transformers.AutoTokenizer = object() - - monkeypatch.setitem(sys.modules, "onnxruntime", dummy_onnxruntime) - monkeypatch.setitem(sys.modules, "optimum", dummy_optimum) - monkeypatch.setitem(sys.modules, "optimum.onnxruntime", dummy_optimum_ort) - monkeypatch.setitem(sys.modules, "transformers", dummy_transformers) - - ok, err = check_reranker_available("onnx") - assert ok is True - assert err is None - - -def test_check_reranker_available_litellm_missing_deps(monkeypatch: pytest.MonkeyPatch) -> None: - real_import = builtins.__import__ - - def fake_import(name: str, globals=None, locals=None, fromlist=(), level: int = 0): - if name == "ccw_litellm": - raise ImportError("no ccw-litellm") - return real_import(name, globals, locals, fromlist, level) - - monkeypatch.setattr(builtins, "__import__", fake_import) - - ok, err = check_reranker_available("litellm") - assert ok is False - assert "ccw-litellm not available" in (err or "") - - -def test_check_reranker_available_litellm_deps_present( - 
monkeypatch: pytest.MonkeyPatch, -) -> None: - dummy_litellm = types.ModuleType("ccw_litellm") - monkeypatch.setitem(sys.modules, "ccw_litellm", dummy_litellm) - - ok, err = check_reranker_available("litellm") - assert ok is True - assert err is None - - -def test_check_reranker_available_api_missing_deps(monkeypatch: pytest.MonkeyPatch) -> None: - real_import = builtins.__import__ - - def fake_import(name: str, globals=None, locals=None, fromlist=(), level: int = 0): - if name == "httpx": - raise ImportError("no httpx") - return real_import(name, globals, locals, fromlist, level) - - monkeypatch.setattr(builtins, "__import__", fake_import) - - ok, err = check_reranker_available("api") - assert ok is False - assert "httpx not available" in (err or "") - - -def test_check_reranker_available_api_deps_present(monkeypatch: pytest.MonkeyPatch) -> None: - dummy_httpx = types.ModuleType("httpx") - monkeypatch.setitem(sys.modules, "httpx", dummy_httpx) - - ok, err = check_reranker_available("api") - assert ok is True - assert err is None - - -def test_get_reranker_litellm_returns_litellm_reranker( - monkeypatch: pytest.MonkeyPatch, -) -> None: - from dataclasses import dataclass - - @dataclass(frozen=True, slots=True) - class ChatMessage: - role: str - content: str - - class DummyLiteLLMClient: - def __init__(self, model: str = "default", **kwargs) -> None: - self.model = model - self.kwargs = kwargs - - def chat(self, messages, **kwargs): - return types.SimpleNamespace(content="0.5") - - dummy_litellm = types.ModuleType("ccw_litellm") - dummy_litellm.ChatMessage = ChatMessage - dummy_litellm.LiteLLMClient = DummyLiteLLMClient - monkeypatch.setitem(sys.modules, "ccw_litellm", dummy_litellm) - - reranker = get_reranker(backend="litellm", model_name="dummy-model") - - from codexlens.semantic.reranker.litellm_reranker import LiteLLMReranker - - assert isinstance(reranker, LiteLLMReranker) - assert reranker.score_pairs([("q", "d")]) == pytest.approx([0.5]) - - -def 
test_get_reranker_onnx_raises_import_error_with_dependency_hint( - monkeypatch: pytest.MonkeyPatch, -) -> None: - real_import = builtins.__import__ - - def fake_import(name: str, globals=None, locals=None, fromlist=(), level: int = 0): - if name == "onnxruntime": - raise ImportError("no onnxruntime") - return real_import(name, globals, locals, fromlist, level) - - monkeypatch.setattr(builtins, "__import__", fake_import) - - with pytest.raises(ImportError) as exc: - get_reranker(backend="onnx", model_name="any") - - assert "onnxruntime" in str(exc.value) - - -def test_get_reranker_default_backend_is_onnx(monkeypatch: pytest.MonkeyPatch) -> None: - dummy_onnxruntime = types.ModuleType("onnxruntime") - dummy_optimum = types.ModuleType("optimum") - dummy_optimum.__path__ = [] # Mark as package for submodule imports. - dummy_optimum_ort = types.ModuleType("optimum.onnxruntime") - dummy_optimum_ort.ORTModelForSequenceClassification = object() - - dummy_transformers = types.ModuleType("transformers") - dummy_transformers.AutoTokenizer = object() - - monkeypatch.setitem(sys.modules, "onnxruntime", dummy_onnxruntime) - monkeypatch.setitem(sys.modules, "optimum", dummy_optimum) - monkeypatch.setitem(sys.modules, "optimum.onnxruntime", dummy_optimum_ort) - monkeypatch.setitem(sys.modules, "transformers", dummy_transformers) - - reranker = get_reranker() - assert isinstance(reranker, ONNXReranker) - - -def test_onnx_reranker_scores_pairs_with_sigmoid_normalization( - monkeypatch: pytest.MonkeyPatch, -) -> None: - import numpy as np - - dummy_onnxruntime = types.ModuleType("onnxruntime") - - dummy_optimum = types.ModuleType("optimum") - dummy_optimum.__path__ = [] # Mark as package for submodule imports. 
- dummy_optimum_ort = types.ModuleType("optimum.onnxruntime") - - class DummyModelOutput: - def __init__(self, logits: np.ndarray) -> None: - self.logits = logits - - class DummyModel: - input_names = ["input_ids", "attention_mask"] - - def __init__(self) -> None: - self.calls: list[int] = [] - self._next_logit = 0 - - def __call__(self, **inputs): - batch = int(inputs["input_ids"].shape[0]) - start = self._next_logit - self._next_logit += batch - self.calls.append(batch) - logits = np.arange(start, start + batch, dtype=np.float32).reshape(batch, 1) - return DummyModelOutput(logits=logits) - - class DummyORTModelForSequenceClassification: - @classmethod - def from_pretrained(cls, model_name: str, providers=None, **kwargs): - _ = model_name, providers, kwargs - return DummyModel() - - dummy_optimum_ort.ORTModelForSequenceClassification = DummyORTModelForSequenceClassification - - dummy_transformers = types.ModuleType("transformers") - - class DummyAutoTokenizer: - model_max_length = 512 - - @classmethod - def from_pretrained(cls, model_name: str, **kwargs): - _ = model_name, kwargs - return cls() - - def __call__(self, *, text, text_pair, return_tensors, **kwargs): - _ = text_pair, kwargs - assert return_tensors == "np" - batch = len(text) - # Include token_type_ids to ensure input filtering is exercised. 
- return { - "input_ids": np.zeros((batch, 4), dtype=np.int64), - "attention_mask": np.ones((batch, 4), dtype=np.int64), - "token_type_ids": np.zeros((batch, 4), dtype=np.int64), - } - - dummy_transformers.AutoTokenizer = DummyAutoTokenizer - - monkeypatch.setitem(sys.modules, "onnxruntime", dummy_onnxruntime) - monkeypatch.setitem(sys.modules, "optimum", dummy_optimum) - monkeypatch.setitem(sys.modules, "optimum.onnxruntime", dummy_optimum_ort) - monkeypatch.setitem(sys.modules, "transformers", dummy_transformers) - - reranker = get_reranker(backend="onnx", model_name="dummy-model", use_gpu=False) - assert isinstance(reranker, ONNXReranker) - assert reranker._model is None - - pairs = [("q", f"d{idx}") for idx in range(5)] - scores = reranker.score_pairs(pairs, batch_size=2) - - assert reranker._model is not None - assert reranker._model.calls == [2, 2, 1] - assert len(scores) == len(pairs) - assert all(0.0 <= s <= 1.0 for s in scores) - - expected = [1.0 / (1.0 + math.exp(-float(i))) for i in range(len(pairs))] - assert scores == pytest.approx(expected, rel=1e-6, abs=1e-6) - - -def test_onnx_reranker_splits_tuple_providers_into_provider_options( - monkeypatch: pytest.MonkeyPatch, -) -> None: - import numpy as np - - captured: dict[str, object] = {} - - dummy_onnxruntime = types.ModuleType("onnxruntime") - - dummy_optimum = types.ModuleType("optimum") - dummy_optimum.__path__ = [] - dummy_optimum_ort = types.ModuleType("optimum.onnxruntime") - - class DummyModelOutput: - def __init__(self, logits: np.ndarray) -> None: - self.logits = logits - - class DummyModel: - input_names = ["input_ids", "attention_mask"] - - def __call__(self, **inputs): - batch = int(inputs["input_ids"].shape[0]) - return DummyModelOutput(logits=np.zeros((batch, 1), dtype=np.float32)) - - class DummyORTModelForSequenceClassification: - @classmethod - def from_pretrained( - cls, - model_name: str, - providers=None, - provider_options=None, - **kwargs, - ): - captured["model_name"] = 
model_name - captured["providers"] = providers - captured["provider_options"] = provider_options - captured["kwargs"] = kwargs - return DummyModel() - - dummy_optimum_ort.ORTModelForSequenceClassification = DummyORTModelForSequenceClassification - - dummy_transformers = types.ModuleType("transformers") - - class DummyAutoTokenizer: - model_max_length = 512 - - @classmethod - def from_pretrained(cls, model_name: str, **kwargs): - _ = model_name, kwargs - return cls() - - def __call__(self, *, text, text_pair, return_tensors, **kwargs): - _ = text_pair, kwargs - assert return_tensors == "np" - batch = len(text) - return { - "input_ids": np.zeros((batch, 4), dtype=np.int64), - "attention_mask": np.ones((batch, 4), dtype=np.int64), - } - - dummy_transformers.AutoTokenizer = DummyAutoTokenizer - - monkeypatch.setitem(sys.modules, "onnxruntime", dummy_onnxruntime) - monkeypatch.setitem(sys.modules, "optimum", dummy_optimum) - monkeypatch.setitem(sys.modules, "optimum.onnxruntime", dummy_optimum_ort) - monkeypatch.setitem(sys.modules, "transformers", dummy_transformers) - - reranker = get_reranker( - backend="onnx", - model_name="dummy-model", - use_gpu=True, - providers=[ - ("DmlExecutionProvider", {"device_id": 1}), - "CPUExecutionProvider", - ], - ) - assert isinstance(reranker, ONNXReranker) - - scores = reranker.score_pairs([("q", "d")], batch_size=1) - - assert scores == pytest.approx([0.5]) - assert captured["model_name"] == "dummy-model" - assert captured["providers"] == ["DmlExecutionProvider", "CPUExecutionProvider"] - assert captured["provider_options"] == [{"device_id": 1}, {}] diff --git a/codex-lens/tests/test_result_grouping.py b/codex-lens/tests/test_result_grouping.py deleted file mode 100644 index ee2720d2..00000000 --- a/codex-lens/tests/test_result_grouping.py +++ /dev/null @@ -1,589 +0,0 @@ -"""Multi-level tests for search result grouping functionality. - -Tests cover: -1. Unit tests for group_similar_results function -2. Boundary condition tests -3. 
Integration tests with SearchOptions -4. Performance/stress tests -""" - -import pytest -from typing import List - -from codexlens.entities import SearchResult, AdditionalLocation -from codexlens.search.ranking import group_similar_results -from codexlens.search.chain_search import SearchOptions - - -# ============================================================================= -# Test Fixtures -# ============================================================================= - -@pytest.fixture -def sample_results() -> List[SearchResult]: - """Create sample search results for testing.""" - return [ - SearchResult(path="a.py", score=0.5, excerpt="def foo(): pass", start_line=10, symbol_name="foo"), - SearchResult(path="b.py", score=0.5, excerpt="def foo(): pass", start_line=20, symbol_name="foo"), - SearchResult(path="c.py", score=0.49, excerpt="def foo(): pass", start_line=30, symbol_name="foo"), - SearchResult(path="d.py", score=0.3, excerpt="def bar(): pass", start_line=40, symbol_name="bar"), - ] - - -@pytest.fixture -def results_with_different_excerpts() -> List[SearchResult]: - """Results with same scores but different content.""" - return [ - SearchResult(path="a.py", score=0.5, excerpt="def foo(): pass"), - SearchResult(path="b.py", score=0.5, excerpt="def bar(): pass"), - SearchResult(path="c.py", score=0.5, excerpt="def baz(): pass"), - ] - - -@pytest.fixture -def results_with_same_excerpt_different_scores() -> List[SearchResult]: - """Results with same content but very different scores.""" - return [ - SearchResult(path="a.py", score=0.9, excerpt="def foo(): pass"), - SearchResult(path="b.py", score=0.5, excerpt="def foo(): pass"), - SearchResult(path="c.py", score=0.1, excerpt="def foo(): pass"), - ] - - -# ============================================================================= -# Level 1: Unit Tests - Basic Functionality -# ============================================================================= - -class TestGroupSimilarResultsBasic: - 
"""Basic unit tests for group_similar_results function.""" - - def test_empty_results_returns_empty(self): - """Empty input should return empty output.""" - result = group_similar_results([]) - assert result == [] - - def test_single_result_returns_unchanged(self): - """Single result should be returned as-is.""" - single = SearchResult(path="test.py", score=0.5, excerpt="code") - result = group_similar_results([single]) - - assert len(result) == 1 - assert result[0].path == "test.py" - assert result[0].additional_locations == [] - - def test_groups_identical_excerpt_similar_score(self, sample_results): - """Results with same excerpt and similar scores should be grouped.""" - grouped = group_similar_results(sample_results, score_threshold_abs=0.02) - - # Should have 2 groups: foo group (a, b, c) and bar (d) - assert len(grouped) == 2 - - # First group should have additional locations - foo_group = next(r for r in grouped if r.excerpt == "def foo(): pass") - assert len(foo_group.additional_locations) == 2 - - # Second group (bar) should have no additional locations - bar_group = next(r for r in grouped if r.excerpt == "def bar(): pass") - assert len(bar_group.additional_locations) == 0 - - def test_preserves_highest_score_as_representative(self, sample_results): - """Representative result should have the highest score in group.""" - grouped = group_similar_results(sample_results, score_threshold_abs=0.02) - - foo_group = next(r for r in grouped if r.excerpt == "def foo(): pass") - # a.py has score 0.5, which is highest - assert foo_group.path == "a.py" - assert foo_group.score == 0.5 - - def test_additional_locations_contain_correct_info(self, sample_results): - """Additional locations should contain correct path, score, line info.""" - grouped = group_similar_results(sample_results, score_threshold_abs=0.02) - - foo_group = next(r for r in grouped if r.excerpt == "def foo(): pass") - locations = foo_group.additional_locations - - paths = {loc.path for loc in 
locations} - assert "b.py" in paths - assert "c.py" in paths - - # Check that start_line is preserved - for loc in locations: - if loc.path == "b.py": - assert loc.start_line == 20 - elif loc.path == "c.py": - assert loc.start_line == 30 - - -# ============================================================================= -# Level 2: Boundary Condition Tests -# ============================================================================= - -class TestGroupSimilarResultsBoundary: - """Boundary condition tests for edge cases.""" - - def test_threshold_zero_no_grouping(self): - """With threshold=0, only exactly equal scores should group.""" - results = [ - SearchResult(path="a.py", score=0.5, excerpt="def foo()"), - SearchResult(path="b.py", score=0.5, excerpt="def foo()"), - SearchResult(path="c.py", score=0.50001, excerpt="def foo()"), # Slightly different - ] - - grouped = group_similar_results(results, score_threshold_abs=0.0) - - # a and b should group (exact same score), c should be separate - assert len(grouped) == 2 - - main_group = next(r for r in grouped if len(r.additional_locations) > 0) - assert len(main_group.additional_locations) == 1 - - def test_threshold_exact_boundary(self): - """Test behavior at exact threshold boundary. - - Note: Due to floating-point precision, 0.5 - 0.49 = 0.010000000000000009 - which is slightly > 0.01, so they won't group with threshold=0.01. - Use a slightly larger threshold to account for floating-point precision. 
- """ - results = [ - SearchResult(path="a.py", score=0.5, excerpt="def foo()"), - SearchResult(path="b.py", score=0.49, excerpt="def foo()"), # 0.01 diff (floating-point) - SearchResult(path="c.py", score=0.48, excerpt="def foo()"), # 0.02 diff from a - ] - - # With threshold 0.011 (slightly above floating-point 0.01), a and b should group - grouped = group_similar_results(results, score_threshold_abs=0.011) - - # a groups with b, c is separate (0.02 from a, 0.01 from b) - # After a+b group, c is compared with remaining and forms its own group - assert len(grouped) == 2 - - # Verify a is representative (highest score) - main_group = next(r for r in grouped if r.score == 0.5) - assert main_group.path == "a.py" - assert len(main_group.additional_locations) == 1 - assert main_group.additional_locations[0].path == "b.py" - - def test_large_threshold_groups_all(self): - """Very large threshold should group all same-content results.""" - results = [ - SearchResult(path="a.py", score=0.9, excerpt="def foo()"), - SearchResult(path="b.py", score=0.1, excerpt="def foo()"), - ] - - grouped = group_similar_results(results, score_threshold_abs=1.0) - - assert len(grouped) == 1 - assert len(grouped[0].additional_locations) == 1 - - def test_none_excerpt_not_grouped(self): - """Results with None excerpt should not be grouped.""" - results = [ - SearchResult(path="a.py", score=0.5, excerpt=None), - SearchResult(path="b.py", score=0.5, excerpt=None), - ] - - grouped = group_similar_results(results) - - # None excerpts can't be grouped by content - assert len(grouped) == 2 - for r in grouped: - assert len(r.additional_locations) == 0 - - def test_empty_excerpt_not_grouped(self): - """Results with empty string excerpt should not be grouped.""" - results = [ - SearchResult(path="a.py", score=0.5, excerpt=""), - SearchResult(path="b.py", score=0.5, excerpt=""), - SearchResult(path="c.py", score=0.5, excerpt=" "), # Whitespace only - ] - - grouped = group_similar_results(results) - - # 
Empty/whitespace excerpts can't be grouped - assert len(grouped) == 3 - - def test_different_excerpts_not_grouped(self, results_with_different_excerpts): - """Results with different excerpts should not be grouped even with same score.""" - grouped = group_similar_results(results_with_different_excerpts, score_threshold_abs=1.0) - - # Different content = no grouping - assert len(grouped) == 3 - for r in grouped: - assert len(r.additional_locations) == 0 - - def test_same_excerpt_different_scores_creates_subgroups(self, results_with_same_excerpt_different_scores): - """Same content but very different scores should create separate subgroups.""" - grouped = group_similar_results( - results_with_same_excerpt_different_scores, - score_threshold_abs=0.1 - ) - - # Scores 0.9, 0.5, 0.1 with threshold 0.1 - # 0.9 and 0.5 differ by 0.4 > 0.1, so separate - # 0.5 and 0.1 differ by 0.4 > 0.1, so separate - assert len(grouped) == 3 - - -# ============================================================================= -# Level 3: Content Field Tests -# ============================================================================= - -class TestGroupSimilarResultsContentField: - """Tests for different content_field options.""" - - def test_group_by_content_field(self): - """Should be able to group by 'content' field instead of 'excerpt'.""" - results = [ - SearchResult(path="a.py", score=0.5, excerpt="short", content="full content here"), - SearchResult(path="b.py", score=0.5, excerpt="different", content="full content here"), - ] - - # Group by excerpt - different excerpts, no grouping - grouped_by_excerpt = group_similar_results(results, content_field="excerpt") - assert len(grouped_by_excerpt) == 2 - - # Group by content - same content, should group - grouped_by_content = group_similar_results(results, content_field="content") - assert len(grouped_by_content) == 1 - assert len(grouped_by_content[0].additional_locations) == 1 - - def test_fallback_when_content_field_missing(self): - 
"""Results without the specified content field should not be grouped.""" - results = [ - SearchResult(path="a.py", score=0.5, content=None), - SearchResult(path="b.py", score=0.5, content=None), - ] - - grouped = group_similar_results(results, content_field="content") - - # None content = ungroupable - assert len(grouped) == 2 - - -# ============================================================================= -# Level 4: Metadata and Ordering Tests -# ============================================================================= - -class TestGroupSimilarResultsMetadata: - """Tests for metadata handling and result ordering.""" - - def test_grouped_count_in_metadata(self, sample_results): - """Grouped results should have grouped_count in metadata.""" - grouped = group_similar_results(sample_results, score_threshold_abs=0.02) - - foo_group = next(r for r in grouped if r.excerpt == "def foo(): pass") - - assert "grouped_count" in foo_group.metadata - assert foo_group.metadata["grouped_count"] == 3 # a, b, c - - def test_preserves_original_metadata(self): - """Original metadata should be preserved in grouped result.""" - results = [ - SearchResult( - path="a.py", - score=0.5, - excerpt="def foo()", - metadata={"original_key": "original_value", "fusion_score": 0.5} - ), - SearchResult(path="b.py", score=0.5, excerpt="def foo()"), - ] - - grouped = group_similar_results(results, score_threshold_abs=0.1) - - assert grouped[0].metadata["original_key"] == "original_value" - assert grouped[0].metadata["fusion_score"] == 0.5 - - def test_results_sorted_by_score_descending(self): - """Final results should be sorted by score descending.""" - results = [ - SearchResult(path="low.py", score=0.1, excerpt="low"), - SearchResult(path="high.py", score=0.9, excerpt="high"), - SearchResult(path="mid.py", score=0.5, excerpt="mid"), - ] - - grouped = group_similar_results(results) - - scores = [r.score for r in grouped] - assert scores == sorted(scores, reverse=True) - assert scores == 
[0.9, 0.5, 0.1] - - -# ============================================================================= -# Level 5: Integration Tests with SearchOptions -# ============================================================================= - -class TestSearchOptionsGrouping: - """Integration tests for SearchOptions grouping configuration.""" - - def test_search_options_default_grouping_disabled(self): - """Default SearchOptions should have grouping disabled.""" - options = SearchOptions() - - assert options.group_results is False - assert options.grouping_threshold == 0.01 - - def test_search_options_enable_grouping(self): - """SearchOptions should allow enabling grouping.""" - options = SearchOptions(group_results=True) - - assert options.group_results is True - - def test_search_options_custom_threshold(self): - """SearchOptions should allow custom grouping threshold.""" - options = SearchOptions(group_results=True, grouping_threshold=0.05) - - assert options.grouping_threshold == 0.05 - - def test_search_options_all_parameters(self): - """SearchOptions should work with all parameters combined.""" - options = SearchOptions( - depth=3, - max_workers=4, - limit_per_dir=20, - total_limit=200, - include_symbols=True, - hybrid_mode=True, - group_results=True, - grouping_threshold=0.02, - ) - - assert options.depth == 3 - assert options.group_results is True - assert options.grouping_threshold == 0.02 - - -# ============================================================================= -# Level 6: AdditionalLocation Entity Tests -# ============================================================================= - -class TestAdditionalLocationEntity: - """Tests for AdditionalLocation entity model.""" - - def test_create_minimal_additional_location(self): - """Create AdditionalLocation with minimal required fields.""" - loc = AdditionalLocation(path="test.py", score=0.5) - - assert loc.path == "test.py" - assert loc.score == 0.5 - assert loc.start_line is None - assert loc.end_line 
is None - assert loc.symbol_name is None - - def test_create_full_additional_location(self): - """Create AdditionalLocation with all fields.""" - loc = AdditionalLocation( - path="test.py", - score=0.75, - start_line=10, - end_line=20, - symbol_name="my_function" - ) - - assert loc.path == "test.py" - assert loc.score == 0.75 - assert loc.start_line == 10 - assert loc.end_line == 20 - assert loc.symbol_name == "my_function" - - def test_additional_location_path_required(self): - """Path should be required for AdditionalLocation.""" - with pytest.raises(Exception): # ValidationError - AdditionalLocation(score=0.5) - - def test_additional_location_score_required(self): - """Score should be required for AdditionalLocation.""" - with pytest.raises(Exception): # ValidationError - AdditionalLocation(path="test.py") - - def test_additional_location_score_non_negative(self): - """Score should be non-negative.""" - with pytest.raises(Exception): # ValidationError - AdditionalLocation(path="test.py", score=-0.1) - - def test_additional_location_serialization(self): - """AdditionalLocation should serialize correctly.""" - loc = AdditionalLocation( - path="test.py", - score=0.5, - start_line=10, - symbol_name="func" - ) - - data = loc.model_dump() - - assert data["path"] == "test.py" - assert data["score"] == 0.5 - assert data["start_line"] == 10 - assert data["symbol_name"] == "func" - - -# ============================================================================= -# Level 7: SearchResult with AdditionalLocations Tests -# ============================================================================= - -class TestSearchResultWithAdditionalLocations: - """Tests for SearchResult entity with additional_locations field.""" - - def test_search_result_default_empty_locations(self): - """SearchResult should have empty additional_locations by default.""" - result = SearchResult(path="test.py", score=0.5) - - assert result.additional_locations == [] - - def 
test_search_result_with_additional_locations(self): - """SearchResult should accept additional_locations.""" - locations = [ - AdditionalLocation(path="other.py", score=0.4, start_line=5), - ] - - result = SearchResult( - path="main.py", - score=0.5, - additional_locations=locations - ) - - assert len(result.additional_locations) == 1 - assert result.additional_locations[0].path == "other.py" - - def test_search_result_serialization_with_locations(self): - """SearchResult with additional_locations should serialize correctly.""" - locations = [ - AdditionalLocation(path="loc1.py", score=0.4), - AdditionalLocation(path="loc2.py", score=0.3), - ] - - result = SearchResult( - path="main.py", - score=0.5, - excerpt="code", - additional_locations=locations - ) - - data = result.model_dump() - - assert len(data["additional_locations"]) == 2 - assert data["additional_locations"][0]["path"] == "loc1.py" - assert data["additional_locations"][1]["path"] == "loc2.py" - - -# ============================================================================= -# Level 8: Stress/Performance Tests -# ============================================================================= - -class TestGroupSimilarResultsPerformance: - """Performance and stress tests.""" - - def test_handles_large_result_set(self): - """Should handle large number of results efficiently.""" - # Create 1000 results with 100 different excerpts - results = [] - for i in range(1000): - excerpt_id = i % 100 - results.append(SearchResult( - path=f"file_{i}.py", - score=0.5 + (i % 10) * 0.01, # Scores vary slightly - excerpt=f"def func_{excerpt_id}(): pass", - start_line=i, - )) - - grouped = group_similar_results(results, score_threshold_abs=0.05) - - # Should reduce to approximately 100 groups (one per excerpt) - # with some variation due to score subgrouping - assert len(grouped) <= 200 - assert len(grouped) >= 50 # At least some grouping happened - - def test_handles_all_identical_results(self): - """Should handle case 
where all results are identical.""" - results = [ - SearchResult(path=f"file_{i}.py", score=0.5, excerpt="same code") - for i in range(100) - ] - - grouped = group_similar_results(results, score_threshold_abs=0.01) - - # All should be grouped into one - assert len(grouped) == 1 - assert len(grouped[0].additional_locations) == 99 - - def test_handles_all_unique_results(self): - """Should handle case where all results are unique.""" - results = [ - SearchResult(path=f"file_{i}.py", score=0.5, excerpt=f"unique_{i}") - for i in range(100) - ] - - grouped = group_similar_results(results, score_threshold_abs=0.01) - - # None should be grouped - assert len(grouped) == 100 - for r in grouped: - assert len(r.additional_locations) == 0 - - -# ============================================================================= -# Level 9: Real-world Scenario Tests -# ============================================================================= - -class TestGroupSimilarResultsRealWorld: - """Tests simulating real-world usage scenarios.""" - - def test_rrf_fusion_scores_grouping(self): - """Test with typical RRF fusion score ranges (0.001 - 0.02).""" - results = [ - SearchResult(path="auth/login.py", score=0.0164, excerpt="def authenticate():"), - SearchResult(path="auth/oauth.py", score=0.0163, excerpt="def authenticate():"), - SearchResult(path="auth/basic.py", score=0.0162, excerpt="def authenticate():"), - SearchResult(path="utils/helper.py", score=0.0082, excerpt="def helper():"), - ] - - # RRF scores are typically very small, use appropriate threshold - grouped = group_similar_results(results, score_threshold_abs=0.001) - - assert len(grouped) == 2 - - auth_group = next(r for r in grouped if "auth" in r.path) - assert len(auth_group.additional_locations) == 2 - - def test_duplicate_code_detection(self): - """Simulate detecting duplicate code across files.""" - duplicate_code = """ -def calculate_total(items): - return sum(item.price for item in items) -""" - results = [ - 
SearchResult(path="orders/service.py", score=0.5, excerpt=duplicate_code, start_line=45), - SearchResult(path="cart/calculator.py", score=0.5, excerpt=duplicate_code, start_line=12), - SearchResult(path="invoices/generator.py", score=0.5, excerpt=duplicate_code, start_line=78), - ] - - grouped = group_similar_results(results, score_threshold_abs=0.01) - - # All duplicates should be grouped - assert len(grouped) == 1 - assert len(grouped[0].additional_locations) == 2 - - # Can identify all locations - all_paths = {grouped[0].path} | {loc.path for loc in grouped[0].additional_locations} - assert all_paths == {"orders/service.py", "cart/calculator.py", "invoices/generator.py"} - - def test_mixed_relevance_results(self): - """Test with mixed relevance results typical of code search.""" - results = [ - # High relevance group - exact match - SearchResult(path="core.py", score=0.9, excerpt="def process():"), - SearchResult(path="core_v2.py", score=0.89, excerpt="def process():"), - # Medium relevance - partial match - SearchResult(path="utils.py", score=0.5, excerpt="def process_data():"), - # Low relevance - tangential - SearchResult(path="test.py", score=0.2, excerpt="def test_process():"), - ] - - grouped = group_similar_results(results, score_threshold_abs=0.02) - - # core.py and core_v2.py should group (same excerpt, similar score) - # Others should remain separate (different excerpts) - assert len(grouped) == 3 - - high_rel = next(r for r in grouped if r.score >= 0.89) - assert len(high_rel.additional_locations) == 1 diff --git a/codex-lens/tests/test_rrf_fusion.py b/codex-lens/tests/test_rrf_fusion.py deleted file mode 100644 index 762d4b54..00000000 --- a/codex-lens/tests/test_rrf_fusion.py +++ /dev/null @@ -1,584 +0,0 @@ -"""Tests for Reciprocal Rank Fusion (RRF) algorithm (P2). - -Tests RRF fusion logic, score computation, weight handling, and result ranking. 
-""" - -import math - -import pytest - -from codexlens.entities import SearchResult -from codexlens.search.ranking import ( - apply_symbol_boost, - QueryIntent, - detect_query_intent, - normalize_bm25_score, - normalize_weights, - reciprocal_rank_fusion, - rerank_results, - tag_search_source, -) - - -class TestReciprocalRankFusion: - """Tests for reciprocal_rank_fusion function.""" - - def test_single_source_ranking(self): - """Test RRF with single source returns ranked results.""" - results = [ - SearchResult(path="a.py", score=10.0, excerpt="..."), - SearchResult(path="b.py", score=8.0, excerpt="..."), - SearchResult(path="c.py", score=6.0, excerpt="..."), - ] - results_map = {"exact": results} - - fused = reciprocal_rank_fusion(results_map) - - assert len(fused) == 3 - # Order should be preserved (highest original score first) - assert fused[0].path == "a.py" - assert fused[1].path == "b.py" - assert fused[2].path == "c.py" - - def test_two_sources_fusion(self): - """Test RRF combines rankings from two sources.""" - exact_results = [ - SearchResult(path="a.py", score=10.0, excerpt="..."), - SearchResult(path="b.py", score=8.0, excerpt="..."), - SearchResult(path="c.py", score=6.0, excerpt="..."), - ] - fuzzy_results = [ - SearchResult(path="b.py", score=9.0, excerpt="..."), - SearchResult(path="c.py", score=7.0, excerpt="..."), - SearchResult(path="d.py", score=5.0, excerpt="..."), - ] - results_map = {"exact": exact_results, "fuzzy": fuzzy_results} - - fused = reciprocal_rank_fusion(results_map) - - # Should have all unique paths - paths = [r.path for r in fused] - assert set(paths) == {"a.py", "b.py", "c.py", "d.py"} - - # Results appearing in both should rank higher - # b.py and c.py appear in both sources - assert fused[0].path in ["b.py", "c.py"], "Items in both sources should rank highest" - - def test_rrf_score_calculation(self): - """Test RRF scores are calculated correctly with default k=60.""" - # Simple scenario: single source - results = 
[SearchResult(path="a.py", score=10.0, excerpt="...")] - results_map = {"exact": results} - - fused = reciprocal_rank_fusion(results_map, k=60) - - # RRF score = weight / (k + rank) = 1.0 / (60 + 1) ≈ 0.0164 - expected_score = 1.0 / 61 - assert abs(fused[0].score - expected_score) < 0.001 - - def test_custom_weights(self): - """Test custom weights affect RRF scores.""" - results_a = [SearchResult(path="a.py", score=10.0, excerpt="...")] - results_b = [SearchResult(path="a.py", score=10.0, excerpt="...")] - - results_map = {"exact": results_a, "fuzzy": results_b} - - # Higher weight for exact - weights = {"exact": 0.7, "fuzzy": 0.3} - fused = reciprocal_rank_fusion(results_map, weights=weights, k=60) - - # Score should be: 0.7/(60+1) + 0.3/(60+1) = 1.0/61 ≈ 0.0164 - expected_score = (0.7 + 0.3) / 61 - assert abs(fused[0].score - expected_score) < 0.001 - - def test_weight_normalization(self): - """Test weights are normalized to sum to 1.0.""" - results = [SearchResult(path="a.py", score=10.0, excerpt="...")] - results_map = {"exact": results} - - # Weights not summing to 1.0 - weights = {"exact": 2.0} # Will be normalized to 1.0 - fused = reciprocal_rank_fusion(results_map, weights=weights) - - # Should work without error and produce normalized scores - assert len(fused) == 1 - assert fused[0].score > 0 - - def test_empty_results_map(self): - """Test RRF with empty results returns empty list.""" - fused = reciprocal_rank_fusion({}) - assert fused == [] - - def test_zero_weight_source_ignored(self): - """Test sources with zero weight are ignored.""" - results_a = [SearchResult(path="a.py", score=10.0, excerpt="...")] - results_b = [SearchResult(path="b.py", score=10.0, excerpt="...")] - - results_map = {"exact": results_a, "fuzzy": results_b} - weights = {"exact": 1.0, "fuzzy": 0.0} # Ignore fuzzy - - fused = reciprocal_rank_fusion(results_map, weights=weights) - - # Should only have result from exact source - assert len(fused) == 1 - assert fused[0].path == "a.py" - 
- def test_fusion_score_in_metadata(self): - """Test fusion score is stored in result metadata.""" - results = [SearchResult(path="a.py", score=10.0, excerpt="...")] - results_map = {"exact": results} - - fused = reciprocal_rank_fusion(results_map) - - # Check metadata - assert "fusion_score" in fused[0].metadata - assert "original_score" in fused[0].metadata - assert fused[0].metadata["original_score"] == 10.0 - - def test_rank_order_matters(self): - """Test rank position affects RRF score (lower rank = higher score).""" - results = [ - SearchResult(path="a.py", score=10.0, excerpt="..."), # rank 1 - SearchResult(path="b.py", score=8.0, excerpt="..."), # rank 2 - SearchResult(path="c.py", score=6.0, excerpt="..."), # rank 3 - ] - results_map = {"exact": results} - - fused = reciprocal_rank_fusion(results_map, k=60) - - # a.py (rank 1): score = 1/(60+1) ≈ 0.0164 - # b.py (rank 2): score = 1/(60+2) ≈ 0.0161 - # c.py (rank 3): score = 1/(60+3) ≈ 0.0159 - assert fused[0].score > fused[1].score > fused[2].score - - -class TestRRFSyntheticRankings: - """Tests with synthetic rankings to verify RRF correctness.""" - - def test_perfect_agreement(self): - """Test RRF when all sources rank items identically.""" - # All sources rank a > b > c - exact = [ - SearchResult(path="a.py", score=10.0, excerpt="..."), - SearchResult(path="b.py", score=8.0, excerpt="..."), - SearchResult(path="c.py", score=6.0, excerpt="..."), - ] - fuzzy = [ - SearchResult(path="a.py", score=9.0, excerpt="..."), - SearchResult(path="b.py", score=7.0, excerpt="..."), - SearchResult(path="c.py", score=5.0, excerpt="..."), - ] - - results_map = {"exact": exact, "fuzzy": fuzzy} - fused = reciprocal_rank_fusion(results_map) - - # Order should match both sources - assert fused[0].path == "a.py" - assert fused[1].path == "b.py" - assert fused[2].path == "c.py" - - def test_complete_disagreement(self): - """Test RRF when sources have opposite rankings.""" - # exact: a > b > c - # fuzzy: c > b > a - exact = [ 
- SearchResult(path="a.py", score=10.0, excerpt="..."), - SearchResult(path="b.py", score=8.0, excerpt="..."), - SearchResult(path="c.py", score=6.0, excerpt="..."), - ] - fuzzy = [ - SearchResult(path="c.py", score=9.0, excerpt="..."), - SearchResult(path="b.py", score=7.0, excerpt="..."), - SearchResult(path="a.py", score=5.0, excerpt="..."), - ] - - results_map = {"exact": exact, "fuzzy": fuzzy} - fused = reciprocal_rank_fusion(results_map) - - # With opposite rankings, a.py and c.py get equal RRF scores: - # a.py: 0.5/(60+1) + 0.5/(60+3) = 0.01613 - # c.py: 0.5/(60+3) + 0.5/(60+1) = 0.01613 (same!) - # b.py: 0.5/(60+2) + 0.5/(60+2) = 0.01613 (slightly lower due to rounding) - # So top result should be a.py or c.py (tied) - assert fused[0].path in ["a.py", "c.py"], "Items with symmetric ranks should tie for first" - - def test_partial_overlap(self): - """Test RRF with partial overlap between sources.""" - # exact: [A, B, C] - # fuzzy: [B, C, D] - exact = [ - SearchResult(path="A", score=10.0, excerpt="..."), - SearchResult(path="B", score=8.0, excerpt="..."), - SearchResult(path="C", score=6.0, excerpt="..."), - ] - fuzzy = [ - SearchResult(path="B", score=9.0, excerpt="..."), - SearchResult(path="C", score=7.0, excerpt="..."), - SearchResult(path="D", score=5.0, excerpt="..."), - ] - - results_map = {"exact": exact, "fuzzy": fuzzy} - fused = reciprocal_rank_fusion(results_map) - - # B and C appear in both, should rank higher than A and D - paths = [r.path for r in fused] - b_idx = paths.index("B") - c_idx = paths.index("C") - a_idx = paths.index("A") - d_idx = paths.index("D") - - assert b_idx < a_idx, "B (in both) should outrank A (in one)" - assert c_idx < d_idx, "C (in both) should outrank D (in one)" - - def test_three_sources(self): - """Test RRF with three sources (exact, fuzzy, vector).""" - exact = [SearchResult(path="a.py", score=10.0, excerpt="...")] - fuzzy = [SearchResult(path="b.py", score=9.0, excerpt="...")] - vector = [SearchResult(path="c.py", 
score=8.0, excerpt="...")] - - results_map = {"exact": exact, "fuzzy": fuzzy, "vector": vector} - weights = {"exact": 0.3, "fuzzy": 0.1, "vector": 0.6} - - fused = reciprocal_rank_fusion(results_map, weights=weights) - - assert len(fused) == 3 - # Each appears in one source only, so scores differ by weights - # c.py: 0.6/61 ≈ 0.0098 (vector, highest weight) - # a.py: 0.3/61 ≈ 0.0049 (exact) - # b.py: 0.1/61 ≈ 0.0016 (fuzzy) - assert fused[0].path == "c.py", "Vector (higher weight) should rank first" - - -class TestNormalizeBM25Score: - """Tests for normalize_bm25_score function.""" - - def test_negative_bm25_normalization(self): - """Test BM25 scores (negative) are normalized to 0-1 range.""" - # SQLite FTS5 returns negative BM25 scores - scores = [-20.0, -10.0, -5.0, -1.0, 0.0] - - for score in scores: - normalized = normalize_bm25_score(score) - assert 0.0 <= normalized <= 1.0, f"Normalized score {normalized} out of range" - - def test_better_match_higher_score(self): - """Test more negative BM25 (better match) gives higher normalized score.""" - good_match = -15.0 - weak_match = -2.0 - - norm_good = normalize_bm25_score(good_match) - norm_weak = normalize_bm25_score(weak_match) - - assert norm_good > norm_weak, "Better match should have higher normalized score" - - def test_zero_score(self): - """Test zero BM25 score normalization.""" - normalized = normalize_bm25_score(0.0) - assert 0.0 <= normalized <= 1.0 - - def test_positive_score_handling(self): - """Test positive scores (edge case) are handled.""" - normalized = normalize_bm25_score(5.0) - # Should still be in valid range - assert 0.0 <= normalized <= 1.0 - - -class TestNormalizeWeights: - """Tests for normalize_weights function.""" - - def test_normalize_weights_with_nan(self): - """NaN total returns unchanged weights without division.""" - weights = {"exact": float("nan"), "fuzzy": None} - - normalized = normalize_weights(weights) - - assert normalized is not weights - assert set(normalized.keys()) == 
set(weights.keys()) - assert math.isnan(normalized["exact"]) - assert normalized["fuzzy"] is None - - def test_normalize_weights_with_infinity(self): - """Infinity total returns unchanged weights without division.""" - weights = {"exact": float("inf"), "fuzzy": None} - - normalized = normalize_weights(weights) - - assert normalized is not weights - assert normalized == weights - - def test_normalize_weights_with_all_none(self): - """All-None weights return unchanged weights without division.""" - weights = {"exact": None, "fuzzy": None} - - normalized = normalize_weights(weights) - - assert normalized is not weights - assert normalized == weights - - def test_normalize_weights_with_zero_total(self): - """Zero total returns unchanged weights without division.""" - weights = {"exact": 0.0, "fuzzy": 0.0} - - normalized = normalize_weights(weights) - - assert normalized is not weights - assert normalized == weights - - def test_normalize_weights_with_negative_total(self): - """Negative total returns unchanged weights without division.""" - weights = {"exact": -1.0, "fuzzy": -0.5} - - normalized = normalize_weights(weights) - - assert normalized is not weights - assert normalized == weights - - def test_normalize_weights_valid_total_normalizes(self): - """Valid finite positive total performs normalization correctly.""" - weights = {"exact": 2.0, "fuzzy": 1.0} - - normalized = normalize_weights(weights) - - assert normalized is not weights - assert normalized["exact"] == pytest.approx(2.0 / 3.0) - assert normalized["fuzzy"] == pytest.approx(1.0 / 3.0) - assert (normalized["exact"] + normalized["fuzzy"]) == pytest.approx(1.0) - - -class TestTagSearchSource: - """Tests for tag_search_source function.""" - - def test_tagging_adds_source_metadata(self): - """Test tagging adds search_source to metadata.""" - results = [ - SearchResult(path="a.py", score=10.0, excerpt="..."), - SearchResult(path="b.py", score=8.0, excerpt="..."), - ] - - tagged = tag_search_source(results, 
"exact") - - for result in tagged: - assert "search_source" in result.metadata - assert result.metadata["search_source"] == "exact" - - def test_tagging_preserves_existing_metadata(self): - """Test tagging preserves existing metadata fields.""" - results = [ - SearchResult( - path="a.py", - score=10.0, - excerpt="...", - metadata={"custom_field": "value"} - ), - ] - - tagged = tag_search_source(results, "fuzzy") - - assert "custom_field" in tagged[0].metadata - assert tagged[0].metadata["custom_field"] == "value" - assert "search_source" in tagged[0].metadata - assert tagged[0].metadata["search_source"] == "fuzzy" - - def test_tagging_empty_list(self): - """Test tagging empty list returns empty list.""" - tagged = tag_search_source([], "exact") - assert tagged == [] - - def test_tagging_preserves_result_fields(self): - """Test tagging preserves all SearchResult fields.""" - results = [ - SearchResult( - path="a.py", - score=10.0, - excerpt="test excerpt", - content="full content", - start_line=10, - end_line=20, - symbol_name="test_func", - symbol_kind="function" - ), - ] - - tagged = tag_search_source(results, "exact") - - assert tagged[0].path == "a.py" - assert tagged[0].score == 10.0 - assert tagged[0].excerpt == "test excerpt" - assert tagged[0].content == "full content" - assert tagged[0].start_line == 10 - assert tagged[0].end_line == 20 - assert tagged[0].symbol_name == "test_func" - assert tagged[0].symbol_kind == "function" - - -class TestSymbolBoost: - """Tests for apply_symbol_boost function.""" - - def test_symbol_boost(self): - results = [ - SearchResult(path="a.py", score=0.2, excerpt="...", symbol_name="foo"), - SearchResult(path="b.py", score=0.21, excerpt="..."), - ] - - boosted = apply_symbol_boost(results, boost_factor=1.5) - - assert boosted[0].path == "a.py" - assert boosted[0].score == pytest.approx(0.2 * 1.5) - assert boosted[0].metadata["boosted"] is True - assert boosted[0].metadata["original_fusion_score"] == pytest.approx(0.2) - - assert 
boosted[1].path == "b.py" - assert boosted[1].score == pytest.approx(0.21) - assert "boosted" not in boosted[1].metadata - - -class TestEmbeddingReranking: - """Tests for rerank_results embedding-based similarity.""" - - def test_rerank_embedding_similarity(self): - class DummyEmbedder: - def embed(self, texts): - if isinstance(texts, str): - texts = [texts] - mapping = { - "query": [1.0, 0.0], - "doc1": [1.0, 0.0], - "doc2": [0.0, 1.0], - } - return [mapping[t] for t in texts] - - results = [ - SearchResult(path="a.py", score=0.2, excerpt="doc1"), - SearchResult(path="b.py", score=0.9, excerpt="doc2"), - ] - - reranked = rerank_results("query", results, DummyEmbedder(), top_k=2) - - assert reranked[0].path == "a.py" - assert reranked[0].metadata["reranked"] is True - assert reranked[0].metadata["rrf_score"] == pytest.approx(0.2) - assert reranked[0].metadata["cosine_similarity"] == pytest.approx(1.0) - assert reranked[0].score == pytest.approx(0.5 * 0.2 + 0.5 * 1.0) - - assert reranked[1].path == "b.py" - assert reranked[1].metadata["reranked"] is True - assert reranked[1].metadata["rrf_score"] == pytest.approx(0.9) - assert reranked[1].metadata["cosine_similarity"] == pytest.approx(0.0) - assert reranked[1].score == pytest.approx(0.5 * 0.9 + 0.5 * 0.0) - - -@pytest.mark.parametrize("k_value", [30, 60, 100]) -class TestRRFParameterized: - """Parameterized tests for RRF with different k values.""" - - def test_k_value_affects_scores(self, k_value): - """Test k parameter affects RRF score magnitude.""" - results = [SearchResult(path="a.py", score=10.0, excerpt="...")] - results_map = {"exact": results} - - fused = reciprocal_rank_fusion(results_map, k=k_value) - - # Score should be 1.0 / (k + 1) - expected = 1.0 / (k_value + 1) - assert abs(fused[0].score - expected) < 0.001 - - -class TestRRFEdgeCases: - """Edge case tests for RRF.""" - - def test_duplicate_paths_in_same_source(self): - """Test handling of duplicate paths in single source.""" - results = [ - 
SearchResult(path="a.py", score=10.0, excerpt="..."), - SearchResult(path="a.py", score=8.0, excerpt="..."), # Duplicate - ] - results_map = {"exact": results} - - fused = reciprocal_rank_fusion(results_map) - - # Should deduplicate (first occurrence wins) - assert len(fused) == 1 - assert fused[0].path == "a.py" - - def test_very_large_result_lists(self): - """Test RRF handles large result sets efficiently.""" - # Create 1000 results - results = [ - SearchResult(path=f"file{i}.py", score=1000-i, excerpt="...") - for i in range(1000) - ] - results_map = {"exact": results} - - fused = reciprocal_rank_fusion(results_map) - - assert len(fused) == 1000 - # Should maintain ranking - assert fused[0].path == "file0.py" - assert fused[-1].path == "file999.py" - - def test_all_same_score(self): - """Test RRF when all results have same original score.""" - results = [ - SearchResult(path="a.py", score=10.0, excerpt="..."), - SearchResult(path="b.py", score=10.0, excerpt="..."), - SearchResult(path="c.py", score=10.0, excerpt="..."), - ] - results_map = {"exact": results} - - fused = reciprocal_rank_fusion(results_map) - - # Should still rank by position (rank matters) - assert len(fused) == 3 - assert fused[0].score > fused[1].score > fused[2].score - - def test_missing_weight_for_source(self): - """Test missing weight for source uses default.""" - results = [SearchResult(path="a.py", score=10.0, excerpt="...")] - results_map = {"exact": results, "fuzzy": results} - - # Only provide weight for exact - weights = {"exact": 1.0} - - fused = reciprocal_rank_fusion(results_map, weights=weights) - - # Should work with normalization - assert len(fused) == 1 # Deduplicated - assert fused[0].score > 0 - - -class TestSymbolBoostAndIntentV1: - """Tests for symbol boosting and query intent detection (v1.0).""" - - def test_symbol_boost_application(self): - """Results with symbol_name receive a multiplicative boost (default 1.5x).""" - results = [ - SearchResult(path="a.py", score=0.4, 
excerpt="...", symbol_name="AuthManager"), - SearchResult(path="b.py", score=0.41, excerpt="..."), - ] - - boosted = apply_symbol_boost(results, boost_factor=1.5) - - assert boosted[0].score == pytest.approx(0.4 * 1.5) - assert boosted[0].metadata["boosted"] is True - assert boosted[0].metadata["original_fusion_score"] == pytest.approx(0.4) - assert boosted[1].score == pytest.approx(0.41) - assert "boosted" not in boosted[1].metadata - - @pytest.mark.parametrize( - ("query", "expected"), - [ - ("def authenticate", QueryIntent.KEYWORD), - ("MyClass", QueryIntent.KEYWORD), - ("user_id", QueryIntent.KEYWORD), - ("UserService::authenticate", QueryIntent.KEYWORD), - ("ptr->next", QueryIntent.KEYWORD), - ("how to handle user login", QueryIntent.SEMANTIC), - ("what is authentication?", QueryIntent.SEMANTIC), - ("where is this used?", QueryIntent.SEMANTIC), - ("why does FooBar crash?", QueryIntent.MIXED), - ("how to use user_id in query", QueryIntent.MIXED), - ], - ) - def test_query_intent_detection(self, query, expected): - """Detect intent for representative queries (Python/TypeScript parity).""" - assert detect_query_intent(query) == expected diff --git a/codex-lens/tests/test_schema_cleanup_migration.py b/codex-lens/tests/test_schema_cleanup_migration.py deleted file mode 100644 index e7848f33..00000000 --- a/codex-lens/tests/test_schema_cleanup_migration.py +++ /dev/null @@ -1,308 +0,0 @@ -""" -Test migration 005: Schema cleanup for unused/redundant fields. - -Tests that migration 005 successfully removes: -1. semantic_metadata.keywords (replaced by file_keywords) -2. symbols.token_count (unused) -3. symbols.symbol_type (redundant with kind) -4. 
subdirs.direct_files (unused) -""" - -import sqlite3 -import tempfile -from pathlib import Path - -import pytest - -from codexlens.storage.dir_index import DirIndexStore -from codexlens.entities import Symbol - - -class TestSchemaCleanupMigration: - """Test schema cleanup migration (v4 -> latest).""" - - def test_migration_from_v4_to_v5(self): - """Test that migration successfully removes deprecated fields.""" - with tempfile.TemporaryDirectory() as tmpdir: - db_path = Path(tmpdir) / "_index.db" - store = DirIndexStore(db_path) - - # Create v4 schema manually (with deprecated fields) - conn = sqlite3.connect(db_path) - conn.row_factory = sqlite3.Row - cursor = conn.cursor() - - # Set schema version to 4 - cursor.execute("PRAGMA user_version = 4") - - # Create v4 schema with deprecated fields - cursor.execute(""" - CREATE TABLE files ( - id INTEGER PRIMARY KEY, - name TEXT NOT NULL, - full_path TEXT UNIQUE NOT NULL, - language TEXT, - content TEXT, - mtime REAL, - line_count INTEGER - ) - """) - - cursor.execute(""" - CREATE TABLE subdirs ( - id INTEGER PRIMARY KEY, - name TEXT NOT NULL UNIQUE, - index_path TEXT NOT NULL, - files_count INTEGER DEFAULT 0, - direct_files INTEGER DEFAULT 0, - last_updated REAL - ) - """) - - cursor.execute(""" - CREATE TABLE symbols ( - id INTEGER PRIMARY KEY, - file_id INTEGER REFERENCES files(id) ON DELETE CASCADE, - name TEXT NOT NULL, - kind TEXT NOT NULL, - start_line INTEGER, - end_line INTEGER, - token_count INTEGER, - symbol_type TEXT - ) - """) - - cursor.execute(""" - CREATE TABLE semantic_metadata ( - id INTEGER PRIMARY KEY, - file_id INTEGER UNIQUE REFERENCES files(id) ON DELETE CASCADE, - summary TEXT, - keywords TEXT, - purpose TEXT, - llm_tool TEXT, - generated_at REAL - ) - """) - - cursor.execute(""" - CREATE TABLE keywords ( - id INTEGER PRIMARY KEY, - keyword TEXT NOT NULL UNIQUE - ) - """) - - cursor.execute(""" - CREATE TABLE file_keywords ( - file_id INTEGER NOT NULL, - keyword_id INTEGER NOT NULL, - PRIMARY KEY 
(file_id, keyword_id), - FOREIGN KEY (file_id) REFERENCES files (id) ON DELETE CASCADE, - FOREIGN KEY (keyword_id) REFERENCES keywords (id) ON DELETE CASCADE - ) - """) - - # Insert test data - cursor.execute( - "INSERT INTO files (name, full_path, language, content, mtime, line_count) VALUES (?, ?, ?, ?, ?, ?)", - ("test.py", "/test/test.py", "python", "def test(): pass", 1234567890.0, 1) - ) - file_id = cursor.lastrowid - - cursor.execute( - "INSERT INTO symbols (file_id, name, kind, start_line, end_line, token_count, symbol_type) VALUES (?, ?, ?, ?, ?, ?, ?)", - (file_id, "test", "function", 1, 1, 10, "function") - ) - - cursor.execute( - "INSERT INTO semantic_metadata (file_id, summary, keywords, purpose, llm_tool, generated_at) VALUES (?, ?, ?, ?, ?, ?)", - (file_id, "Test function", '["test", "example"]', "Testing", "gemini", 1234567890.0) - ) - - cursor.execute( - "INSERT INTO subdirs (name, index_path, files_count, direct_files, last_updated) VALUES (?, ?, ?, ?, ?)", - ("subdir", "/test/subdir/_index.db", 5, 2, 1234567890.0) - ) - - conn.commit() - conn.close() - - # Now initialize store - this should trigger migration - store.initialize() - - # Verify schema version is now the latest - conn = store._get_connection() - version_row = conn.execute("PRAGMA user_version").fetchone() - assert version_row[0] == DirIndexStore.SCHEMA_VERSION, ( - f"Expected schema version {DirIndexStore.SCHEMA_VERSION}, got {version_row[0]}" - ) - - # Check that deprecated columns are removed - # 1. Check semantic_metadata doesn't have keywords column - cursor = conn.execute("PRAGMA table_info(semantic_metadata)") - columns = {row[1] for row in cursor.fetchall()} - assert "keywords" not in columns, "semantic_metadata.keywords should be removed" - assert "summary" in columns, "semantic_metadata.summary should exist" - assert "purpose" in columns, "semantic_metadata.purpose should exist" - - # 2. 
Check symbols doesn't have token_count or symbol_type - cursor = conn.execute("PRAGMA table_info(symbols)") - columns = {row[1] for row in cursor.fetchall()} - assert "token_count" not in columns, "symbols.token_count should be removed" - assert "symbol_type" not in columns, "symbols.symbol_type should be removed" - assert "kind" in columns, "symbols.kind should exist" - - # 3. Check subdirs doesn't have direct_files - cursor = conn.execute("PRAGMA table_info(subdirs)") - columns = {row[1] for row in cursor.fetchall()} - assert "direct_files" not in columns, "subdirs.direct_files should be removed" - assert "files_count" in columns, "subdirs.files_count should exist" - - # 4. Verify data integrity - data should be preserved - semantic = store.get_semantic_metadata(file_id) - assert semantic is not None, "Semantic metadata should be preserved" - assert semantic["summary"] == "Test function" - assert semantic["purpose"] == "Testing" - # Keywords should now come from file_keywords table (empty after migration since we didn't populate it) - assert isinstance(semantic["keywords"], list) - - store.close() - - def test_new_database_has_clean_schema(self): - """Test that new databases are created with clean schema (latest).""" - with tempfile.TemporaryDirectory() as tmpdir: - db_path = Path(tmpdir) / "_index.db" - store = DirIndexStore(db_path) - store.initialize() - - conn = store._get_connection() - - # Verify schema version is the latest - version_row = conn.execute("PRAGMA user_version").fetchone() - assert version_row[0] == DirIndexStore.SCHEMA_VERSION - - # Check that new schema doesn't have deprecated columns - cursor = conn.execute("PRAGMA table_info(semantic_metadata)") - columns = {row[1] for row in cursor.fetchall()} - assert "keywords" not in columns - - cursor = conn.execute("PRAGMA table_info(symbols)") - columns = {row[1] for row in cursor.fetchall()} - assert "token_count" not in columns - assert "symbol_type" not in columns - - cursor = 
conn.execute("PRAGMA table_info(subdirs)") - columns = {row[1] for row in cursor.fetchall()} - assert "direct_files" not in columns - - store.close() - - def test_semantic_metadata_keywords_from_normalized_table(self): - """Test that keywords are read from file_keywords table, not JSON column.""" - with tempfile.TemporaryDirectory() as tmpdir: - db_path = Path(tmpdir) / "_index.db" - store = DirIndexStore(db_path) - store.initialize() - - # Add a file - file_id = store.add_file( - name="test.py", - full_path="/test/test.py", - content="def test(): pass", - language="python", - symbols=[] - ) - - # Add semantic metadata with keywords - store.add_semantic_metadata( - file_id=file_id, - summary="Test function", - keywords=["test", "example", "function"], - purpose="Testing", - llm_tool="gemini" - ) - - # Retrieve and verify keywords come from normalized table - semantic = store.get_semantic_metadata(file_id) - assert semantic is not None - assert sorted(semantic["keywords"]) == ["example", "function", "test"] - - # Verify keywords are in normalized tables - conn = store._get_connection() - keyword_count = conn.execute( - """SELECT COUNT(*) FROM file_keywords WHERE file_id = ?""", - (file_id,) - ).fetchone()[0] - assert keyword_count == 3 - - store.close() - - def test_symbols_insert_without_deprecated_fields(self): - """Test that symbols can be inserted without token_count and symbol_type.""" - with tempfile.TemporaryDirectory() as tmpdir: - db_path = Path(tmpdir) / "_index.db" - store = DirIndexStore(db_path) - store.initialize() - - # Add file with symbols - symbols = [ - Symbol(name="test_func", kind="function", range=(1, 5)), - Symbol(name="TestClass", kind="class", range=(7, 20)), - ] - - file_id = store.add_file( - name="test.py", - full_path="/test/test.py", - content="def test_func(): pass\n\nclass TestClass:\n pass", - language="python", - symbols=symbols - ) - - # Verify symbols were inserted - conn = store._get_connection() - symbol_rows = conn.execute( - 
"SELECT name, kind, start_line, end_line FROM symbols WHERE file_id = ?", - (file_id,) - ).fetchall() - - assert len(symbol_rows) == 2 - assert symbol_rows[0]["name"] == "test_func" - assert symbol_rows[0]["kind"] == "function" - assert symbol_rows[1]["name"] == "TestClass" - assert symbol_rows[1]["kind"] == "class" - - store.close() - - def test_subdir_operations_without_direct_files(self): - """Test that subdir operations work without direct_files field.""" - with tempfile.TemporaryDirectory() as tmpdir: - db_path = Path(tmpdir) / "_index.db" - store = DirIndexStore(db_path) - store.initialize() - - # Register subdir (direct_files parameter is ignored) - store.register_subdir( - name="subdir", - index_path="/test/subdir/_index.db", - files_count=10, - direct_files=5 # This should be ignored - ) - - # Retrieve and verify - subdir = store.get_subdir("subdir") - assert subdir is not None - assert subdir.name == "subdir" - assert subdir.files_count == 10 - assert not hasattr(subdir, "direct_files") # Should not have this attribute - - # Update stats (direct_files parameter is ignored) - store.update_subdir_stats("subdir", files_count=15, direct_files=7) - - # Verify update - subdir = store.get_subdir("subdir") - assert subdir.files_count == 15 - - store.close() - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) diff --git a/codex-lens/tests/test_search_comparison.py b/codex-lens/tests/test_search_comparison.py deleted file mode 100644 index 878dea23..00000000 --- a/codex-lens/tests/test_search_comparison.py +++ /dev/null @@ -1,540 +0,0 @@ -"""Comprehensive comparison test for vector search vs hybrid search. - -This test diagnoses why vector search returns empty results and compares -performance between different search modes. 
-""" - -import json -import sqlite3 -import tempfile -import time -from pathlib import Path -from typing import Dict, List, Any - -import pytest - -from codexlens.entities import SearchResult -from codexlens.search.hybrid_search import HybridSearchEngine -from codexlens.storage.dir_index import DirIndexStore - -# Check semantic search availability -try: - from codexlens.semantic.embedder import Embedder - from codexlens.semantic.vector_store import VectorStore - from codexlens.semantic import SEMANTIC_AVAILABLE - SEMANTIC_DEPS_AVAILABLE = SEMANTIC_AVAILABLE -except ImportError: - SEMANTIC_DEPS_AVAILABLE = False - - -class TestSearchComparison: - """Comprehensive comparison of search modes.""" - - @pytest.fixture - def sample_project_db(self): - """Create sample project database with semantic chunks.""" - with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as tmpdir: - db_path = Path(tmpdir) / "_index.db" - - store = DirIndexStore(db_path) - store.initialize() - - # Sample files with varied content for testing - sample_files = { - "src/auth/authentication.py": """ -def authenticate_user(username: str, password: str) -> bool: - '''Authenticate user with credentials using bcrypt hashing. - - This function validates user credentials against the database - and returns True if authentication succeeds. - ''' - hashed = hash_password(password) - return verify_credentials(username, hashed) - -def hash_password(password: str) -> str: - '''Hash password using bcrypt algorithm.''' - import bcrypt - return bcrypt.hashpw(password.encode(), bcrypt.gensalt()).decode() - -def verify_credentials(user: str, pwd_hash: str) -> bool: - '''Verify user credentials against database.''' - # Database verification logic - return True -""", - "src/auth/authorization.py": """ -def authorize_action(user_id: int, resource: str, action: str) -> bool: - '''Authorize user action on resource using role-based access control. 
- - Checks if user has permission to perform action on resource - based on their assigned roles. - ''' - roles = get_user_roles(user_id) - permissions = get_role_permissions(roles) - return has_permission(permissions, resource, action) - -def get_user_roles(user_id: int) -> List[str]: - '''Fetch user roles from database.''' - return ["user", "admin"] - -def has_permission(permissions, resource, action) -> bool: - '''Check if permissions allow action on resource.''' - return True -""", - "src/models/user.py": """ -from dataclasses import dataclass -from typing import Optional - -@dataclass -class User: - '''User model representing application users. - - Stores user profile information and authentication state. - ''' - id: int - username: str - email: str - password_hash: str - is_active: bool = True - - def authenticate(self, password: str) -> bool: - '''Authenticate this user with password.''' - from auth.authentication import verify_credentials - return verify_credentials(self.username, password) - - def has_role(self, role: str) -> bool: - '''Check if user has specific role.''' - return True -""", - "src/api/user_api.py": """ -from flask import Flask, request, jsonify -from models.user import User - -app = Flask(__name__) - -@app.route('/api/user/', methods=['GET']) -def get_user(user_id: int): - '''Get user by ID from database. - - Returns user profile information as JSON. - ''' - user = User.query.get(user_id) - return jsonify(user.to_dict()) - -@app.route('/api/user/login', methods=['POST']) -def login(): - '''User login endpoint using username and password. - - Authenticates user and returns session token. 
- ''' - data = request.json - username = data.get('username') - password = data.get('password') - - if authenticate_user(username, password): - token = generate_session_token(username) - return jsonify({'token': token}) - return jsonify({'error': 'Invalid credentials'}), 401 -""", - "tests/test_auth.py": """ -import pytest -from auth.authentication import authenticate_user, hash_password - -class TestAuthentication: - '''Test authentication functionality.''' - - def test_authenticate_valid_user(self): - '''Test authentication with valid credentials.''' - assert authenticate_user("testuser", "password123") == True - - def test_authenticate_invalid_user(self): - '''Test authentication with invalid credentials.''' - assert authenticate_user("invalid", "wrong") == False - - def test_password_hashing(self): - '''Test password hashing produces unique hashes.''' - hash1 = hash_password("password") - hash2 = hash_password("password") - assert hash1 != hash2 # Salts should differ -""", - } - - # Insert files into database - with store._get_connection() as conn: - for file_path, content in sample_files.items(): - name = file_path.split('/')[-1] - lang = "python" - conn.execute( - """INSERT INTO files (name, full_path, content, language, mtime) - VALUES (?, ?, ?, ?, ?)""", - (name, file_path, content, lang, time.time()) - ) - conn.commit() - - yield db_path - store.close() - - def _check_semantic_chunks_table(self, db_path: Path) -> Dict[str, Any]: - """Check if semantic_chunks table exists and has data.""" - with sqlite3.connect(db_path) as conn: - cursor = conn.execute( - "SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_chunks'" - ) - table_exists = cursor.fetchone() is not None - - chunk_count = 0 - if table_exists: - cursor = conn.execute("SELECT COUNT(*) FROM semantic_chunks") - chunk_count = cursor.fetchone()[0] - - return { - "table_exists": table_exists, - "chunk_count": chunk_count, - } - - def _create_vector_index(self, db_path: Path) -> 
Dict[str, Any]: - """Create vector embeddings for indexed files.""" - if not SEMANTIC_DEPS_AVAILABLE: - return { - "success": False, - "error": "Semantic dependencies not available", - "chunks_created": 0, - } - - try: - from codexlens.semantic.chunker import Chunker, ChunkConfig - - # Initialize embedder and vector store - embedder = Embedder(profile="code") - vector_store = VectorStore(db_path) - chunker = Chunker(config=ChunkConfig(max_chunk_size=2000)) - - # Read files from database - with sqlite3.connect(db_path) as conn: - conn.row_factory = sqlite3.Row - cursor = conn.execute("SELECT full_path, content FROM files") - files = cursor.fetchall() - - chunks_created = 0 - for file_row in files: - file_path = file_row["full_path"] - content = file_row["content"] - - # Create semantic chunks using sliding window - chunks = chunker.chunk_sliding_window( - content, - file_path=file_path, - language="python" - ) - - # Generate embeddings - for chunk in chunks: - embedding = embedder.embed_single(chunk.content) - chunk.embedding = embedding - - # Store chunks - if chunks: # Only store if we have chunks - vector_store.add_chunks(chunks, file_path) - chunks_created += len(chunks) - - return { - "success": True, - "chunks_created": chunks_created, - "files_processed": len(files), - } - except Exception as exc: - return { - "success": False, - "error": str(exc), - "chunks_created": 0, - } - - def _run_search_mode( - self, - db_path: Path, - query: str, - mode: str, - limit: int = 10, - ) -> Dict[str, Any]: - """Run search in specified mode and collect metrics.""" - engine = HybridSearchEngine() - - # Map mode to parameters - pure_vector = False - if mode == "exact": - enable_fuzzy, enable_vector = False, False - elif mode == "fuzzy": - enable_fuzzy, enable_vector = True, False - elif mode == "vector": - enable_fuzzy, enable_vector = False, True - pure_vector = True # Use pure vector mode for vector-only search - elif mode == "hybrid": - enable_fuzzy, enable_vector = True, 
True - else: - raise ValueError(f"Invalid mode: {mode}") - - # Measure search time - start_time = time.time() - try: - results = engine.search( - db_path, - query, - limit=limit, - enable_fuzzy=enable_fuzzy, - enable_vector=enable_vector, - pure_vector=pure_vector, - ) - elapsed_ms = (time.time() - start_time) * 1000 - - return { - "success": True, - "mode": mode, - "query": query, - "result_count": len(results), - "elapsed_ms": elapsed_ms, - "results": [ - { - "path": r.path, - "score": r.score, - "excerpt": r.excerpt[:100] if r.excerpt else "", - "source": getattr(r, "search_source", None), - } - for r in results[:5] # Top 5 results - ], - } - except Exception as exc: - elapsed_ms = (time.time() - start_time) * 1000 - return { - "success": False, - "mode": mode, - "query": query, - "error": str(exc), - "elapsed_ms": elapsed_ms, - "result_count": 0, - } - - @pytest.mark.skipif(not SEMANTIC_DEPS_AVAILABLE, reason="Semantic dependencies not available") - def test_full_search_comparison_with_vectors(self, sample_project_db): - """Complete search comparison test with vector embeddings.""" - db_path = sample_project_db - - # Step 1: Check initial state - print("\n=== Step 1: Checking initial database state ===") - initial_state = self._check_semantic_chunks_table(db_path) - print(f"Table exists: {initial_state['table_exists']}") - print(f"Chunk count: {initial_state['chunk_count']}") - - # Step 2: Create vector index - print("\n=== Step 2: Creating vector embeddings ===") - vector_result = self._create_vector_index(db_path) - print(f"Success: {vector_result['success']}") - if vector_result['success']: - print(f"Chunks created: {vector_result['chunks_created']}") - print(f"Files processed: {vector_result['files_processed']}") - else: - print(f"Error: {vector_result.get('error', 'Unknown')}") - - # Step 3: Verify vector index was created - print("\n=== Step 3: Verifying vector index ===") - final_state = self._check_semantic_chunks_table(db_path) - print(f"Table exists: 
{final_state['table_exists']}") - print(f"Chunk count: {final_state['chunk_count']}") - - # Step 4: Run comparison tests - print("\n=== Step 4: Running search mode comparison ===") - test_queries = [ - "authenticate user credentials", # Semantic query - "authentication", # Keyword query - "password hashing bcrypt", # Multi-term query - ] - - comparison_results = [] - for query in test_queries: - print(f"\n--- Query: '{query}' ---") - for mode in ["exact", "fuzzy", "vector", "hybrid"]: - result = self._run_search_mode(db_path, query, mode, limit=10) - comparison_results.append(result) - - print(f"\n{mode.upper()} mode:") - print(f" Success: {result['success']}") - print(f" Results: {result['result_count']}") - print(f" Time: {result['elapsed_ms']:.2f}ms") - if result['success'] and result['result_count'] > 0: - print(f" Top result: {result['results'][0]['path']}") - print(f" Score: {result['results'][0]['score']:.3f}") - print(f" Source: {result['results'][0]['source']}") - elif not result['success']: - print(f" Error: {result.get('error', 'Unknown')}") - - # Step 5: Generate comparison report - print("\n=== Step 5: Comparison Summary ===") - - # Group by mode - mode_stats = {} - for result in comparison_results: - mode = result['mode'] - if mode not in mode_stats: - mode_stats[mode] = { - "total_searches": 0, - "successful_searches": 0, - "total_results": 0, - "total_time_ms": 0, - "empty_results": 0, - } - - stats = mode_stats[mode] - stats["total_searches"] += 1 - if result['success']: - stats["successful_searches"] += 1 - stats["total_results"] += result['result_count'] - if result['result_count'] == 0: - stats["empty_results"] += 1 - stats["total_time_ms"] += result['elapsed_ms'] - - # Print summary table - print("\nMode | Queries | Success | Avg Results | Avg Time | Empty Results") - print("-" * 75) - for mode in ["exact", "fuzzy", "vector", "hybrid"]: - if mode in mode_stats: - stats = mode_stats[mode] - avg_results = stats["total_results"] / 
stats["total_searches"] - avg_time = stats["total_time_ms"] / stats["total_searches"] - print( - f"{mode:9} | {stats['total_searches']:7} | " - f"{stats['successful_searches']:7} | {avg_results:11.1f} | " - f"{avg_time:8.1f}ms | {stats['empty_results']:13}" - ) - - # Assertions - assert initial_state is not None - if vector_result['success']: - assert final_state['chunk_count'] > 0, "Vector index should contain chunks" - - # Find vector search results - vector_results = [r for r in comparison_results if r['mode'] == 'vector'] - if vector_results: - # At least one vector search should return results if index was created - has_vector_results = any(r.get('result_count', 0) > 0 for r in vector_results) - if not has_vector_results: - print("\n⚠️ WARNING: Vector index created but vector search returned no results!") - print("This indicates a potential issue with vector search implementation.") - - def test_search_comparison_without_vectors(self, sample_project_db): - """Search comparison test without vector embeddings (baseline).""" - db_path = sample_project_db - - print("\n=== Testing search without vector embeddings ===") - - # Check state - state = self._check_semantic_chunks_table(db_path) - print(f"Semantic chunks table exists: {state['table_exists']}") - print(f"Chunk count: {state['chunk_count']}") - - # Run exact and fuzzy searches only - test_queries = ["authentication", "user password", "bcrypt hash"] - - for query in test_queries: - print(f"\n--- Query: '{query}' ---") - for mode in ["exact", "fuzzy"]: - result = self._run_search_mode(db_path, query, mode, limit=10) - - print(f"{mode.upper()}: {result['result_count']} results in {result['elapsed_ms']:.2f}ms") - if result['success'] and result['result_count'] > 0: - print(f" Top: {result['results'][0]['path']} (score: {result['results'][0]['score']:.3f})") - - # Test vector search without embeddings (should return empty) - print(f"\n--- Testing vector search without embeddings ---") - vector_result = 
self._run_search_mode(db_path, "authentication", "vector", limit=10) - print(f"Vector search result count: {vector_result['result_count']}") - print(f"This is expected to be 0 without embeddings: {vector_result['result_count'] == 0}") - - assert vector_result['result_count'] == 0, \ - "Vector search should return empty results when no embeddings exist" - - -class TestDiagnostics: - """Diagnostic tests to identify specific issues.""" - - @pytest.fixture - def empty_db(self): - """Create empty database.""" - with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: - db_path = Path(f.name) - - store = DirIndexStore(db_path) - store.initialize() - store.close() - - yield db_path - if db_path.exists(): - for attempt in range(5): - try: - db_path.unlink() - break - except PermissionError: - time.sleep(0.05 * (attempt + 1)) - else: - # Best-effort cleanup (Windows SQLite locks can linger briefly). - try: - db_path.unlink(missing_ok=True) - except (PermissionError, OSError): - pass - - def test_diagnose_empty_database(self, empty_db): - """Diagnose behavior with empty database.""" - engine = HybridSearchEngine() - - print("\n=== Diagnosing empty database ===") - - # Test all modes - for mode_config in [ - ("exact", False, False), - ("fuzzy", True, False), - ("vector", False, True), - ("hybrid", True, True), - ]: - mode, enable_fuzzy, enable_vector = mode_config - - try: - results = engine.search( - empty_db, - "test", - limit=10, - enable_fuzzy=enable_fuzzy, - enable_vector=enable_vector, - ) - print(f"{mode}: {len(results)} results (OK)") - assert isinstance(results, list) - assert len(results) == 0 - except Exception as exc: - print(f"{mode}: ERROR - {exc}") - # Should not raise errors, should return empty list - pytest.fail(f"Search mode '{mode}' raised exception on empty database: {exc}") - - @pytest.mark.skipif(not SEMANTIC_DEPS_AVAILABLE, reason="Semantic dependencies not available") - def test_diagnose_embedder_initialization(self): - """Test embedder 
initialization and embedding generation.""" - print("\n=== Diagnosing embedder ===") - - try: - embedder = Embedder(profile="code") - print(f"✓ Embedder initialized (model: {embedder.model_name})") - print(f" Embedding dimension: {embedder.embedding_dim}") - - # Test embedding generation - test_text = "def authenticate_user(username, password):" - embedding = embedder.embed_single(test_text) - - print(f"✓ Generated embedding (length: {len(embedding)})") - print(f" Sample values: {embedding[:5]}") - - assert len(embedding) == embedder.embedding_dim - assert all(isinstance(v, float) for v in embedding) - - except Exception as exc: - print(f"✗ Embedder error: {exc}") - raise - - -if __name__ == "__main__": - # Run tests with pytest - pytest.main([__file__, "-v", "-s"]) diff --git a/codex-lens/tests/test_search_comprehensive.py b/codex-lens/tests/test_search_comprehensive.py deleted file mode 100644 index dcde8e9a..00000000 --- a/codex-lens/tests/test_search_comprehensive.py +++ /dev/null @@ -1,604 +0,0 @@ -"""Comprehensive tests for CodexLens search functionality. 
- -Tests cover: -- FTS5 text search (basic, phrase, boolean, wildcard) -- Chain search across directories -- Symbol search (by name, kind, filters) -- Files-only search mode -- Edge cases and error handling -""" - -import tempfile -import pytest -from pathlib import Path -from unittest.mock import MagicMock, patch - -from codexlens.storage.sqlite_store import SQLiteStore -from codexlens.storage.dir_index import DirIndexStore -from codexlens.storage.registry import RegistryStore -from codexlens.storage.path_mapper import PathMapper -from codexlens.search import ( - ChainSearchEngine, - SearchOptions, - SearchStats, - ChainSearchResult, - quick_search, -) -from codexlens.entities import IndexedFile, Symbol, SearchResult - - -# === Fixtures === - -@pytest.fixture -def temp_dir(): - """Create a temporary directory.""" - with tempfile.TemporaryDirectory() as tmpdir: - yield Path(tmpdir) - - -@pytest.fixture -def sample_files(): - """Sample file data for testing.""" - return [ - (IndexedFile( - path="/project/src/auth.py", - language="python", - symbols=[ - Symbol(name="authenticate", kind="function", range=(1, 10)), - Symbol(name="verify_token", kind="function", range=(12, 20)), - Symbol(name="AuthManager", kind="class", range=(22, 50)), - ], - ), """ -def authenticate(username, password): - '''Authenticate user with credentials.''' - user = find_user(username) - if user and check_password(user, password): - return create_token(user) - return None - -def verify_token(token): - '''Verify JWT token validity.''' - try: - payload = decode_token(token) - return payload - except TokenExpired: - return None - -class AuthManager: - '''Manages authentication state.''' - def __init__(self): - self.sessions = {} - - def login(self, user): - token = authenticate(user.name, user.password) - self.sessions[user.id] = token - return token -"""), - (IndexedFile( - path="/project/src/database.py", - language="python", - symbols=[ - Symbol(name="connect", kind="function", range=(1, 5)), - 
Symbol(name="query", kind="function", range=(7, 15)), - Symbol(name="DatabasePool", kind="class", range=(17, 40)), - ], - ), """ -def connect(host, port, database): - '''Establish database connection.''' - return Connection(host, port, database) - -def query(connection, sql, params=None): - '''Execute SQL query and return results.''' - cursor = connection.cursor() - cursor.execute(sql, params or []) - return cursor.fetchall() - -class DatabasePool: - '''Connection pool for database.''' - def __init__(self, size=10): - self.pool = [] - self.size = size - - def get_connection(self): - if self.pool: - return self.pool.pop() - return connect() -"""), - (IndexedFile( - path="/project/src/utils.py", - language="python", - symbols=[ - Symbol(name="format_date", kind="function", range=(1, 3)), - Symbol(name="parse_json", kind="function", range=(5, 10)), - Symbol(name="hash_password", kind="function", range=(12, 18)), - ], - ), """ -def format_date(date, fmt='%Y-%m-%d'): - return date.strftime(fmt) - -def parse_json(data): - '''Parse JSON string to dictionary.''' - import json - return json.loads(data) - -def hash_password(password, salt=None): - '''Hash password using bcrypt.''' - import hashlib - salt = salt or generate_salt() - return hashlib.sha256((password + salt).encode()).hexdigest() -"""), - ] - - -@pytest.fixture -def populated_store(temp_dir, sample_files): - """Create a populated SQLite store for testing.""" - db_path = temp_dir / "_index.db" - store = SQLiteStore(db_path) - store.initialize() - - for indexed_file, content in sample_files: - store.add_file(indexed_file, content) - - yield store - store.close() - - -@pytest.fixture -def populated_dir_store(temp_dir, sample_files): - """Create a populated DirIndexStore for testing.""" - db_path = temp_dir / "_index.db" - store = DirIndexStore(db_path) - - for indexed_file, content in sample_files: - store.add_file(indexed_file, content) - - yield store - store.close() - - -# === FTS5 Search Tests === - -class 
TestFTS5BasicSearch: - """Tests for basic FTS5 text search.""" - - def test_single_term_search(self, populated_store): - """Test search with a single term.""" - results = populated_store.search_fts("authenticate") - assert len(results) >= 1 - assert any("auth" in r.path.lower() for r in results) - - def test_case_insensitive_search(self, populated_store): - """Test that search is case insensitive.""" - results_lower = populated_store.search_fts("database") - results_upper = populated_store.search_fts("DATABASE") - results_mixed = populated_store.search_fts("DataBase") - - # All should return similar results - assert len(results_lower) == len(results_upper) == len(results_mixed) - - def test_partial_word_search(self, populated_store): - """Test search with partial words using wildcards.""" - results = populated_store.search_fts("auth*") - assert len(results) >= 1 - # Should match authenticate, authentication, AuthManager, etc. - - def test_multiple_terms_search(self, populated_store): - """Test search with multiple terms (implicit AND).""" - results = populated_store.search_fts("user password") - assert len(results) >= 1 - - def test_no_results_search(self, populated_store): - """Test search that returns no results.""" - results = populated_store.search_fts("nonexistent_xyz_term") - assert len(results) == 0 - - def test_search_with_limit(self, populated_store): - """Test search respects limit parameter.""" - results = populated_store.search_fts("def", limit=1) - assert len(results) <= 1 - - def test_search_returns_excerpt(self, populated_store): - """Test search results include excerpts.""" - results = populated_store.search_fts("authenticate") - assert len(results) >= 1 - # SearchResult should have excerpt field - for r in results: - assert hasattr(r, 'excerpt') - - -class TestFTS5AdvancedSearch: - """Tests for advanced FTS5 search features.""" - - def test_phrase_search(self, populated_store): - """Test exact phrase search with quotes.""" - results = 
populated_store.search_fts('"verify_token"') - assert len(results) >= 1 - - def test_boolean_or_search(self, populated_store): - """Test OR boolean search.""" - results = populated_store.search_fts("authenticate OR database") - # Should find files containing either term - assert len(results) >= 2 - - def test_boolean_not_search(self, populated_store): - """Test NOT boolean search.""" - all_results = populated_store.search_fts("def") - not_results = populated_store.search_fts("def NOT authenticate") - # NOT should return fewer results - assert len(not_results) <= len(all_results) - - def test_prefix_search(self, populated_store): - """Test prefix search with asterisk.""" - results = populated_store.search_fts("connect*") - assert len(results) >= 1 - # Should match connect, connection, etc. - - def test_special_characters_in_query(self, populated_store): - """Test search handles special characters gracefully.""" - # Should not raise an error - results = populated_store.search_fts("__init__") - # May or may not have results, but shouldn't crash - - def test_unicode_search(self, temp_dir): - """Test search with unicode content.""" - store = SQLiteStore(temp_dir / "_index.db") - store.initialize() - - indexed_file = IndexedFile( - path="/test/unicode.py", - language="python", - symbols=[Symbol(name="世界", kind="function", range=(1, 1))], - ) - store.add_file(indexed_file, "def 世界(): return '你好世界'") - - results = store.search_fts("世界") - assert len(results) == 1 - - store.close() - - -class TestFTS5Pagination: - """Tests for FTS5 search pagination.""" - - def test_offset_pagination(self, temp_dir): - """Test search with offset for pagination.""" - store = SQLiteStore(temp_dir / "_index.db") - store.initialize() - - # Add multiple files - for i in range(10): - indexed_file = IndexedFile( - path=f"/test/file{i}.py", - language="python", - symbols=[], - ) - store.add_file(indexed_file, f"searchable content number {i}") - - page1 = store.search_fts("searchable", limit=3, 
offset=0) - page2 = store.search_fts("searchable", limit=3, offset=3) - page3 = store.search_fts("searchable", limit=3, offset=6) - - # Each page should have different results - paths1 = {r.path for r in page1} - paths2 = {r.path for r in page2} - paths3 = {r.path for r in page3} - - assert paths1.isdisjoint(paths2) - assert paths2.isdisjoint(paths3) - - store.close() - - def test_offset_beyond_results(self, populated_store): - """Test offset beyond available results.""" - results = populated_store.search_fts("authenticate", limit=10, offset=1000) - assert len(results) == 0 - - -# === Symbol Search Tests === - -class TestSymbolSearch: - """Tests for symbol search functionality.""" - - def test_search_by_name(self, populated_store): - """Test symbol search by name.""" - results = populated_store.search_symbols("auth") - assert len(results) >= 1 - assert any("auth" in s.name.lower() for s in results) - - def test_search_by_kind_function(self, populated_store): - """Test symbol search filtered by kind=function.""" - results = populated_store.search_symbols("", kind="function") - assert all(s.kind == "function" for s in results) - - def test_search_by_kind_class(self, populated_store): - """Test symbol search filtered by kind=class.""" - results = populated_store.search_symbols("", kind="class") - assert all(s.kind == "class" for s in results) - assert any("Manager" in s.name or "Pool" in s.name for s in results) - - def test_search_symbols_with_limit(self, populated_store): - """Test symbol search respects limit.""" - results = populated_store.search_symbols("", limit=2) - assert len(results) <= 2 - - def test_search_symbols_returns_range(self, populated_store): - """Test symbol search results include line range.""" - results = populated_store.search_symbols("authenticate") - assert len(results) >= 1 - for sym in results: - assert hasattr(sym, 'range') - assert len(sym.range) == 2 - assert sym.range[0] <= sym.range[1] - - -# === Chain Search Tests === - -class 
TestChainSearchEngine: - """Tests for ChainSearchEngine.""" - - @pytest.fixture - def mock_registry(self): - """Create a mock registry.""" - registry = MagicMock(spec=RegistryStore) - registry.find_nearest_index.return_value = None - return registry - - @pytest.fixture - def mock_mapper(self): - """Create a mock path mapper.""" - return MagicMock(spec=PathMapper) - - def test_search_no_index_found(self, mock_registry, mock_mapper): - """Test search when no index is found.""" - mock_mapper.source_to_index_db.return_value = Path("/nonexistent/_index.db") - - engine = ChainSearchEngine(mock_registry, mock_mapper) - result = engine.search("test", Path("/nonexistent")) - - assert result.results == [] - assert result.symbols == [] - assert result.stats.dirs_searched == 0 - - def test_search_options_depth(self, mock_registry, mock_mapper, temp_dir): - """Test search respects depth option.""" - # Create a simple index structure - db_path = temp_dir / "_index.db" - store = DirIndexStore(db_path) - store.initialize() - store.add_file( - name="test.py", - full_path=str(temp_dir / "test.py"), - content="test content searchable", - language="python", - ) - store.close() - - mock_mapper.source_to_index_db.return_value = db_path - - engine = ChainSearchEngine(mock_registry, mock_mapper) - options = SearchOptions(depth=0) # Only current dir - - result = engine.search("test", temp_dir, options) - - # With depth=0, should only search current directory - assert result.stats.dirs_searched <= 1 - - def test_search_files_only(self, mock_registry, mock_mapper, temp_dir): - """Test search_files_only returns only paths.""" - db_path = temp_dir / "_index.db" - store = DirIndexStore(db_path) - store.initialize() - store.add_file( - name="test.py", - full_path=str(temp_dir / "test.py"), - content="searchable content here", - language="python", - ) - store.close() - - mock_mapper.source_to_index_db.return_value = db_path - - engine = ChainSearchEngine(mock_registry, mock_mapper) - paths = 
engine.search_files_only("searchable", temp_dir) - - assert isinstance(paths, list) - for p in paths: - assert isinstance(p, str) - - def test_search_symbols_engine(self, mock_registry, mock_mapper, temp_dir): - """Test symbol search through engine.""" - db_path = temp_dir / "_index.db" - store = DirIndexStore(db_path) - store.initialize() - store.add_file( - name="test.py", - full_path=str(temp_dir / "test.py"), - content="def my_function(): pass", - language="python", - symbols=[Symbol(name="my_function", kind="function", range=(1, 5))], - ) - store.close() - - mock_mapper.source_to_index_db.return_value = db_path - - engine = ChainSearchEngine(mock_registry, mock_mapper) - symbols = engine.search_symbols("my_func", temp_dir) - - assert len(symbols) >= 1 - assert symbols[0].name == "my_function" - - def test_search_result_stats(self, mock_registry, mock_mapper, temp_dir): - """Test search result includes proper stats.""" - db_path = temp_dir / "_index.db" - store = DirIndexStore(db_path) - store.initialize() - store.add_file( - name="test.py", - full_path=str(temp_dir / "test.py"), - content="content to search", - language="python", - ) - store.close() - - mock_mapper.source_to_index_db.return_value = db_path - - engine = ChainSearchEngine(mock_registry, mock_mapper) - result = engine.search("content", temp_dir) - - assert result.stats.time_ms >= 0 - assert result.stats.dirs_searched >= 0 - assert isinstance(result.stats.errors, list) - - -class TestSearchOptions: - """Tests for SearchOptions configuration.""" - - def test_default_options(self): - """Test default search options.""" - options = SearchOptions() - assert options.depth == -1 - assert options.max_workers == 8 - assert options.limit_per_dir == 10 - assert options.total_limit == 100 - assert options.include_symbols is False - assert options.files_only is False - - def test_custom_options(self): - """Test custom search options.""" - options = SearchOptions( - depth=3, - max_workers=4, - limit_per_dir=5, 
- total_limit=50, - include_symbols=True, - files_only=True, - ) - assert options.depth == 3 - assert options.max_workers == 4 - assert options.limit_per_dir == 5 - assert options.total_limit == 50 - assert options.include_symbols is True - assert options.files_only is True - - -# === Edge Cases and Error Handling === - -class TestSearchEdgeCases: - """Edge case tests for search functionality.""" - - def test_empty_query(self, populated_store): - """Test search with empty query.""" - # Empty query may raise an error or return empty results - try: - results = populated_store.search_fts("") - assert isinstance(results, list) - except Exception: - # Some implementations may reject empty queries - pass - - def test_whitespace_query(self, populated_store): - """Test search with whitespace-only query.""" - # Whitespace query may raise an error or return empty results - try: - results = populated_store.search_fts(" ") - assert isinstance(results, list) - except Exception: - # Some implementations may reject whitespace queries - pass - - def test_very_long_query(self, populated_store): - """Test search with very long query.""" - long_query = "function " * 100 # Repeat valid word - try: - results = populated_store.search_fts(long_query) - assert isinstance(results, list) - except Exception: - # Very long queries may be rejected - pass - - def test_special_sql_characters(self, populated_store): - """Test search handles SQL-like characters safely.""" - # These should not cause SQL injection - may raise FTS syntax errors - queries = ["test", "function*", "test OR data"] - for q in queries: - results = populated_store.search_fts(q) - assert isinstance(results, list) - - def test_search_reopened_store(self, temp_dir, sample_files): - """Test search works after store is reopened.""" - db_path = temp_dir / "_index.db" - store = SQLiteStore(db_path) - store.initialize() - store.add_file(sample_files[0][0], sample_files[0][1]) - store.close() - - # Reopen and search - store2 = 
SQLiteStore(db_path) - store2.initialize() - results = store2.search_fts("authenticate") - assert len(results) >= 1 - store2.close() - - def test_concurrent_searches(self, populated_store): - """Test multiple concurrent searches.""" - import threading - - results = [] - errors = [] - - def search_task(query): - try: - r = populated_store.search_fts(query) - results.append(len(r)) - except Exception as e: - errors.append(e) - - threads = [ - threading.Thread(target=search_task, args=("authenticate",)), - threading.Thread(target=search_task, args=("database",)), - threading.Thread(target=search_task, args=("password",)), - ] - - for t in threads: - t.start() - for t in threads: - t.join() - - assert len(errors) == 0 - assert len(results) == 3 - - -class TestChainSearchResult: - """Tests for ChainSearchResult dataclass.""" - - def test_result_structure(self): - """Test ChainSearchResult has all required fields.""" - result = ChainSearchResult( - query="test", - results=[], - symbols=[], - stats=SearchStats(), - ) - assert result.query == "test" - assert result.results == [] - assert result.related_results == [] - assert result.symbols == [] - assert result.stats.dirs_searched == 0 - - -class TestSearchStats: - """Tests for SearchStats dataclass.""" - - def test_default_stats(self): - """Test default search stats.""" - stats = SearchStats() - assert stats.dirs_searched == 0 - assert stats.files_matched == 0 - assert stats.time_ms == 0 - assert stats.errors == [] - - def test_stats_with_errors(self): - """Test search stats with errors.""" - stats = SearchStats(errors=["Error 1", "Error 2"]) - assert len(stats.errors) == 2 diff --git a/codex-lens/tests/test_search_full_coverage.py b/codex-lens/tests/test_search_full_coverage.py deleted file mode 100644 index fa90ef82..00000000 --- a/codex-lens/tests/test_search_full_coverage.py +++ /dev/null @@ -1,1267 +0,0 @@ -"""Full coverage tests for CodexLens search functionality. 
- -Comprehensive test suite covering: -- Chain search engine internals -- Multi-directory hierarchical search -- Result merging and deduplication -- Context manager behavior -- Semantic search integration -- Edge cases and error recovery -- Parallel search stress tests -- Boundary conditions -""" - -import tempfile -import pytest -import threading -import time -from pathlib import Path -from unittest.mock import MagicMock, patch, PropertyMock -from concurrent.futures import ThreadPoolExecutor - -from codexlens.storage.sqlite_store import SQLiteStore -from codexlens.storage.dir_index import DirIndexStore -from codexlens.storage.registry import RegistryStore -from codexlens.storage.path_mapper import PathMapper -from codexlens.search import ( - ChainSearchEngine, - SearchOptions, - SearchStats, - ChainSearchResult, - quick_search, -) -from codexlens.entities import IndexedFile, Symbol, SearchResult - - -# === Fixtures === - -@pytest.fixture -def temp_dir(): - """Create a temporary directory.""" - with tempfile.TemporaryDirectory() as tmpdir: - yield Path(tmpdir) - - -@pytest.fixture -def mock_registry(): - """Create a mock registry.""" - registry = MagicMock(spec=RegistryStore) - registry.find_nearest_index.return_value = None - return registry - - -@pytest.fixture -def mock_mapper(): - """Create a mock path mapper.""" - return MagicMock(spec=PathMapper) - - -@pytest.fixture -def sample_code_files(): - """Sample code file data for comprehensive testing.""" - return [ - # Authentication module - { - "name": "auth.py", - "language": "python", - "content": """ -def authenticate(username, password): - '''Authenticate user with credentials.''' - user = find_user(username) - if user and check_password(user, password): - return create_token(user) - return None - -def verify_token(token): - '''Verify JWT token validity.''' - try: - payload = decode_token(token) - return payload - except TokenExpired: - return None - -class AuthManager: - '''Manages authentication state.''' - 
def __init__(self): - self.sessions = {} - - def login(self, user): - token = authenticate(user.name, user.password) - self.sessions[user.id] = token - return token -""", - "symbols": [ - Symbol(name="authenticate", kind="function", range=(2, 8)), - Symbol(name="verify_token", kind="function", range=(10, 17)), - Symbol(name="AuthManager", kind="class", range=(19, 28)), - ], - }, - # Database module - { - "name": "database.py", - "language": "python", - "content": """ -def connect(host, port, database): - '''Establish database connection.''' - return Connection(host, port, database) - -def query(connection, sql, params=None): - '''Execute SQL query and return results.''' - cursor = connection.cursor() - cursor.execute(sql, params or []) - return cursor.fetchall() - -class DatabasePool: - '''Connection pool for database.''' - def __init__(self, size=10): - self.pool = [] - self.size = size - - def get_connection(self): - if self.pool: - return self.pool.pop() - return connect() -""", - "symbols": [ - Symbol(name="connect", kind="function", range=(2, 4)), - Symbol(name="query", kind="function", range=(6, 10)), - Symbol(name="DatabasePool", kind="class", range=(12, 21)), - ], - }, - # Utils module - { - "name": "utils.py", - "language": "python", - "content": """ -def format_date(date, fmt='%Y-%m-%d'): - return date.strftime(fmt) - -def parse_json(data): - '''Parse JSON string to dictionary.''' - import json - return json.loads(data) - -def hash_password(password, salt=None): - '''Hash password using bcrypt.''' - import hashlib - salt = salt or generate_salt() - return hashlib.sha256((password + salt).encode()).hexdigest() -""", - "symbols": [ - Symbol(name="format_date", kind="function", range=(2, 3)), - Symbol(name="parse_json", kind="function", range=(5, 8)), - Symbol(name="hash_password", kind="function", range=(10, 14)), - ], - }, - ] - - -@pytest.fixture -def populated_single_store(temp_dir, sample_code_files): - """Create a single populated DirIndexStore.""" - 
db_path = temp_dir / "_index.db" - store = DirIndexStore(db_path) - store.initialize() - - for file_data in sample_code_files: - store.add_file( - name=file_data["name"], - full_path=str(temp_dir / file_data["name"]), - content=file_data["content"], - language=file_data["language"], - symbols=file_data["symbols"], - ) - - yield store - store.close() - - -@pytest.fixture -def hierarchical_index_structure(temp_dir, sample_code_files): - """Create a multi-level directory index structure for chain search testing. - - Structure: - project/ - _index.db (root) - src/ - _index.db - auth/ - _index.db - db/ - _index.db - tests/ - _index.db - """ - structure = {} - - # Root directory - root_dir = temp_dir / "project" - root_dir.mkdir() - root_db = root_dir / "_index.db" - root_store = DirIndexStore(root_db) - root_store.initialize() - root_store.add_file( - name="main.py", - full_path=str(root_dir / "main.py"), - content="# Main entry point\nfrom src import auth, db\ndef main(): pass", - language="python", - symbols=[Symbol(name="main", kind="function", range=(3, 3))], - ) - structure["root"] = {"path": root_dir, "db": root_db, "store": root_store} - - # src directory - src_dir = root_dir / "src" - src_dir.mkdir() - src_db = src_dir / "_index.db" - src_store = DirIndexStore(src_db) - src_store.initialize() - src_store.add_file( - name="__init__.py", - full_path=str(src_dir / "__init__.py"), - content="# Source package\nfrom .auth import authenticate\nfrom .db import connect", - language="python", - ) - structure["src"] = {"path": src_dir, "db": src_db, "store": src_store} - - # src/auth directory - auth_dir = src_dir / "auth" - auth_dir.mkdir() - auth_db = auth_dir / "_index.db" - auth_store = DirIndexStore(auth_db) - auth_store.initialize() - auth_store.add_file( - name="auth.py", - full_path=str(auth_dir / "auth.py"), - content=sample_code_files[0]["content"], - language="python", - symbols=sample_code_files[0]["symbols"], - ) - structure["auth"] = {"path": auth_dir, "db": 
auth_db, "store": auth_store} - - # src/db directory - db_dir = src_dir / "db" - db_dir.mkdir() - db_db = db_dir / "_index.db" - db_store = DirIndexStore(db_db) - db_store.initialize() - db_store.add_file( - name="database.py", - full_path=str(db_dir / "database.py"), - content=sample_code_files[1]["content"], - language="python", - symbols=sample_code_files[1]["symbols"], - ) - structure["db"] = {"path": db_dir, "db": db_db, "store": db_store} - - # tests directory - tests_dir = root_dir / "tests" - tests_dir.mkdir() - tests_db = tests_dir / "_index.db" - tests_store = DirIndexStore(tests_db) - tests_store.initialize() - tests_store.add_file( - name="test_auth.py", - full_path=str(tests_dir / "test_auth.py"), - content="import pytest\nfrom src.auth import authenticate\ndef test_authenticate(): assert authenticate('user', 'pass')", - language="python", - symbols=[Symbol(name="test_authenticate", kind="function", range=(3, 3))], - ) - structure["tests"] = {"path": tests_dir, "db": tests_db, "store": tests_store} - - # Link subdirectories - root_store.register_subdir(name="src", index_path=src_db) - root_store.register_subdir(name="tests", index_path=tests_db) - src_store.register_subdir(name="auth", index_path=auth_db) - src_store.register_subdir(name="db", index_path=db_db) - - # Close all stores before yielding to avoid Windows file locking issues - root_store.close() - src_store.close() - auth_store.close() - db_store.close() - tests_store.close() - - yield structure - - -# === Chain Search Engine Internal Tests === - -class TestChainSearchEngineInternals: - """Tests for ChainSearchEngine internal methods.""" - - def test_context_manager_enter_exit(self, mock_registry, mock_mapper): - """Test context manager protocol.""" - with ChainSearchEngine(mock_registry, mock_mapper) as engine: - assert engine is not None - assert isinstance(engine, ChainSearchEngine) - # Engine should be closed after exit - - def test_close_without_executor(self, mock_registry, 
mock_mapper): - """Test close() when executor was never created.""" - engine = ChainSearchEngine(mock_registry, mock_mapper) - engine.close() # Should not raise - - def test_close_with_executor(self, mock_registry, mock_mapper, temp_dir): - """Test close() properly shuts down executor.""" - # Create index - db_path = temp_dir / "_index.db" - store = DirIndexStore(db_path) - store.initialize() - store.add_file( - name="test.py", - full_path=str(temp_dir / "test.py"), - content="test content searchable", - language="python", - ) - store.close() - - mock_mapper.source_to_index_db.return_value = db_path - - engine = ChainSearchEngine(mock_registry, mock_mapper) - # Trigger executor creation - engine.search("test", temp_dir) - - # Close should work - engine.close() - assert engine._executor is None - - def test_get_executor_lazy_initialization(self, mock_registry, mock_mapper): - """Test executor is lazily initialized.""" - engine = ChainSearchEngine(mock_registry, mock_mapper) - assert engine._executor is None - - executor = engine._get_executor() - assert executor is not None - assert engine._executor is executor - - # Second call returns same instance - assert engine._get_executor() is executor - - engine.close() - - def test_get_executor_custom_workers(self, mock_registry, mock_mapper): - """Test executor with custom worker count.""" - engine = ChainSearchEngine(mock_registry, mock_mapper, max_workers=4) - executor = engine._get_executor() - assert executor is not None - engine.close() - - -class TestIndexPathCollection: - """Tests for _collect_index_paths method.""" - - def test_collect_depth_zero(self, mock_registry, mock_mapper, hierarchical_index_structure): - """Test collection with depth=0 returns only start index.""" - structure = hierarchical_index_structure - root_db = structure["root"]["db"] - - mock_mapper.source_to_index_db.return_value = root_db - - engine = ChainSearchEngine(mock_registry, mock_mapper) - paths = engine._collect_index_paths(root_db, 
depth=0) - - assert len(paths) == 1 - assert paths[0] == root_db.resolve() - engine.close() - - def test_collect_depth_one(self, mock_registry, mock_mapper, hierarchical_index_structure): - """Test collection with depth=1 returns root + immediate children.""" - structure = hierarchical_index_structure - root_db = structure["root"]["db"] - - mock_mapper.source_to_index_db.return_value = root_db - - engine = ChainSearchEngine(mock_registry, mock_mapper) - paths = engine._collect_index_paths(root_db, depth=1) - - # Should include root, src, tests (not auth/db which are depth 2) - assert len(paths) == 3 - engine.close() - - def test_collect_depth_unlimited(self, mock_registry, mock_mapper, hierarchical_index_structure): - """Test collection with depth=-1 returns all indexes.""" - structure = hierarchical_index_structure - root_db = structure["root"]["db"] - - mock_mapper.source_to_index_db.return_value = root_db - - engine = ChainSearchEngine(mock_registry, mock_mapper) - paths = engine._collect_index_paths(root_db, depth=-1) - - # Should include all 5: root, src, tests, auth, db - assert len(paths) == 5 - engine.close() - - def test_collect_avoids_duplicates(self, mock_registry, mock_mapper, hierarchical_index_structure): - """Test collection deduplicates paths.""" - structure = hierarchical_index_structure - root_db = structure["root"]["db"] - - engine = ChainSearchEngine(mock_registry, mock_mapper) - paths = engine._collect_index_paths(root_db, depth=-1) - - # All paths should be unique - path_set = set(str(p) for p in paths) - assert len(path_set) == len(paths) - engine.close() - - def test_collect_handles_missing_subdir_index(self, mock_registry, mock_mapper, temp_dir): - """Test collection handles missing subdirectory indexes gracefully.""" - # Create root with reference to non-existent subdir - root_db = temp_dir / "_index.db" - store = DirIndexStore(root_db) - store.initialize() - store.add_file( - name="test.py", - full_path=str(temp_dir / "test.py"), - 
content="test", - language="python", - ) - # Add reference to non-existent index - store.register_subdir(name="missing", index_path=temp_dir / "missing" / "_index.db") - store.close() - - engine = ChainSearchEngine(mock_registry, mock_mapper) - paths = engine._collect_index_paths(root_db, depth=-1) - - # Should only include root (missing subdir is skipped) - assert len(paths) == 1 - engine.close() - - def test_collect_skips_ignored_artifact_indexes(self, mock_registry, mock_mapper, temp_dir): - """Test collection skips dist/build-style artifact subtrees.""" - root_dir = temp_dir / "project" - root_dir.mkdir() - - root_db = root_dir / "_index.db" - root_store = DirIndexStore(root_db) - root_store.initialize() - - src_dir = root_dir / "src" - src_dir.mkdir() - src_db = src_dir / "_index.db" - src_store = DirIndexStore(src_db) - src_store.initialize() - - dist_dir = root_dir / "dist" - dist_dir.mkdir() - dist_db = dist_dir / "_index.db" - dist_store = DirIndexStore(dist_db) - dist_store.initialize() - - workflow_dir = root_dir / ".workflow" - workflow_dir.mkdir() - workflow_db = workflow_dir / "_index.db" - workflow_store = DirIndexStore(workflow_db) - workflow_store.initialize() - - root_store.register_subdir(name="src", index_path=src_db) - root_store.register_subdir(name="dist", index_path=dist_db) - root_store.register_subdir(name=".workflow", index_path=workflow_db) - - root_store.close() - src_store.close() - dist_store.close() - workflow_store.close() - - engine = ChainSearchEngine(mock_registry, mock_mapper) - paths = engine._collect_index_paths(root_db, depth=-1) - - assert {path.relative_to(root_dir).as_posix() for path in paths} == { - "_index.db", - "src/_index.db", - } - engine.close() - - -class TestResultMergeAndRank: - """Tests for _merge_and_rank method.""" - - def test_merge_deduplicates_by_path(self, mock_registry, mock_mapper): - """Test merging deduplicates results by path.""" - engine = ChainSearchEngine(mock_registry, mock_mapper) - - results = 
[ - SearchResult(path="/test/file.py", score=10.0, excerpt="match 1"), - SearchResult(path="/test/file.py", score=5.0, excerpt="match 2"), - SearchResult(path="/test/other.py", score=8.0, excerpt="match 3"), - ] - - merged = engine._merge_and_rank(results, limit=10) - - assert len(merged) == 2 - # Should keep highest score for duplicate path - file_result = next(r for r in merged if r.path == "/test/file.py") - assert file_result.score == 10.0 - engine.close() - - def test_merge_sorts_by_score_descending(self, mock_registry, mock_mapper): - """Test merged results are sorted by score descending.""" - engine = ChainSearchEngine(mock_registry, mock_mapper) - - results = [ - SearchResult(path="/test/low.py", score=1.0, excerpt=""), - SearchResult(path="/test/high.py", score=100.0, excerpt=""), - SearchResult(path="/test/mid.py", score=50.0, excerpt=""), - ] - - merged = engine._merge_and_rank(results, limit=10) - - assert merged[0].path == "/test/high.py" - assert merged[1].path == "/test/mid.py" - assert merged[2].path == "/test/low.py" - engine.close() - - def test_merge_respects_limit(self, mock_registry, mock_mapper): - """Test merge respects limit parameter.""" - engine = ChainSearchEngine(mock_registry, mock_mapper) - - results = [ - SearchResult(path=f"/test/file{i}.py", score=float(i), excerpt="") - for i in range(100) - ] - - merged = engine._merge_and_rank(results, limit=5) - - assert len(merged) == 5 - # Should be the top 5 by score - assert merged[0].score == 99.0 - engine.close() - - def test_merge_empty_results(self, mock_registry, mock_mapper): - """Test merge handles empty results.""" - engine = ChainSearchEngine(mock_registry, mock_mapper) - merged = engine._merge_and_rank([], limit=10) - assert merged == [] - engine.close() - - def test_merge_applies_test_file_penalty_for_non_test_query(self, mock_registry, mock_mapper): - """Non-test queries should lightly demote test files during merge.""" - engine = ChainSearchEngine(mock_registry, mock_mapper) - - 
results = [ - SearchResult(path="/repo/tests/test_auth.py", score=10.0, excerpt="match 1"), - SearchResult(path="/repo/src/auth.py", score=9.0, excerpt="match 2"), - ] - - merged = engine._merge_and_rank(results, limit=10, query="authenticate users") - - assert merged[0].path == "/repo/src/auth.py" - assert merged[1].metadata["path_penalty_reasons"] == ["test_file"] - engine.close() - - def test_merge_applies_generated_file_penalty_for_non_artifact_query(self, mock_registry, mock_mapper): - """Non-artifact queries should lightly demote generated/build results during merge.""" - engine = ChainSearchEngine(mock_registry, mock_mapper) - - results = [ - SearchResult(path="/repo/dist/auth.js", score=10.0, excerpt="match 1"), - SearchResult(path="/repo/src/auth.ts", score=9.0, excerpt="match 2"), - ] - - merged = engine._merge_and_rank(results, limit=10, query="authenticate users") - - assert merged[0].path == "/repo/src/auth.ts" - assert merged[1].metadata["path_penalty_reasons"] == ["generated_artifact"] - engine.close() - - -# === Hierarchical Chain Search Tests === - -class TestHierarchicalChainSearch: - """Tests for searching across directory hierarchies.""" - - def test_search_from_root(self, mock_registry, mock_mapper, hierarchical_index_structure): - """Test search starting from root finds results in all subdirectories.""" - structure = hierarchical_index_structure - root_db = structure["root"]["db"] - root_path = structure["root"]["path"] - - mock_mapper.source_to_index_db.return_value = root_db - - engine = ChainSearchEngine(mock_registry, mock_mapper) - result = engine.search("authenticate", root_path) - - # Should find authenticate in auth.py and test_auth.py - assert len(result.results) >= 1 - assert result.stats.dirs_searched == 5 # All directories - engine.close() - - def test_search_from_subdir(self, mock_registry, mock_mapper, hierarchical_index_structure): - """Test search starting from subdirectory.""" - structure = hierarchical_index_structure - 
src_db = structure["src"]["db"] - src_path = structure["src"]["path"] - - mock_mapper.source_to_index_db.return_value = src_db - - engine = ChainSearchEngine(mock_registry, mock_mapper) - result = engine.search("authenticate", src_path) - - # Should find only in src subtree (src, auth, db) - assert result.stats.dirs_searched == 3 - engine.close() - - def test_search_with_depth_limit(self, mock_registry, mock_mapper, hierarchical_index_structure): - """Test search respects depth limit.""" - structure = hierarchical_index_structure - root_db = structure["root"]["db"] - root_path = structure["root"]["path"] - - mock_mapper.source_to_index_db.return_value = root_db - - engine = ChainSearchEngine(mock_registry, mock_mapper) - options = SearchOptions(depth=1) - result = engine.search("authenticate", root_path, options) - - # Depth 1: root + immediate children (src, tests) = 3 - assert result.stats.dirs_searched == 3 - engine.close() - - def test_search_aggregates_results(self, mock_registry, mock_mapper, hierarchical_index_structure): - """Test search aggregates results from multiple directories.""" - structure = hierarchical_index_structure - root_db = structure["root"]["db"] - root_path = structure["root"]["path"] - - mock_mapper.source_to_index_db.return_value = root_db - - engine = ChainSearchEngine(mock_registry, mock_mapper) - # Search for term that appears in multiple files - result = engine.search("def", root_path) - - # Should find results from multiple files - assert len(result.results) >= 3 - engine.close() - - -# === Search Files Only Tests === - -class TestSearchFilesOnly: - """Tests for search_files_only method.""" - - def test_returns_list_of_strings(self, mock_registry, mock_mapper, temp_dir): - """Test search_files_only returns list of path strings.""" - db_path = temp_dir / "_index.db" - store = DirIndexStore(db_path) - store.initialize() - store.add_file( - name="test.py", - full_path=str(temp_dir / "test.py"), - content="searchable content here", - 
language="python", - ) - store.close() - - mock_mapper.source_to_index_db.return_value = db_path - - engine = ChainSearchEngine(mock_registry, mock_mapper) - paths = engine.search_files_only("searchable", temp_dir) - - assert isinstance(paths, list) - assert all(isinstance(p, str) for p in paths) - engine.close() - - def test_files_only_faster_than_full(self, mock_registry, mock_mapper, temp_dir): - """Test files_only search is at least as fast as full search.""" - db_path = temp_dir / "_index.db" - store = DirIndexStore(db_path) - store.initialize() - - # Add multiple files - for i in range(20): - store.add_file( - name=f"file{i}.py", - full_path=str(temp_dir / f"file{i}.py"), - content=f"searchable content number {i} with more text to index", - language="python", - ) - store.close() - - mock_mapper.source_to_index_db.return_value = db_path - - engine = ChainSearchEngine(mock_registry, mock_mapper) - - # Time files_only - start = time.perf_counter() - for _ in range(10): - engine.search_files_only("searchable", temp_dir) - files_only_time = time.perf_counter() - start - - # Time full search - start = time.perf_counter() - for _ in range(10): - engine.search("searchable", temp_dir) - full_time = time.perf_counter() - start - - # files_only should not be significantly slower - # (may not be faster due to small dataset) - assert files_only_time <= full_time * 2 - engine.close() - - -# === Symbol Search Tests === - -class TestChainSymbolSearch: - """Tests for chain symbol search.""" - - def test_symbol_search_finds_across_dirs(self, mock_registry, mock_mapper, hierarchical_index_structure): - """Test symbol search finds symbols across directories.""" - structure = hierarchical_index_structure - root_db = structure["root"]["db"] - root_path = structure["root"]["path"] - - mock_mapper.source_to_index_db.return_value = root_db - - engine = ChainSearchEngine(mock_registry, mock_mapper) - symbols = engine.search_symbols("auth", root_path) - - # Should find authenticate and 
AuthManager - assert len(symbols) >= 2 - engine.close() - - def test_symbol_search_with_kind_filter(self, mock_registry, mock_mapper, hierarchical_index_structure): - """Test symbol search with kind filter.""" - structure = hierarchical_index_structure - root_db = structure["root"]["db"] - root_path = structure["root"]["path"] - - mock_mapper.source_to_index_db.return_value = root_db - - engine = ChainSearchEngine(mock_registry, mock_mapper) - classes = engine.search_symbols("", root_path, kind="class") - - # Should find AuthManager and DatabasePool - assert all(s.kind == "class" for s in classes) - engine.close() - - def test_symbol_search_deduplicates(self, mock_registry, mock_mapper, temp_dir): - """Test symbol search deduplicates by (name, kind, range) but keeps different ranges.""" - # Create two indexes with same symbol name but different ranges - dir1 = temp_dir / "dir1" - dir1.mkdir() - db1 = dir1 / "_index.db" - store1 = DirIndexStore(db1) - store1.initialize() - store1.add_file( - name="a.py", - full_path=str(dir1 / "a.py"), - content="def foo(): pass", - language="python", - symbols=[Symbol(name="foo", kind="function", range=(1, 5))], # Different range - ) - - dir2 = temp_dir / "dir2" - dir2.mkdir() - db2 = dir2 / "_index.db" - store2 = DirIndexStore(db2) - store2.initialize() - store2.add_file( - name="b.py", - full_path=str(dir2 / "b.py"), - content="def foo(): pass\n# more code\n", - language="python", - symbols=[Symbol(name="foo", kind="function", range=(1, 10))], # Different range - ) - store2.close() - - # Register subdir after dir2 is created - store1.register_subdir(name="dir2", index_path=db2) - store1.close() - - mock_mapper.source_to_index_db.return_value = db1 - - engine = ChainSearchEngine(mock_registry, mock_mapper) - symbols = engine.search_symbols("foo", dir1) - - # Should have exactly 2 (different ranges make them unique) - assert len(symbols) == 2 - engine.close() - - -# === Search Options Tests === - -class TestSearchOptionsExtended: - 
"""Extended tests for SearchOptions.""" - - def test_include_semantic_option(self): - """Test include_semantic option.""" - options = SearchOptions(include_semantic=True) - assert options.include_semantic is True - - options_default = SearchOptions() - assert options_default.include_semantic is False - - def test_all_options_combined(self): - """Test all options set together.""" - options = SearchOptions( - depth=5, - max_workers=16, - limit_per_dir=20, - total_limit=200, - include_symbols=True, - files_only=True, - include_semantic=True, - ) - assert options.depth == 5 - assert options.max_workers == 16 - assert options.limit_per_dir == 20 - assert options.total_limit == 200 - assert options.include_symbols is True - assert options.files_only is True - assert options.include_semantic is True - - def test_options_with_zero_values(self): - """Test options with zero values.""" - options = SearchOptions( - depth=0, - max_workers=1, - limit_per_dir=1, - total_limit=1, - ) - assert options.depth == 0 - assert options.max_workers == 1 - assert options.limit_per_dir == 1 - assert options.total_limit == 1 - - -# === Quick Search Tests === - -class TestQuickSearch: - """Tests for quick_search convenience function.""" - - def test_quick_search_returns_results(self, temp_dir): - """Test quick_search returns SearchResult list.""" - # Setup: Create index at a known location - db_path = temp_dir / "_index.db" - store = DirIndexStore(db_path) - store.initialize() - store.add_file( - name="test.py", - full_path=str(temp_dir / "test.py"), - content="searchable content for quick search test", - language="python", - ) - store.close() - - # Test requires actual registry - skip if not initialized - try: - results = quick_search("searchable", temp_dir) - assert isinstance(results, list) - except Exception: - # May fail if registry not properly set up - pytest.skip("Registry not available for quick_search test") - - def test_quick_search_with_depth(self, temp_dir): - """Test quick_search 
respects depth parameter.""" - db_path = temp_dir / "_index.db" - store = DirIndexStore(db_path) - store.initialize() - store.add_file( - name="test.py", - full_path=str(temp_dir / "test.py"), - content="test content", - language="python", - ) - store.close() - - try: - results = quick_search("test", temp_dir, depth=0) - assert isinstance(results, list) - except Exception: - pytest.skip("Registry not available for quick_search test") - - -# === Edge Cases and Error Handling === - -class TestSearchErrorHandling: - """Tests for search error handling.""" - - def test_search_corrupted_index(self, mock_registry, mock_mapper, temp_dir): - """Test search handles corrupted index gracefully.""" - # Create corrupted index file - db_path = temp_dir / "_index.db" - db_path.write_text("not a valid sqlite database") - - mock_mapper.source_to_index_db.return_value = db_path - - engine = ChainSearchEngine(mock_registry, mock_mapper) - try: - result = engine.search("test", temp_dir) - # Should return empty results, not crash - assert result.results == [] - finally: - engine.close() - # Force cleanup on Windows - import gc - gc.collect() - - def test_search_empty_index(self, mock_registry, mock_mapper, temp_dir): - """Test search on empty index returns empty results.""" - db_path = temp_dir / "_index.db" - store = DirIndexStore(db_path) - store.initialize() - store.close() - - mock_mapper.source_to_index_db.return_value = db_path - - engine = ChainSearchEngine(mock_registry, mock_mapper) - result = engine.search("anything", temp_dir) - - assert result.results == [] - assert result.stats.files_matched == 0 - engine.close() - - def test_search_special_fts_characters(self, mock_registry, mock_mapper, temp_dir): - """Test search handles FTS5 special characters.""" - db_path = temp_dir / "_index.db" - store = DirIndexStore(db_path) - store.initialize() - store.add_file( - name="test.py", - full_path=str(temp_dir / "test.py"), - content="test content", - language="python", - ) - 
store.close() - - mock_mapper.source_to_index_db.return_value = db_path - - engine = ChainSearchEngine(mock_registry, mock_mapper) - - # These should not crash - special_queries = [ - "test*", - "test OR other", - '"exact phrase"', - "NOT invalid", - ] - - for query in special_queries: - result = engine.search(query, temp_dir) - assert isinstance(result.results, list) - - engine.close() - - -# === Concurrent Search Tests === - -class TestConcurrentSearch: - """Tests for concurrent search operations.""" - - def test_multiple_concurrent_searches(self, mock_registry, mock_mapper, temp_dir): - """Test multiple concurrent searches don't interfere.""" - db_path = temp_dir / "_index.db" - store = DirIndexStore(db_path) - store.initialize() - for i in range(10): - store.add_file( - name=f"file{i}.py", - full_path=str(temp_dir / f"file{i}.py"), - content=f"content{i} searchable data", - language="python", - ) - store.close() - - mock_mapper.source_to_index_db.return_value = db_path - - engine = ChainSearchEngine(mock_registry, mock_mapper) - - results = [] - errors = [] - - def search_task(query): - try: - r = engine.search(query, temp_dir) - results.append(len(r.results)) - except Exception as e: - errors.append(str(e)) - - threads = [ - threading.Thread(target=search_task, args=(f"content{i}",)) - for i in range(5) - ] - - for t in threads: - t.start() - for t in threads: - t.join() - - assert len(errors) == 0 - assert len(results) == 5 - engine.close() - - def test_search_during_close(self, mock_registry, mock_mapper, temp_dir): - """Test behavior when search happens during close.""" - db_path = temp_dir / "_index.db" - store = DirIndexStore(db_path) - store.initialize() - store.add_file( - name="test.py", - full_path=str(temp_dir / "test.py"), - content="test content", - language="python", - ) - store.close() - - mock_mapper.source_to_index_db.return_value = db_path - - engine = ChainSearchEngine(mock_registry, mock_mapper) - - # Start a search then immediately close - 
result = engine.search("test", temp_dir) - engine.close() - - # Should complete without error - assert isinstance(result.results, list) - - -# === Search Statistics Tests === - -class TestSearchStatsExtended: - """Extended tests for search statistics.""" - - def test_stats_time_is_positive(self, mock_registry, mock_mapper, temp_dir): - """Test search time is recorded and positive.""" - db_path = temp_dir / "_index.db" - store = DirIndexStore(db_path) - store.initialize() - store.add_file( - name="test.py", - full_path=str(temp_dir / "test.py"), - content="test content", - language="python", - ) - store.close() - - mock_mapper.source_to_index_db.return_value = db_path - - engine = ChainSearchEngine(mock_registry, mock_mapper) - result = engine.search("test", temp_dir) - - assert result.stats.time_ms >= 0 - engine.close() - - def test_stats_dirs_searched_accurate(self, mock_registry, mock_mapper, hierarchical_index_structure): - """Test dirs_searched count is accurate.""" - structure = hierarchical_index_structure - root_db = structure["root"]["db"] - root_path = structure["root"]["path"] - - mock_mapper.source_to_index_db.return_value = root_db - - engine = ChainSearchEngine(mock_registry, mock_mapper) - - # Depth 0 - result0 = engine.search("test", root_path, SearchOptions(depth=0)) - assert result0.stats.dirs_searched == 1 - - # Depth 1 - result1 = engine.search("test", root_path, SearchOptions(depth=1)) - assert result1.stats.dirs_searched == 3 # root + src + tests - - # Unlimited - result_all = engine.search("test", root_path, SearchOptions(depth=-1)) - assert result_all.stats.dirs_searched == 5 - - engine.close() - - def test_stats_files_matched_accurate(self, mock_registry, mock_mapper, temp_dir): - """Test files_matched count is accurate.""" - db_path = temp_dir / "_index.db" - store = DirIndexStore(db_path) - store.initialize() - - # Add files with different content - store.add_file(name="match1.py", full_path=str(temp_dir / "match1.py"), - content="findme 
keyword", language="python") - store.add_file(name="match2.py", full_path=str(temp_dir / "match2.py"), - content="findme keyword", language="python") - store.add_file(name="nomatch.py", full_path=str(temp_dir / "nomatch.py"), - content="other content", language="python") - store.close() - - mock_mapper.source_to_index_db.return_value = db_path - - engine = ChainSearchEngine(mock_registry, mock_mapper) - result = engine.search("findme", temp_dir) - - assert result.stats.files_matched == 2 - engine.close() - - -# === Boundary Condition Tests === - -class TestBoundaryConditions: - """Tests for boundary conditions.""" - - def test_search_with_max_workers_one(self, mock_registry, mock_mapper, temp_dir): - """Test search with single worker.""" - db_path = temp_dir / "_index.db" - store = DirIndexStore(db_path) - store.initialize() - store.add_file(name="test.py", full_path=str(temp_dir / "test.py"), - content="test content", language="python") - store.close() - - mock_mapper.source_to_index_db.return_value = db_path - - engine = ChainSearchEngine(mock_registry, mock_mapper, max_workers=1) - result = engine.search("test", temp_dir, SearchOptions(max_workers=1)) - - assert isinstance(result.results, list) - engine.close() - - def test_search_with_limit_one(self, mock_registry, mock_mapper, temp_dir): - """Test search with limit=1.""" - db_path = temp_dir / "_index.db" - store = DirIndexStore(db_path) - store.initialize() - for i in range(10): - store.add_file(name=f"file{i}.py", full_path=str(temp_dir / f"file{i}.py"), - content="searchable content", language="python") - store.close() - - mock_mapper.source_to_index_db.return_value = db_path - - engine = ChainSearchEngine(mock_registry, mock_mapper) - result = engine.search("searchable", temp_dir, SearchOptions(total_limit=1)) - - assert len(result.results) <= 1 - engine.close() - - def test_search_very_long_query(self, mock_registry, mock_mapper, temp_dir): - """Test search with very long query.""" - db_path = temp_dir / 
"_index.db" - store = DirIndexStore(db_path) - store.initialize() - store.add_file(name="test.py", full_path=str(temp_dir / "test.py"), - content="test content", language="python") - store.close() - - mock_mapper.source_to_index_db.return_value = db_path - - engine = ChainSearchEngine(mock_registry, mock_mapper) - - # Very long query - long_query = " ".join(["word"] * 100) - result = engine.search(long_query, temp_dir) - - # Should not crash - assert isinstance(result.results, list) - engine.close() - - def test_search_unicode_query(self, mock_registry, mock_mapper, temp_dir): - """Test search with unicode query does not crash.""" - db_path = temp_dir / "_index.db" - store = DirIndexStore(db_path) - store.initialize() - store.add_file( - name="unicode.py", - full_path=str(temp_dir / "unicode.py"), - content="# Chinese comment\ndef hello(): return 'hello world'", - language="python", - ) - store.close() - - mock_mapper.source_to_index_db.return_value = db_path - - engine = ChainSearchEngine(mock_registry, mock_mapper) - # Unicode query should not crash (may or may not find results depending on FTS5 tokenizer) - result = engine.search("hello", temp_dir) - - assert isinstance(result.results, list) - assert len(result.results) >= 1 - engine.close() - - def test_search_empty_directory(self, mock_registry, mock_mapper, temp_dir): - """Test search in directory with no files.""" - db_path = temp_dir / "_index.db" - store = DirIndexStore(db_path) - store.initialize() - # Don't add any files - store.close() - - mock_mapper.source_to_index_db.return_value = db_path - - engine = ChainSearchEngine(mock_registry, mock_mapper) - result = engine.search("anything", temp_dir) - - assert result.results == [] - assert result.stats.files_matched == 0 - engine.close() - - -# === Include Symbols Option Tests === - -class TestIncludeSymbolsOption: - """Tests for include_symbols search option.""" - - def test_search_with_include_symbols(self, mock_registry, mock_mapper, temp_dir): - 
"""Test search returns symbols when include_symbols=True.""" - db_path = temp_dir / "_index.db" - store = DirIndexStore(db_path) - store.initialize() - store.add_file( - name="test.py", - full_path=str(temp_dir / "test.py"), - content="def my_function(): pass", - language="python", - symbols=[Symbol(name="my_function", kind="function", range=(1, 1))], - ) - store.close() - - mock_mapper.source_to_index_db.return_value = db_path - - engine = ChainSearchEngine(mock_registry, mock_mapper) - - # Without include_symbols - result_no_symbols = engine.search("function", temp_dir, SearchOptions(include_symbols=False)) - assert result_no_symbols.symbols == [] - - # With include_symbols - result_with_symbols = engine.search("function", temp_dir, SearchOptions(include_symbols=True)) - # Symbols list populated (may or may not match depending on implementation) - assert isinstance(result_with_symbols.symbols, list) - - engine.close() - - -# === ChainSearchResult Tests === - -class TestChainSearchResultExtended: - """Extended tests for ChainSearchResult dataclass.""" - - def test_result_immutability(self): - """Test ChainSearchResult fields.""" - stats = SearchStats(dirs_searched=5, files_matched=10, time_ms=100.5) - results = [SearchResult(path="/test.py", score=1.0, excerpt="test")] - symbols = [Symbol(name="foo", kind="function", range=(1, 5))] - - result = ChainSearchResult( - query="test query", - results=results, - symbols=symbols, - stats=stats, - ) - - assert result.query == "test query" - assert len(result.results) == 1 - assert len(result.symbols) == 1 - assert result.related_results == [] - assert result.stats.dirs_searched == 5 - - def test_result_with_empty_collections(self): - """Test ChainSearchResult with empty results and symbols.""" - result = ChainSearchResult( - query="no matches", - results=[], - symbols=[], - stats=SearchStats(), - ) - - assert result.query == "no matches" - assert result.results == [] - assert result.related_results == [] - assert 
result.symbols == [] - assert result.stats.dirs_searched == 0 diff --git a/codex-lens/tests/test_search_performance.py b/codex-lens/tests/test_search_performance.py deleted file mode 100644 index 5460efb5..00000000 --- a/codex-lens/tests/test_search_performance.py +++ /dev/null @@ -1,660 +0,0 @@ -"""Performance benchmarks for CodexLens search functionality. - -Measures: -- FTS5 search speed at various scales -- Chain search traversal performance -- Semantic search latency -- Memory usage during search operations -""" - -import gc -import sys -import tempfile -import time -from pathlib import Path -from typing import List, Tuple -from dataclasses import dataclass -from contextlib import contextmanager - -import pytest - -from codexlens.storage.sqlite_store import SQLiteStore -from codexlens.storage.dir_index import DirIndexStore -from codexlens.storage.registry import RegistryStore -from codexlens.storage.path_mapper import PathMapper -from codexlens.search import ChainSearchEngine, SearchOptions -from codexlens.entities import IndexedFile, Symbol - - -@dataclass -class BenchmarkResult: - """Benchmark result container.""" - name: str - iterations: int - total_time_ms: float - avg_time_ms: float - min_time_ms: float - max_time_ms: float - ops_per_sec: float - - def __str__(self): - return ( - f"{self.name}:\n" - f" Iterations: {self.iterations}\n" - f" Total: {self.total_time_ms:.2f}ms\n" - f" Avg: {self.avg_time_ms:.2f}ms\n" - f" Min: {self.min_time_ms:.2f}ms\n" - f" Max: {self.max_time_ms:.2f}ms\n" - f" Ops/sec: {self.ops_per_sec:.1f}" - ) - - -def benchmark(func, iterations=10, warmup=2): - """Run benchmark with warmup iterations.""" - # Warmup - for _ in range(warmup): - func() - - # Measure - times = [] - for _ in range(iterations): - gc.collect() - start = time.perf_counter() - func() - elapsed = (time.perf_counter() - start) * 1000 - times.append(elapsed) - - total = sum(times) - return BenchmarkResult( - name=func.__name__ if hasattr(func, '__name__') else 
'benchmark', - iterations=iterations, - total_time_ms=total, - avg_time_ms=total / iterations, - min_time_ms=min(times), - max_time_ms=max(times), - ops_per_sec=1000 / (total / iterations) if total > 0 else 0 - ) - - -@contextmanager -def timer(name: str): - """Context manager for timing code blocks.""" - start = time.perf_counter() - yield - elapsed = (time.perf_counter() - start) * 1000 - print(f" {name}: {elapsed:.2f}ms") - - -# === Test Fixtures === - -@pytest.fixture(scope="module") -def temp_dir(): - """Create a temporary directory for all tests.""" - tmpdir = tempfile.TemporaryDirectory(ignore_cleanup_errors=True) - yield Path(tmpdir.name) - # Explicit cleanup with error handling for Windows file locking - try: - tmpdir.cleanup() - except (PermissionError, OSError): - pass # Ignore Windows file locking errors - - -def generate_code_file(index: int, lines: int = 100) -> Tuple[IndexedFile, str]: - """Generate a synthetic code file for testing.""" - symbols = [ - Symbol(name=f"function_{index}_{i}", kind="function", range=(i*10+1, i*10+9)) - for i in range(lines // 10) - ] - - content_lines = [] - for i in range(lines): - if i % 10 == 0: - content_lines.append(f"def function_{index}_{i//10}(param_{i}, data_{i}):") - else: - content_lines.append(f" # Line {i}: processing data with param_{i % 5}") - content_lines.append(f" result_{i} = compute(data_{i})") - - return ( - IndexedFile( - path=f"/project/src/module_{index}/file_{index}.py", - language="python", - symbols=symbols, - ), - "\n".join(content_lines) - ) - - -@pytest.fixture(scope="module") -def small_store(temp_dir): - """Small store with 10 files (~100 lines each).""" - db_path = temp_dir / "small_index.db" - store = SQLiteStore(db_path) - store.initialize() - - for i in range(10): - indexed_file, content = generate_code_file(i, lines=100) - store.add_file(indexed_file, content) - - yield store - store.close() - - -@pytest.fixture(scope="module") -def medium_store(temp_dir): - """Medium store with 100 
files (~100 lines each).""" - db_path = temp_dir / "medium_index.db" - store = SQLiteStore(db_path) - store.initialize() - - for i in range(100): - indexed_file, content = generate_code_file(i, lines=100) - store.add_file(indexed_file, content) - - yield store - store.close() - - -@pytest.fixture(scope="module") -def large_store(temp_dir): - """Large store with 500 files (~200 lines each).""" - db_path = temp_dir / "large_index.db" - store = SQLiteStore(db_path) - store.initialize() - - for i in range(500): - indexed_file, content = generate_code_file(i, lines=200) - store.add_file(indexed_file, content) - - yield store - store.close() - - -# === FTS5 Performance Tests === - -class TestFTS5Performance: - """FTS5 search performance benchmarks.""" - - def test_small_store_search(self, small_store): - """Benchmark FTS5 search on small store (10 files).""" - print("\n" + "="*60) - print("FTS5 SEARCH - SMALL STORE (10 files)") - print("="*60) - - queries = ["function", "data", "compute", "result", "param"] - - for query in queries: - result = benchmark( - lambda q=query: small_store.search_fts(q, limit=20), - iterations=50 - ) - result.name = f"search '{query}'" - print(f"\n{result}") - - def test_medium_store_search(self, medium_store): - """Benchmark FTS5 search on medium store (100 files).""" - print("\n" + "="*60) - print("FTS5 SEARCH - MEDIUM STORE (100 files)") - print("="*60) - - queries = ["function", "data", "compute", "result", "param"] - - for query in queries: - result = benchmark( - lambda q=query: medium_store.search_fts(q, limit=20), - iterations=30 - ) - result.name = f"search '{query}'" - print(f"\n{result}") - - def test_large_store_search(self, large_store): - """Benchmark FTS5 search on large store (500 files).""" - print("\n" + "="*60) - print("FTS5 SEARCH - LARGE STORE (500 files)") - print("="*60) - - queries = ["function", "data", "compute", "result", "param"] - - for query in queries: - result = benchmark( - lambda q=query: 
large_store.search_fts(q, limit=20), - iterations=20 - ) - result.name = f"search '{query}'" - print(f"\n{result}") - - def test_search_limit_scaling(self, medium_store): - """Test how search time scales with result limit.""" - print("\n" + "="*60) - print("FTS5 SEARCH - LIMIT SCALING") - print("="*60) - - limits = [5, 10, 20, 50, 100, 200] - - for limit in limits: - result = benchmark( - lambda l=limit: medium_store.search_fts("function", limit=l), - iterations=20 - ) - result.name = f"limit={limit}" - print(f"\n{result}") - - def test_complex_query_performance(self, medium_store): - """Test performance of complex FTS5 queries.""" - print("\n" + "="*60) - print("FTS5 SEARCH - COMPLEX QUERIES") - print("="*60) - - queries = [ - ("single term", "function"), - ("two terms", "function data"), - ("phrase", '"def function"'), - ("OR query", "function OR result"), - ("wildcard", "func*"), - ("NOT query", "function NOT data"), - ] - - for name, query in queries: - result = benchmark( - lambda q=query: medium_store.search_fts(q, limit=20), - iterations=20 - ) - result.name = name - print(f"\n{result}") - - -class TestSymbolSearchPerformance: - """Symbol search performance benchmarks.""" - - def test_symbol_search_scaling(self, small_store, medium_store, large_store): - """Test symbol search performance at different scales.""" - print("\n" + "="*60) - print("SYMBOL SEARCH - SCALING") - print("="*60) - - stores = [ - ("small (10 files)", small_store), - ("medium (100 files)", medium_store), - ("large (500 files)", large_store), - ] - - for name, store in stores: - result = benchmark( - lambda s=store: s.search_symbols("function", limit=50), - iterations=20 - ) - result.name = name - print(f"\n{result}") - - def test_symbol_search_with_kind_filter(self, medium_store): - """Test symbol search with kind filtering.""" - print("\n" + "="*60) - print("SYMBOL SEARCH - KIND FILTER") - print("="*60) - - # Without filter - result_no_filter = benchmark( - lambda: 
medium_store.search_symbols("function", limit=50), - iterations=20 - ) - result_no_filter.name = "no filter" - print(f"\n{result_no_filter}") - - # With filter - result_with_filter = benchmark( - lambda: medium_store.search_symbols("function", kind="function", limit=50), - iterations=20 - ) - result_with_filter.name = "kind=function" - print(f"\n{result_with_filter}") - - -# === Chain Search Performance Tests === - -class TestChainSearchPerformance: - """Chain search engine performance benchmarks.""" - - @pytest.fixture - def chain_engine_setup(self, temp_dir): - """Setup chain search engine with directory hierarchy.""" - # Create directory hierarchy - root = temp_dir / "project" - root.mkdir(exist_ok=True) - - registry = RegistryStore(temp_dir / "registry.db") - registry.initialize() - mapper = PathMapper(temp_dir / "indexes") - - # Create indexes at different depths - dirs = [ - root, - root / "src", - root / "src" / "core", - root / "src" / "utils", - root / "tests", - ] - - for i, dir_path in enumerate(dirs): - dir_path.mkdir(exist_ok=True) - index_path = mapper.source_to_index_db(dir_path) - index_path.parent.mkdir(parents=True, exist_ok=True) - - store = DirIndexStore(index_path) - store.initialize() - for j in range(20): # 20 files per directory - indexed_file, content = generate_code_file(i * 100 + j, lines=50) - file_path = str(dir_path / f"file_{j}.py") - store.add_file( - name=f"file_{j}.py", - full_path=file_path, - content=content, - language="python", - symbols=indexed_file.symbols, - ) - store.close() - - # Register directory - project = registry.register_project(root, mapper.source_to_index_dir(root)) - registry.register_dir(project.id, dir_path, index_path, i, 20) - - engine = ChainSearchEngine(registry, mapper) - - yield { - "engine": engine, - "registry": registry, - "root": root, - } - - registry.close() - - def test_chain_search_depth(self, chain_engine_setup): - """Test chain search at different depths.""" - print("\n" + "="*60) - print("CHAIN 
SEARCH - DEPTH VARIATION") - print("="*60) - - engine = chain_engine_setup["engine"] - root = chain_engine_setup["root"] - - depths = [0, 1, 2, -1] # -1 = unlimited - - for depth in depths: - options = SearchOptions(depth=depth, max_workers=4, total_limit=50) - result = benchmark( - lambda d=depth, o=options: engine.search("function", root, o), - iterations=10 - ) - result.name = f"depth={depth}" - print(f"\n{result}") - - def test_chain_search_parallelism(self, chain_engine_setup): - """Test chain search with different worker counts.""" - print("\n" + "="*60) - print("CHAIN SEARCH - PARALLELISM") - print("="*60) - - engine = chain_engine_setup["engine"] - root = chain_engine_setup["root"] - - worker_counts = [1, 2, 4, 8] - - for workers in worker_counts: - options = SearchOptions(depth=-1, max_workers=workers, total_limit=50) - result = benchmark( - lambda w=workers, o=options: engine.search("function", root, o), - iterations=10 - ) - result.name = f"workers={workers}" - print(f"\n{result}") - - -# === Semantic Search Performance Tests === - -class TestSemanticSearchPerformance: - """Semantic search performance benchmarks.""" - - @pytest.fixture - def semantic_setup(self, temp_dir): - """Setup semantic search with embeddings.""" - try: - from codexlens.semantic import SEMANTIC_AVAILABLE - if not SEMANTIC_AVAILABLE: - pytest.skip("Semantic search dependencies not installed") - - from codexlens.semantic.embedder import Embedder - from codexlens.semantic.vector_store import VectorStore - from codexlens.entities import SemanticChunk - - embedder = Embedder() - db_path = temp_dir / "semantic.db" - vector_store = VectorStore(db_path) - - # Add test chunks - code_samples = [ - "def authenticate_user(username, password): verify user credentials", - "class DatabaseConnection: manage database connections with pooling", - "async def fetch_api_data(url): make HTTP request and return JSON", - "function renderComponent(props): render React UI component", - "def 
process_data(input): transform and validate input data", - ] * 50 # 250 chunks - - for i, content in enumerate(code_samples): - chunk = SemanticChunk( - content=content, - metadata={"index": i, "language": "python"} - ) - chunk.embedding = embedder.embed_single(content) - vector_store.add_chunk(chunk, f"/test/file_{i}.py") - - yield { - "embedder": embedder, - "vector_store": vector_store, - } - - # Clean up vector store cache - vector_store.clear_cache() - - except ImportError: - pytest.skip("Semantic search dependencies not installed") - - def test_embedding_generation_speed(self, semantic_setup): - """Benchmark embedding generation speed.""" - print("\n" + "="*60) - print("SEMANTIC SEARCH - EMBEDDING GENERATION") - print("="*60) - - embedder = semantic_setup["embedder"] - - # Single embedding - result = benchmark( - lambda: embedder.embed_single("def example_function(): return 42"), - iterations=50 - ) - result.name = "single embedding" - print(f"\n{result}") - - # Batch embedding - texts = ["def func{}(): return {}".format(i, i) for i in range(10)] - result = benchmark( - lambda: embedder.embed(texts), - iterations=20 - ) - result.name = "batch embedding (10 texts)" - print(f"\n{result}") - - def test_vector_search_speed(self, semantic_setup): - """Benchmark vector similarity search speed.""" - print("\n" + "="*60) - print("SEMANTIC SEARCH - VECTOR SEARCH") - print("="*60) - - embedder = semantic_setup["embedder"] - vector_store = semantic_setup["vector_store"] - - query_embedding = embedder.embed_single("user authentication login") - - # Different top_k values - for top_k in [5, 10, 20, 50]: - result = benchmark( - lambda k=top_k: vector_store.search_similar(query_embedding, top_k=k), - iterations=30 - ) - result.name = f"top_k={top_k}" - print(f"\n{result}") - - def test_full_semantic_search_latency(self, semantic_setup): - """Benchmark full semantic search (embed + search).""" - print("\n" + "="*60) - print("SEMANTIC SEARCH - FULL LATENCY") - print("="*60) - 
- embedder = semantic_setup["embedder"] - vector_store = semantic_setup["vector_store"] - - queries = [ - "user authentication", - "database connection", - "API request handler", - "React component", - "data processing", - ] - - for query in queries: - def full_search(q=query): - embedding = embedder.embed_single(q) - return vector_store.search_similar(embedding, top_k=10) - - result = benchmark(full_search, iterations=20) - result.name = f"'{query}'" - print(f"\n{result}") - - -# === Comparative Benchmarks === - -class TestComparativeBenchmarks: - """Compare FTS5 vs Semantic search performance.""" - - @pytest.fixture - def comparison_setup(self, temp_dir): - """Setup both FTS5 and semantic stores with same content.""" - # FTS5 store - fts_store = SQLiteStore(temp_dir / "fts_compare.db") - fts_store.initialize() - - code_samples = [ - ("auth.py", "def authenticate_user(username, password): verify credentials"), - ("db.py", "class DatabasePool: manage database connection pooling"), - ("api.py", "async def handle_request(req): process API request"), - ("ui.py", "function Button({ onClick }): render button component"), - ("utils.py", "def process_data(input): transform and validate data"), - ] * 20 - - for i, (filename, content) in enumerate(code_samples): - indexed_file = IndexedFile( - path=f"/project/{filename.replace('.py', '')}_{i}.py", - language="python", - symbols=[Symbol(name=f"func_{i}", kind="function", range=(1, 5))], - ) - fts_store.add_file(indexed_file, content) - - # Semantic store (if available) - try: - from codexlens.semantic import SEMANTIC_AVAILABLE - if SEMANTIC_AVAILABLE: - from codexlens.semantic.embedder import Embedder - from codexlens.semantic.vector_store import VectorStore - from codexlens.entities import SemanticChunk - - embedder = Embedder() - semantic_store = VectorStore(temp_dir / "semantic_compare.db") - - for i, (filename, content) in enumerate(code_samples): - chunk = SemanticChunk(content=content, metadata={"index": i}) - 
chunk.embedding = embedder.embed_single(content) - semantic_store.add_chunk(chunk, f"/project/{filename}") - - yield { - "fts_store": fts_store, - "semantic_store": semantic_store, - "embedder": embedder, - "has_semantic": True, - } - # Close semantic store connection - semantic_store.clear_cache() - else: - yield {"fts_store": fts_store, "has_semantic": False} - except ImportError: - yield {"fts_store": fts_store, "has_semantic": False} - - fts_store.close() - - def test_fts_vs_semantic_latency(self, comparison_setup): - """Compare FTS5 vs Semantic search latency.""" - print("\n" + "="*60) - print("FTS5 vs SEMANTIC - LATENCY COMPARISON") - print("="*60) - - fts_store = comparison_setup["fts_store"] - - queries = [ - "authenticate", - "database", - "request", - "button", - "process", - ] - - print("\nFTS5 Search:") - for query in queries: - result = benchmark( - lambda q=query: fts_store.search_fts(q, limit=10), - iterations=30 - ) - result.name = f"'{query}'" - print(f" {result.name}: avg={result.avg_time_ms:.2f}ms") - - if comparison_setup.get("has_semantic"): - semantic_store = comparison_setup["semantic_store"] - embedder = comparison_setup["embedder"] - - print("\nSemantic Search (embed + search):") - for query in queries: - def semantic_search(q=query): - emb = embedder.embed_single(q) - return semantic_store.search_similar(emb, top_k=10) - - result = benchmark(semantic_search, iterations=20) - result.name = f"'{query}'" - print(f" {result.name}: avg={result.avg_time_ms:.2f}ms") - else: - print("\n(Semantic search not available)") - - -# === Memory Usage Tests === - -class TestMemoryUsage: - """Memory usage during search operations.""" - - def test_search_memory_footprint(self, medium_store): - """Measure memory footprint during search.""" - print("\n" + "="*60) - print("MEMORY USAGE - SEARCH OPERATIONS") - print("="*60) - - import tracemalloc - - tracemalloc.start() - - # Run multiple searches - for _ in range(100): - medium_store.search_fts("function", 
limit=20) - - current, peak = tracemalloc.get_traced_memory() - tracemalloc.stop() - - print(f"\nAfter 100 FTS5 searches:") - print(f" Current memory: {current / 1024 / 1024:.2f} MB") - print(f" Peak memory: {peak / 1024 / 1024:.2f} MB") - - -if __name__ == "__main__": - pytest.main([__file__, "-v", "-s", "--tb=short"]) diff --git a/codex-lens/tests/test_semantic.py b/codex-lens/tests/test_semantic.py deleted file mode 100644 index 3470dbd4..00000000 --- a/codex-lens/tests/test_semantic.py +++ /dev/null @@ -1,290 +0,0 @@ -"""Tests for CodexLens semantic module.""" - -import pytest - -from codexlens.entities import SemanticChunk, Symbol -from codexlens.semantic.chunker import ChunkConfig, Chunker - - -class TestChunkConfig: - """Tests for ChunkConfig.""" - - def test_default_config(self): - """Test default configuration values.""" - config = ChunkConfig() - assert config.max_chunk_size == 1000 - assert config.overlap == 200 - assert config.min_chunk_size == 50 - - def test_custom_config(self): - """Test custom configuration.""" - config = ChunkConfig(max_chunk_size=2000, overlap=200, min_chunk_size=100) - assert config.max_chunk_size == 2000 - assert config.overlap == 200 - assert config.min_chunk_size == 100 - - -class TestChunker: - """Tests for Chunker class.""" - - def test_chunker_default_config(self): - """Test chunker with default config.""" - chunker = Chunker() - assert chunker.config.max_chunk_size == 1000 - - def test_chunker_custom_config(self): - """Test chunker with custom config.""" - config = ChunkConfig(max_chunk_size=500) - chunker = Chunker(config=config) - assert chunker.config.max_chunk_size == 500 - - -class TestChunkBySymbol: - """Tests for symbol-based chunking.""" - - def test_chunk_single_function(self): - """Test chunking a single function.""" - # Use config with smaller min_chunk_size - config = ChunkConfig(min_chunk_size=10) - chunker = Chunker(config=config) - content = "def hello():\n print('hello')\n return True\n" - symbols = 
[Symbol(name="hello", kind="function", range=(1, 3))] - - chunks = chunker.chunk_by_symbol(content, symbols, "test.py", "python") - - assert len(chunks) == 1 - assert "def hello():" in chunks[0].content - assert chunks[0].metadata["symbol_name"] == "hello" - assert chunks[0].metadata["symbol_kind"] == "function" - assert chunks[0].metadata["file"] == "test.py" - assert chunks[0].metadata["language"] == "python" - assert chunks[0].metadata["strategy"] == "symbol" - - def test_chunk_multiple_symbols(self): - """Test chunking multiple symbols.""" - # Use config with smaller min_chunk_size - config = ChunkConfig(min_chunk_size=5) - chunker = Chunker(config=config) - content = """def foo(): - pass - -def bar(): - pass - -class MyClass: - pass -""" - symbols = [ - Symbol(name="foo", kind="function", range=(1, 2)), - Symbol(name="bar", kind="function", range=(4, 5)), - Symbol(name="MyClass", kind="class", range=(7, 8)), - ] - - chunks = chunker.chunk_by_symbol(content, symbols, "test.py", "python") - - assert len(chunks) == 3 - names = [c.metadata["symbol_name"] for c in chunks] - assert "foo" in names - assert "bar" in names - assert "MyClass" in names - - def test_chunk_skips_small_content(self): - """Test that chunks smaller than min_chunk_size are skipped.""" - config = ChunkConfig(min_chunk_size=100) - chunker = Chunker(config=config) - content = "def x():\n pass\n" - symbols = [Symbol(name="x", kind="function", range=(1, 2))] - - chunks = chunker.chunk_by_symbol(content, symbols, "test.py", "python") - - assert len(chunks) == 0 # Content is too small - - def test_chunk_preserves_line_numbers(self): - """Test that chunks preserve correct line numbers.""" - config = ChunkConfig(min_chunk_size=5) - chunker = Chunker(config=config) - content = "# comment\ndef hello():\n pass\n" - symbols = [Symbol(name="hello", kind="function", range=(2, 3))] - - chunks = chunker.chunk_by_symbol(content, symbols, "test.py", "python") - - assert len(chunks) == 1 - assert 
chunks[0].metadata["start_line"] == 2 - assert chunks[0].metadata["end_line"] == 3 - - def test_chunk_handles_empty_symbols(self): - """Test chunking with empty symbols list.""" - chunker = Chunker() - content = "# just a comment" - symbols = [] - - chunks = chunker.chunk_by_symbol(content, symbols, "test.py", "python") - - assert len(chunks) == 0 - - -class TestChunkSlidingWindow: - """Tests for sliding window chunking.""" - - def test_sliding_window_basic(self): - """Test basic sliding window chunking.""" - config = ChunkConfig(max_chunk_size=100, overlap=20, min_chunk_size=10) - chunker = Chunker(config=config) - - # Create content with multiple lines - lines = [f"line {i} content here\n" for i in range(20)] - content = "".join(lines) - - chunks = chunker.chunk_sliding_window(content, "test.py", "python") - - assert len(chunks) > 0 - for chunk in chunks: - assert chunk.metadata["strategy"] == "sliding_window" - assert chunk.metadata["file"] == "test.py" - assert chunk.metadata["language"] == "python" - - def test_sliding_window_empty_content(self): - """Test sliding window with empty content.""" - chunker = Chunker() - chunks = chunker.chunk_sliding_window("", "test.py", "python") - assert len(chunks) == 0 - - def test_sliding_window_small_content(self): - """Test sliding window with content smaller than chunk size.""" - config = ChunkConfig(max_chunk_size=1000, min_chunk_size=10) - chunker = Chunker(config=config) - content = "small content here" - - chunks = chunker.chunk_sliding_window(content, "test.py", "python") - - # Small content should produce one chunk - assert len(chunks) <= 1 - - def test_sliding_window_chunk_indices(self): - """Test that chunk indices are sequential.""" - config = ChunkConfig(max_chunk_size=50, overlap=10, min_chunk_size=5) - chunker = Chunker(config=config) - lines = [f"line {i}\n" for i in range(50)] - content = "".join(lines) - - chunks = chunker.chunk_sliding_window(content, "test.py", "python") - - if len(chunks) > 1: - indices 
= [c.metadata["chunk_index"] for c in chunks] - assert indices == list(range(len(chunks))) - - -class TestChunkFile: - """Tests for chunk_file method.""" - - def test_chunk_file_with_symbols(self): - """Test chunk_file uses symbol-based chunking when symbols available.""" - chunker = Chunker() - content = "def hello():\n print('world')\n return 42\n" - symbols = [Symbol(name="hello", kind="function", range=(1, 3))] - - chunks = chunker.chunk_file(content, symbols, "test.py", "python") - - assert all(c.metadata["strategy"] == "symbol" for c in chunks) - - def test_chunk_file_without_symbols(self): - """Test chunk_file uses sliding window when no symbols.""" - config = ChunkConfig(min_chunk_size=5) - chunker = Chunker(config=config) - content = "# just comments\n# more comments\n# even more\n" - - chunks = chunker.chunk_file(content, [], "test.py", "python") - - # Should use sliding window strategy - if len(chunks) > 0: - assert all(c.metadata["strategy"] == "sliding_window" for c in chunks) - - -class TestChunkMetadata: - """Tests for chunk metadata.""" - - def test_symbol_chunk_metadata_complete(self): - """Test that symbol chunks have complete metadata.""" - config = ChunkConfig(min_chunk_size=10) - chunker = Chunker(config=config) - content = "class MyClass:\n def method(self):\n pass\n" - symbols = [Symbol(name="MyClass", kind="class", range=(1, 3))] - - chunks = chunker.chunk_by_symbol(content, symbols, "/path/to/file.py", "python") - - assert len(chunks) == 1 - meta = chunks[0].metadata - assert meta["file"] == "/path/to/file.py" - assert meta["language"] == "python" - assert meta["symbol_name"] == "MyClass" - assert meta["symbol_kind"] == "class" - assert meta["start_line"] == 1 - assert meta["end_line"] == 3 - assert meta["strategy"] == "symbol" - - def test_sliding_window_metadata_complete(self): - """Test that sliding window chunks have complete metadata.""" - config = ChunkConfig(min_chunk_size=5) - chunker = Chunker(config=config) - content = "some 
content here\nmore content\n" - - chunks = chunker.chunk_sliding_window(content, "/path/file.js", "javascript") - - if len(chunks) > 0: - meta = chunks[0].metadata - assert meta["file"] == "/path/file.js" - assert meta["language"] == "javascript" - assert "chunk_index" in meta - assert "start_line" in meta - assert "end_line" in meta - assert meta["strategy"] == "sliding_window" - - -class TestChunkEdgeCases: - """Edge case tests for chunking.""" - - def test_chunk_with_unicode(self): - """Test chunking content with unicode characters.""" - config = ChunkConfig(min_chunk_size=10) - chunker = Chunker(config=config) - content = "def 你好():\n print('世界')\n return '🎉'\n" - symbols = [Symbol(name="你好", kind="function", range=(1, 3))] - - chunks = chunker.chunk_by_symbol(content, symbols, "test.py", "python") - - assert len(chunks) == 1 - assert "你好" in chunks[0].content - - def test_chunk_with_windows_line_endings(self): - """Test chunking with Windows-style line endings.""" - chunker = Chunker() - content = "def hello():\r\n pass\r\n" - symbols = [Symbol(name="hello", kind="function", range=(1, 2))] - - chunks = chunker.chunk_by_symbol(content, symbols, "test.py", "python") - - # Should handle without errors - assert len(chunks) <= 1 - - def test_chunk_range_out_of_bounds(self): - """Test chunking when symbol range exceeds content.""" - chunker = Chunker() - content = "def hello():\n pass\n" - # Symbol range goes beyond content - symbols = [Symbol(name="hello", kind="function", range=(1, 100))] - - # Should not crash, just handle gracefully - chunks = chunker.chunk_by_symbol(content, symbols, "test.py", "python") - assert len(chunks) <= 1 - - def test_chunk_content_returned_as_semantic_chunk(self): - """Test that returned chunks are SemanticChunk instances.""" - chunker = Chunker() - content = "def test():\n return True\n" - symbols = [Symbol(name="test", kind="function", range=(1, 2))] - - chunks = chunker.chunk_by_symbol(content, symbols, "test.py", "python") - - for 
chunk in chunks: - assert isinstance(chunk, SemanticChunk) - assert chunk.embedding is None # Not embedded yet diff --git a/codex-lens/tests/test_semantic_search.py b/codex-lens/tests/test_semantic_search.py deleted file mode 100644 index 2bb781c7..00000000 --- a/codex-lens/tests/test_semantic_search.py +++ /dev/null @@ -1,804 +0,0 @@ -"""Comprehensive tests for semantic search functionality. - -Tests embedding generation, vector storage, and semantic similarity search -across complex codebases with various file types and content patterns. -""" - -import json -import os -import shutil -import tempfile -import time -from pathlib import Path -from typing import List, Dict, Any - -import pytest - -from codexlens.entities import SemanticChunk, Symbol -from codexlens.semantic import SEMANTIC_AVAILABLE, SEMANTIC_BACKEND, check_semantic_available - -# Skip all tests if semantic search not available -pytestmark = pytest.mark.skipif( - not SEMANTIC_AVAILABLE, - reason="Semantic search dependencies not installed" -) - - -class TestEmbedderPerformance: - """Test Embedder performance and quality.""" - - @pytest.fixture - def embedder(self): - """Create embedder instance.""" - from codexlens.semantic.embedder import Embedder - return Embedder() - - def test_single_embedding(self, embedder): - """Test single text embedding.""" - text = "def calculate_sum(a, b): return a + b" - - start = time.time() - embedding = embedder.embed_single(text) - elapsed = time.time() - start - - assert len(embedding) == 384, "Embedding dimension should be 384" - assert all(isinstance(x, float) for x in embedding) - print(f"\nSingle embedding time: {elapsed*1000:.2f}ms") - - def test_batch_embedding_performance(self, embedder): - """Test batch embedding performance.""" - texts = [ - "def hello(): print('world')", - "class Calculator: def add(self, a, b): return a + b", - "async def fetch_data(url): return await client.get(url)", - "const processData = (data) => data.map(x => x * 2)", - "function 
initializeApp() { console.log('Starting...'); }", - ] * 10 # 50 texts total - - start = time.time() - embeddings = embedder.embed(texts) - elapsed = time.time() - start - - assert len(embeddings) == len(texts) - print(f"\nBatch embedding ({len(texts)} texts): {elapsed*1000:.2f}ms") - print(f"Per-text average: {elapsed/len(texts)*1000:.2f}ms") - - def test_embedding_similarity(self, embedder): - """Test that similar code has similar embeddings.""" - from codexlens.semantic.vector_store import _cosine_similarity - - # Similar functions (should have high similarity) - code1 = "def add(a, b): return a + b" - code2 = "def sum_numbers(x, y): return x + y" - - # Different function (should have lower similarity) - code3 = "class UserAuthentication: def login(self, user, password): pass" - - emb1 = embedder.embed_single(code1) - emb2 = embedder.embed_single(code2) - emb3 = embedder.embed_single(code3) - - sim_12 = _cosine_similarity(emb1, emb2) - sim_13 = _cosine_similarity(emb1, emb3) - - print(f"\nSimilarity (add vs sum_numbers): {sim_12:.4f}") - print(f"Similarity (add vs login): {sim_13:.4f}") - - assert sim_12 > sim_13, "Similar code should have higher similarity" - assert sim_12 > 0.6, "Similar functions should have >0.6 similarity" - - -class TestVectorStore: - """Test VectorStore functionality.""" - - @pytest.fixture - def temp_db(self, tmp_path): - """Create temporary database.""" - return tmp_path / "semantic.db" - - @pytest.fixture - def vector_store(self, temp_db): - """Create vector store instance.""" - from codexlens.semantic.vector_store import VectorStore - return VectorStore(temp_db) - - @pytest.fixture - def embedder(self): - """Create embedder instance.""" - from codexlens.semantic.embedder import Embedder - return Embedder() - - def test_add_and_search_chunks(self, vector_store, embedder): - """Test adding chunks and searching.""" - # Create test chunks with embeddings - chunks = [ - SemanticChunk( - content="def calculate_sum(a, b): return a + b", - 
metadata={"symbol": "calculate_sum", "language": "python"} - ), - SemanticChunk( - content="class UserManager: def create_user(self): pass", - metadata={"symbol": "UserManager", "language": "python"} - ), - SemanticChunk( - content="async function fetchData(url) { return await fetch(url); }", - metadata={"symbol": "fetchData", "language": "javascript"} - ), - ] - - # Add embeddings - for chunk in chunks: - chunk.embedding = embedder.embed_single(chunk.content) - vector_store.add_chunk(chunk, "/test/file.py") - - # Search for similar code - query = "function to add two numbers together" - query_embedding = embedder.embed_single(query) - - results = vector_store.search_similar(query_embedding, top_k=3) - - assert len(results) > 0, "Should find results" - assert "calculate_sum" in results[0].excerpt or "sum" in results[0].excerpt.lower() - - print(f"\nQuery: '{query}'") - for i, r in enumerate(results): - print(f" {i+1}. Score: {r.score:.4f} - {r.excerpt[:50]}...") - - def test_min_score_filtering(self, vector_store, embedder): - """Test minimum score filtering.""" - # Add a chunk - chunk = SemanticChunk( - content="def hello_world(): print('Hello, World!')", - metadata={} - ) - chunk.embedding = embedder.embed_single(chunk.content) - vector_store.add_chunk(chunk, "/test/hello.py") - - # Search with unrelated query - query = "database connection pool management" - query_embedding = embedder.embed_single(query) - - # Low threshold - should find result - results_low = vector_store.search_similar(query_embedding, min_score=0.0) - - # High threshold - might filter out - results_high = vector_store.search_similar(query_embedding, min_score=0.8) - - print(f"\nResults with min_score=0.0: {len(results_low)}") - print(f"Results with min_score=0.8: {len(results_high)}") - - assert len(results_low) >= len(results_high) - - -class TestSemanticSearchIntegration: - """Integration tests for semantic search on real-like codebases.""" - - @pytest.fixture - def complex_codebase(self, 
tmp_path): - """Create a complex test codebase.""" - # Python files - (tmp_path / "src").mkdir() - (tmp_path / "src" / "auth.py").write_text(''' -"""Authentication module.""" - -class AuthenticationService: - """Handle user authentication and authorization.""" - - def __init__(self, secret_key: str): - self.secret_key = secret_key - self.token_expiry = 3600 - - def login(self, username: str, password: str) -> dict: - """Authenticate user and return JWT token.""" - user = self._validate_credentials(username, password) - if user: - return self._generate_token(user) - raise AuthError("Invalid credentials") - - def logout(self, token: str) -> bool: - """Invalidate user session.""" - return self._revoke_token(token) - - def verify_token(self, token: str) -> dict: - """Verify JWT token and return user claims.""" - pass - -def hash_password(password: str) -> str: - """Hash password using bcrypt.""" - import hashlib - return hashlib.sha256(password.encode()).hexdigest() -''') - - (tmp_path / "src" / "database.py").write_text(''' -"""Database connection and ORM.""" - -from typing import List, Optional - -class DatabaseConnection: - """Manage database connections with pooling.""" - - def __init__(self, connection_string: str, pool_size: int = 5): - self.connection_string = connection_string - self.pool_size = pool_size - self._pool = [] - - def connect(self) -> "Connection": - """Get connection from pool.""" - if self._pool: - return self._pool.pop() - return self._create_connection() - - def release(self, conn: "Connection"): - """Return connection to pool.""" - if len(self._pool) < self.pool_size: - self._pool.append(conn) - -class QueryBuilder: - """SQL query builder with fluent interface.""" - - def select(self, *columns) -> "QueryBuilder": - pass - - def where(self, condition: str) -> "QueryBuilder": - pass - - def execute(self) -> List[dict]: - pass -''') - - (tmp_path / "src" / "api.py").write_text(''' -"""REST API endpoints.""" - -from typing import List, Dict, Any - 
-class APIRouter: - """Route HTTP requests to handlers.""" - - def __init__(self): - self.routes = {} - - def get(self, path: str): - """Register GET endpoint.""" - def decorator(func): - self.routes[("GET", path)] = func - return func - return decorator - - def post(self, path: str): - """Register POST endpoint.""" - def decorator(func): - self.routes[("POST", path)] = func - return func - return decorator - -async def handle_request(method: str, path: str, body: Dict) -> Dict: - """Process incoming HTTP request.""" - pass - -def validate_json_schema(data: Dict, schema: Dict) -> bool: - """Validate request data against JSON schema.""" - pass -''') - - # JavaScript files - (tmp_path / "frontend").mkdir() - (tmp_path / "frontend" / "components.js").write_text(''' -/** - * React UI Components - */ - -class UserProfile extends Component { - constructor(props) { - super(props); - this.state = { user: null, loading: true }; - } - - async componentDidMount() { - const user = await fetchUserData(this.props.userId); - this.setState({ user, loading: false }); - } - - render() { - if (this.state.loading) return ; - return ; - } -} - -function Button({ onClick, children, variant = "primary" }) { - return ( - - ); -} - -const FormInput = ({ label, value, onChange, type = "text" }) => { - return ( -
- - -
- ); -}; -''') - - (tmp_path / "frontend" / "api.js").write_text(''' -/** - * API Client for backend communication - */ - -const API_BASE = "/api/v1"; - -async function fetchUserData(userId) { - const response = await fetch(`${API_BASE}/users/${userId}`); - if (!response.ok) throw new Error("Failed to fetch user"); - return response.json(); -} - -async function createUser(userData) { - const response = await fetch(`${API_BASE}/users`, { - method: "POST", - headers: { "Content-Type": "application/json" }, - body: JSON.stringify(userData) - }); - return response.json(); -} - -async function updateUserProfile(userId, updates) { - const response = await fetch(`${API_BASE}/users/${userId}`, { - method: "PATCH", - headers: { "Content-Type": "application/json" }, - body: JSON.stringify(updates) - }); - return response.json(); -} - -class WebSocketClient { - constructor(url) { - this.url = url; - this.ws = null; - this.handlers = {}; - } - - connect() { - this.ws = new WebSocket(this.url); - this.ws.onmessage = (event) => this._handleMessage(event); - } - - on(eventType, handler) { - this.handlers[eventType] = handler; - } -} -''') - - return tmp_path - - @pytest.fixture - def indexed_codebase(self, complex_codebase, tmp_path): - """Index the complex codebase with semantic embeddings.""" - from codexlens.semantic.embedder import Embedder - from codexlens.semantic.vector_store import VectorStore - from codexlens.semantic.chunker import Chunker, ChunkConfig - from codexlens.parsers.factory import ParserFactory - from codexlens.config import Config - - db_path = tmp_path / "semantic.db" - vector_store = VectorStore(db_path) - embedder = Embedder() - config = Config() - factory = ParserFactory(config) - chunker = Chunker(ChunkConfig(min_chunk_size=20, max_chunk_size=500)) - - # Index all source files - indexed_files = [] - for ext in ["*.py", "*.js"]: - for file_path in complex_codebase.rglob(ext): - content = file_path.read_text() - language = "python" if file_path.suffix == 
".py" else "javascript" - - # Parse symbols - parser = factory.get_parser(language) - indexed_file = parser.parse(content, file_path) - - # Create chunks - chunks = chunker.chunk_file( - content, - indexed_file.symbols, - str(file_path), - language - ) - - # Add embeddings and store - for chunk in chunks: - chunk.embedding = embedder.embed_single(chunk.content) - vector_store.add_chunk(chunk, str(file_path)) - - indexed_files.append(str(file_path)) - - return { - "vector_store": vector_store, - "embedder": embedder, - "files": indexed_files, - "codebase_path": complex_codebase - } - - def test_semantic_search_accuracy(self, indexed_codebase): - """Test semantic search accuracy on complex queries.""" - vector_store = indexed_codebase["vector_store"] - embedder = indexed_codebase["embedder"] - - test_queries = [ - { - "query": "user authentication login function", - "expected_contains": ["login", "auth", "credential"], - "expected_not_contains": ["database", "button"] - }, - { - "query": "database connection pooling", - "expected_contains": ["connect", "pool", "database"], - "expected_not_contains": ["login", "button"] - }, - { - "query": "React component for user profile", - "expected_contains": ["UserProfile", "component", "render"], - "expected_not_contains": ["database", "auth"] - }, - { - "query": "HTTP API endpoint handler", - "expected_contains": ["API", "request", "handle"], - "expected_not_contains": ["UserProfile", "button"] - }, - { - "query": "form input UI element", - "expected_contains": ["input", "form", "label"], - "expected_not_contains": ["database", "auth"] - } - ] - - print("\n" + "="*60) - print("SEMANTIC SEARCH ACCURACY TEST") - print("="*60) - - for test in test_queries: - query = test["query"] - query_embedding = embedder.embed_single(query) - - results = vector_store.search_similar(query_embedding, top_k=5, min_score=0.3) - - print(f"\nQuery: '{query}'") - print("-" * 40) - - # Check results - all_excerpts = " ".join([r.excerpt.lower() for r 
in results]) - - found_expected = [] - for expected in test["expected_contains"]: - if expected.lower() in all_excerpts: - found_expected.append(expected) - - found_unexpected = [] - for unexpected in test["expected_not_contains"]: - if unexpected.lower() in all_excerpts: - found_unexpected.append(unexpected) - - for i, r in enumerate(results[:3]): - print(f" {i+1}. Score: {r.score:.4f}") - print(f" File: {Path(r.path).name}") - print(f" Excerpt: {r.excerpt[:80]}...") - - print(f"\n [OK] Found expected: {found_expected}") - if found_unexpected: - print(f" [WARN] Found unexpected: {found_unexpected}") - - def test_search_performance(self, indexed_codebase): - """Test search performance with various parameters.""" - vector_store = indexed_codebase["vector_store"] - embedder = indexed_codebase["embedder"] - - query = "function to handle user data" - query_embedding = embedder.embed_single(query) - - print("\n" + "="*60) - print("SEARCH PERFORMANCE TEST") - print("="*60) - - # Test different top_k values - for top_k in [5, 10, 20, 50]: - start = time.time() - results = vector_store.search_similar(query_embedding, top_k=top_k) - elapsed = time.time() - start - - print(f"top_k={top_k}: {elapsed*1000:.2f}ms ({len(results)} results)") - - # Test different min_score values - print("\nMin score filtering:") - for min_score in [0.0, 0.3, 0.5, 0.7]: - start = time.time() - results = vector_store.search_similar(query_embedding, top_k=50, min_score=min_score) - elapsed = time.time() - start - - print(f"min_score={min_score}: {elapsed*1000:.2f}ms ({len(results)} results)") - - -class TestChunkerOptimization: - """Test chunker parameters for optimal semantic search.""" - - @pytest.fixture - def sample_code(self): - """Long Python file for chunking tests.""" - return ''' -"""Large module with multiple classes and functions.""" - -import os -import sys -from typing import List, Dict, Any, Optional - -# Constants -MAX_RETRIES = 3 -DEFAULT_TIMEOUT = 30 - -class ConfigManager: - 
"""Manage application configuration.""" - - def __init__(self, config_path: str): - self.config_path = config_path - self._config: Dict[str, Any] = {} - - def load(self) -> Dict[str, Any]: - """Load configuration from file.""" - with open(self.config_path) as f: - self._config = json.load(f) - return self._config - - def get(self, key: str, default: Any = None) -> Any: - """Get configuration value.""" - return self._config.get(key, default) - - def set(self, key: str, value: Any) -> None: - """Set configuration value.""" - self._config[key] = value - -class DataProcessor: - """Process and transform data.""" - - def __init__(self, source: str): - self.source = source - self.data: List[Dict] = [] - - def load_data(self) -> List[Dict]: - """Load data from source.""" - # Implementation here - pass - - def transform(self, transformers: List[callable]) -> List[Dict]: - """Apply transformations to data.""" - result = self.data - for transformer in transformers: - result = [transformer(item) for item in result] - return result - - def filter(self, predicate: callable) -> List[Dict]: - """Filter data by predicate.""" - return [item for item in self.data if predicate(item)] - - def aggregate(self, key: str, aggregator: callable) -> Dict: - """Aggregate data by key.""" - groups: Dict[str, List] = {} - for item in self.data: - k = item.get(key) - if k not in groups: - groups[k] = [] - groups[k].append(item) - return {k: aggregator(v) for k, v in groups.items()} - -def validate_input(data: Dict, schema: Dict) -> bool: - """Validate input data against schema.""" - for field, rules in schema.items(): - if rules.get("required") and field not in data: - return False - if field in data: - value = data[field] - if "type" in rules and not isinstance(value, rules["type"]): - return False - return True - -def format_output(data: Any, format_type: str = "json") -> str: - """Format output data.""" - if format_type == "json": - return json.dumps(data, indent=2) - elif format_type == "csv": 
- # CSV formatting - pass - return str(data) - -async def fetch_remote_data(url: str, timeout: int = DEFAULT_TIMEOUT) -> Dict: - """Fetch data from remote URL.""" - async with aiohttp.ClientSession() as session: - async with session.get(url, timeout=timeout) as response: - return await response.json() - -class CacheManager: - """Manage caching with TTL support.""" - - def __init__(self, default_ttl: int = 300): - self.default_ttl = default_ttl - self._cache: Dict[str, tuple] = {} - - def get(self, key: str) -> Optional[Any]: - """Get cached value if not expired.""" - if key in self._cache: - value, expiry = self._cache[key] - if time.time() < expiry: - return value - del self._cache[key] - return None - - def set(self, key: str, value: Any, ttl: Optional[int] = None) -> None: - """Set cached value with TTL.""" - expiry = time.time() + (ttl or self.default_ttl) - self._cache[key] = (value, expiry) - - def invalidate(self, pattern: str) -> int: - """Invalidate cache entries matching pattern.""" - keys_to_delete = [k for k in self._cache if pattern in k] - for k in keys_to_delete: - del self._cache[k] - return len(keys_to_delete) -''' - - def test_chunk_size_comparison(self, sample_code): - """Compare different chunk sizes for search quality.""" - from codexlens.semantic.chunker import Chunker, ChunkConfig - from codexlens.semantic.embedder import Embedder - from codexlens.semantic.vector_store import _cosine_similarity - from codexlens.parsers.factory import ParserFactory - from codexlens.config import Config - - config = Config() - factory = ParserFactory(config) - parser = factory.get_parser("python") - indexed_file = parser.parse(sample_code, Path("/test.py")) - embedder = Embedder() - - print("\n" + "="*60) - print("CHUNK SIZE OPTIMIZATION TEST") - print("="*60) - - # Test different chunk configurations - configs = [ - ChunkConfig(min_chunk_size=20, max_chunk_size=200, overlap=20), - ChunkConfig(min_chunk_size=50, max_chunk_size=500, overlap=50), - 
ChunkConfig(min_chunk_size=100, max_chunk_size=1000, overlap=100), - ] - - test_query = "cache management with TTL expiration" - query_embedding = embedder.embed_single(test_query) - - for cfg in configs: - chunker = Chunker(cfg) - chunks = chunker.chunk_file( - sample_code, - indexed_file.symbols, - "/test.py", - "python" - ) - - print(f"\nConfig: min={cfg.min_chunk_size}, max={cfg.max_chunk_size}, overlap={cfg.overlap}") - print(f" Chunks generated: {len(chunks)}") - - if chunks: - # Find best matching chunk - best_score = 0 - best_chunk = None - - for chunk in chunks: - chunk.embedding = embedder.embed_single(chunk.content) - score = _cosine_similarity(query_embedding, chunk.embedding) - if score > best_score: - best_score = score - best_chunk = chunk - - if best_chunk: - print(f" Best match score: {best_score:.4f}") - print(f" Best chunk preview: {best_chunk.content[:100]}...") - - def test_symbol_vs_sliding_window(self, sample_code): - """Compare symbol-based vs sliding window chunking.""" - from codexlens.semantic.chunker import Chunker, ChunkConfig - from codexlens.parsers.factory import ParserFactory - from codexlens.config import Config - - config = Config() - factory = ParserFactory(config) - parser = factory.get_parser("python") - indexed_file = parser.parse(sample_code, Path("/test.py")) - - chunker = Chunker(ChunkConfig(min_chunk_size=20)) - - print("\n" + "="*60) - print("CHUNKING STRATEGY COMPARISON") - print("="*60) - - # Symbol-based chunking - symbol_chunks = chunker.chunk_by_symbol( - sample_code, - indexed_file.symbols, - "/test.py", - "python" - ) - - # Sliding window chunking - window_chunks = chunker.chunk_sliding_window( - sample_code, - "/test.py", - "python" - ) - - print(f"\nSymbol-based chunks: {len(symbol_chunks)}") - for i, chunk in enumerate(symbol_chunks[:5]): - symbol_name = chunk.metadata.get("symbol_name", "unknown") - print(f" {i+1}. 
{symbol_name}: {len(chunk.content)} chars") - - print(f"\nSliding window chunks: {len(window_chunks)}") - for i, chunk in enumerate(window_chunks[:5]): - lines = f"{chunk.metadata.get('start_line', '?')}-{chunk.metadata.get('end_line', '?')}" - print(f" {i+1}. Lines {lines}: {len(chunk.content)} chars") - - -class TestRealWorldScenarios: - """Test real-world semantic search scenarios.""" - - @pytest.fixture - def embedder(self): - from codexlens.semantic.embedder import Embedder - return Embedder() - - def test_natural_language_queries(self, embedder): - """Test various natural language query patterns.""" - from codexlens.semantic.vector_store import _cosine_similarity - - code_samples = { - "auth": "def authenticate_user(username, password): verify credentials and create session", - "db": "class DatabasePool: manage connection pooling for efficient database access", - "api": "async def handle_http_request(req): process incoming REST API calls", - "ui": "function Button({ onClick }) { return }", - "cache": "class LRUCache: implements least recently used caching strategy with TTL", - } - - # Generate embeddings for code - code_embeddings = {k: embedder.embed_single(v) for k, v in code_samples.items()} - - # Test queries - queries = [ - ("How do I log in a user?", "auth"), - ("Database connection management", "db"), - ("REST endpoint handler", "api"), - ("Button component React", "ui"), - ("Caching with expiration", "cache"), - ] - - print("\n" + "="*60) - print("NATURAL LANGUAGE QUERY TEST") - print("="*60) - - correct = 0 - for query, expected_best in queries: - query_embedding = embedder.embed_single(query) - - scores = {k: _cosine_similarity(query_embedding, v) - for k, v in code_embeddings.items()} - - best_match = max(scores.items(), key=lambda x: x[1]) - is_correct = best_match[0] == expected_best - correct += is_correct - - status = "[OK]" if is_correct else "[FAIL]" - print(f"\n{status} Query: '{query}'") - print(f" Expected: {expected_best}, Got: 
{best_match[0]} (score: {best_match[1]:.4f})") - - accuracy = correct / len(queries) * 100 - print(f"\n\nAccuracy: {accuracy:.1f}% ({correct}/{len(queries)})") - - -if __name__ == "__main__": - pytest.main([__file__, "-v", "-s"]) diff --git a/codex-lens/tests/test_sqlite_store.py b/codex-lens/tests/test_sqlite_store.py deleted file mode 100644 index 63b7a227..00000000 --- a/codex-lens/tests/test_sqlite_store.py +++ /dev/null @@ -1,444 +0,0 @@ -"""Tests for SQLiteStore connection pool behavior.""" - -from __future__ import annotations - -import logging -import sqlite3 -import threading -import time -from pathlib import Path - -import pytest - -from codexlens.entities import IndexedFile -from codexlens.storage.sqlite_store import SQLiteStore - - -def test_periodic_cleanup(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: - """Periodic timer should proactively clean up stale thread connections.""" - monkeypatch.setattr(SQLiteStore, "CLEANUP_INTERVAL", 0.2) - - store = SQLiteStore(tmp_path / "periodic_cleanup.db") - store.initialize() - - cleanup_called = threading.Event() - original_cleanup = store._cleanup_stale_connections - - def wrapped_cleanup() -> None: - cleanup_called.set() - original_cleanup() - - monkeypatch.setattr(store, "_cleanup_stale_connections", wrapped_cleanup) - - created: list[int] = [] - lock = threading.Lock() - main_tid = threading.get_ident() - - def worker() -> None: - store._get_connection() - with lock: - created.append(threading.get_ident()) - - try: - threads = [threading.Thread(target=worker) for _ in range(10)] - for t in threads: - t.start() - for t in threads: - t.join() - - # Ensure we created thread-local connections without reaching MAX_POOL_SIZE. - assert len(store._pool) >= 2 - assert all(tid in store._pool for tid in created) - - # Wait for periodic cleanup to run and prune dead thread connections. 
- assert cleanup_called.wait(timeout=3) - deadline = time.time() + 3 - while time.time() < deadline and any(tid in store._pool for tid in created): - time.sleep(0.05) - - assert all(tid not in store._pool for tid in created) - assert set(store._pool.keys()).issubset({main_tid}) - finally: - store.close() - - -def test_cleanup_robustness( - monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture, tmp_path: Path -) -> None: - """Cleanup should handle dead threads, idle timeouts, and invalid connections.""" - monkeypatch.setattr(SQLiteStore, "CLEANUP_INTERVAL", 0) - caplog.set_level(logging.DEBUG, logger="codexlens.storage.sqlite_store") - - store = SQLiteStore(tmp_path / "cleanup_robustness.db") - store.initialize() - - try: - # Invalid connection: active thread but pooled connection is already closed. - conn = store._get_connection() - conn.close() - with store._pool_lock: - store._pool[threading.get_ident()] = (conn, time.time()) - store._cleanup_stale_connections() - - assert "invalid_connection" in caplog.text - assert threading.get_ident() not in store._pool - - # Ensure next access recreates a working connection after cleanup. - fresh_conn = store._get_connection() - assert fresh_conn is not conn - - # Idle timeout cleanup should be logged distinctly. - with store._pool_lock: - store._pool[threading.get_ident()] = (fresh_conn, time.time() - store.IDLE_TIMEOUT - 1) - store._cleanup_stale_connections() - - assert "idle_timeout" in caplog.text - assert threading.get_ident() not in store._pool - - # Dead thread cleanup should be logged distinctly. 
- created: list[int] = [] - - def worker() -> None: - store._get_connection() - created.append(threading.get_ident()) - - t = threading.Thread(target=worker) - t.start() - t.join() - - dead_tid = created[0] - assert dead_tid in store._pool - with store._pool_lock: - store._cleanup_stale_connections() - - assert "dead_thread" in caplog.text - assert dead_tid not in store._pool - finally: - store.close() - - -def test_add_files_rollback_preserves_original_exception(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: - """add_files should re-raise the transaction error when rollback succeeds.""" - monkeypatch.setattr(SQLiteStore, "CLEANUP_INTERVAL", 0) - store = SQLiteStore(tmp_path / "add_files_ok.db") - store.initialize() - - real_conn = store._get_connection() - - class FailingConnection: - def __init__(self, conn: sqlite3.Connection) -> None: - self._conn = conn - self.rollback_calls = 0 - - def execute(self, sql: str, params: tuple = ()): - if "INSERT INTO files" in sql: - raise sqlite3.OperationalError("boom") - return self._conn.execute(sql, params) - - def executemany(self, sql: str, seq): - return self._conn.executemany(sql, seq) - - def commit(self) -> None: - self._conn.commit() - - def rollback(self) -> None: - self.rollback_calls += 1 - self._conn.rollback() - - wrapped = FailingConnection(real_conn) - monkeypatch.setattr(store, "_get_connection", lambda: wrapped) - - indexed_file = IndexedFile(path=str(tmp_path / "a.py"), language="python", symbols=[]) - - try: - with pytest.raises(sqlite3.OperationalError, match="boom"): - store.add_files([(indexed_file, "# content")]) - assert wrapped.rollback_calls == 1 - finally: - store.close() - - -def test_add_files_rollback_failure_is_chained( - tmp_path: Path, monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture -) -> None: - """Rollback failures should be logged and chained as the cause.""" - monkeypatch.setattr(SQLiteStore, "CLEANUP_INTERVAL", 0) - caplog.set_level(logging.ERROR, 
logger="codexlens.storage.sqlite_store") - - store = SQLiteStore(tmp_path / "add_files_rollback_fail.db") - store.initialize() - real_conn = store._get_connection() - - class FailingRollbackConnection: - def __init__(self, conn: sqlite3.Connection) -> None: - self._conn = conn - - def execute(self, sql: str, params: tuple = ()): - if "INSERT INTO files" in sql: - raise sqlite3.OperationalError("boom") - return self._conn.execute(sql, params) - - def executemany(self, sql: str, seq): - return self._conn.executemany(sql, seq) - - def commit(self) -> None: - self._conn.commit() - - def rollback(self) -> None: - raise sqlite3.OperationalError("rollback boom") - - monkeypatch.setattr(store, "_get_connection", lambda: FailingRollbackConnection(real_conn)) - indexed_file = IndexedFile(path=str(tmp_path / "b.py"), language="python", symbols=[]) - - try: - with pytest.raises(sqlite3.OperationalError) as exc: - store.add_files([(indexed_file, "# content")]) - - assert exc.value.__cause__ is not None - assert isinstance(exc.value.__cause__, sqlite3.OperationalError) - assert "rollback boom" in str(exc.value.__cause__) - assert "Rollback failed after add_files() error" in caplog.text - assert "boom" in caplog.text - finally: - store.close() - - -class TestMultiVectorChunks: - """Tests for multi-vector chunk storage operations.""" - - def test_add_chunks_basic(self, tmp_path: Path) -> None: - """Basic chunk insertion without embeddings.""" - store = SQLiteStore(tmp_path / "chunks_basic.db") - store.initialize() - - try: - chunks_data = [ - {"content": "def hello(): pass", "metadata": {"type": "function"}}, - {"content": "class World: pass", "metadata": {"type": "class"}}, - ] - - ids = store.add_chunks("test.py", chunks_data) - - assert len(ids) == 2 - assert ids == [1, 2] - assert store.count_chunks() == 2 - finally: - store.close() - - def test_add_chunks_with_binary_embeddings(self, tmp_path: Path) -> None: - """Chunk insertion with binary embeddings for coarse ranking.""" - 
store = SQLiteStore(tmp_path / "chunks_binary.db") - store.initialize() - - try: - chunks_data = [ - {"content": "content1"}, - {"content": "content2"}, - ] - # 256-bit binary = 32 bytes - binary_embs = [b"\x00" * 32, b"\xff" * 32] - - ids = store.add_chunks( - "test.py", chunks_data, embedding_binary=binary_embs - ) - - assert len(ids) == 2 - - retrieved = store.get_binary_embeddings(ids) - assert len(retrieved) == 2 - assert retrieved[ids[0]] == b"\x00" * 32 - assert retrieved[ids[1]] == b"\xff" * 32 - finally: - store.close() - - def test_add_chunks_with_dense_embeddings(self, tmp_path: Path) -> None: - """Chunk insertion with dense embeddings for fine ranking.""" - store = SQLiteStore(tmp_path / "chunks_dense.db") - store.initialize() - - try: - chunks_data = [{"content": "content1"}, {"content": "content2"}] - # 2048 floats = 8192 bytes - dense_embs = [b"\x00" * 8192, b"\xff" * 8192] - - ids = store.add_chunks( - "test.py", chunks_data, embedding_dense=dense_embs - ) - - assert len(ids) == 2 - - retrieved = store.get_dense_embeddings(ids) - assert len(retrieved) == 2 - assert retrieved[ids[0]] == b"\x00" * 8192 - assert retrieved[ids[1]] == b"\xff" * 8192 - finally: - store.close() - - def test_add_chunks_with_all_embeddings(self, tmp_path: Path) -> None: - """Chunk insertion with all embedding types.""" - store = SQLiteStore(tmp_path / "chunks_all.db") - store.initialize() - - try: - chunks_data = [{"content": "full test"}] - embedding = [[0.1, 0.2, 0.3]] - binary_embs = [b"\xab" * 32] - dense_embs = [b"\xcd" * 8192] - - ids = store.add_chunks( - "test.py", - chunks_data, - embedding=embedding, - embedding_binary=binary_embs, - embedding_dense=dense_embs, - ) - - assert len(ids) == 1 - - binary = store.get_binary_embeddings(ids) - dense = store.get_dense_embeddings(ids) - - assert binary[ids[0]] == b"\xab" * 32 - assert dense[ids[0]] == b"\xcd" * 8192 - finally: - store.close() - - def test_add_chunks_length_mismatch_raises(self, tmp_path: Path) -> None: - 
"""Mismatched embedding length should raise ValueError.""" - store = SQLiteStore(tmp_path / "chunks_mismatch.db") - store.initialize() - - try: - chunks_data = [{"content": "a"}, {"content": "b"}] - - with pytest.raises(ValueError, match="embedding_binary length"): - store.add_chunks( - "test.py", chunks_data, embedding_binary=[b"\x00" * 32] - ) - - with pytest.raises(ValueError, match="embedding_dense length"): - store.add_chunks( - "test.py", chunks_data, embedding_dense=[b"\x00" * 8192] - ) - - with pytest.raises(ValueError, match="embedding length"): - store.add_chunks( - "test.py", chunks_data, embedding=[[0.1]] - ) - finally: - store.close() - - def test_get_chunks_by_ids(self, tmp_path: Path) -> None: - """Retrieve chunk data by IDs.""" - store = SQLiteStore(tmp_path / "chunks_get.db") - store.initialize() - - try: - chunks_data = [ - {"content": "def foo(): pass", "metadata": {"line": 1}}, - {"content": "def bar(): pass", "metadata": {"line": 5}}, - ] - - ids = store.add_chunks("test.py", chunks_data) - retrieved = store.get_chunks_by_ids(ids) - - assert len(retrieved) == 2 - assert retrieved[0]["content"] == "def foo(): pass" - assert retrieved[0]["metadata"]["line"] == 1 - assert retrieved[1]["content"] == "def bar(): pass" - assert retrieved[1]["file_path"] == "test.py" - finally: - store.close() - - def test_delete_chunks_by_file(self, tmp_path: Path) -> None: - """Delete all chunks for a file.""" - store = SQLiteStore(tmp_path / "chunks_delete.db") - store.initialize() - - try: - store.add_chunks("a.py", [{"content": "a1"}, {"content": "a2"}]) - store.add_chunks("b.py", [{"content": "b1"}]) - - assert store.count_chunks() == 3 - - deleted = store.delete_chunks_by_file("a.py") - assert deleted == 2 - assert store.count_chunks() == 1 - - deleted = store.delete_chunks_by_file("nonexistent.py") - assert deleted == 0 - finally: - store.close() - - def test_get_embeddings_empty_list(self, tmp_path: Path) -> None: - """Empty chunk ID list returns empty 
dict.""" - store = SQLiteStore(tmp_path / "chunks_empty.db") - store.initialize() - - try: - assert store.get_binary_embeddings([]) == {} - assert store.get_dense_embeddings([]) == {} - assert store.get_chunks_by_ids([]) == [] - finally: - store.close() - - def test_add_chunks_empty_list(self, tmp_path: Path) -> None: - """Empty chunks list returns empty IDs.""" - store = SQLiteStore(tmp_path / "chunks_empty_add.db") - store.initialize() - - try: - ids = store.add_chunks("test.py", []) - assert ids == [] - assert store.count_chunks() == 0 - finally: - store.close() - - def test_chunks_table_migration(self, tmp_path: Path) -> None: - """Existing chunks table gets new columns via migration.""" - db_path = tmp_path / "chunks_migration.db" - - # Create old schema without multi-vector columns - conn = sqlite3.connect(db_path) - conn.execute( - """ - CREATE TABLE chunks ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - file_path TEXT NOT NULL, - content TEXT NOT NULL, - embedding BLOB, - metadata TEXT, - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP - ) - """ - ) - conn.execute("CREATE INDEX idx_chunks_file_path ON chunks(file_path)") - conn.execute( - "INSERT INTO chunks (file_path, content) VALUES ('old.py', 'old content')" - ) - conn.commit() - conn.close() - - # Open with SQLiteStore - should migrate - store = SQLiteStore(db_path) - store.initialize() - - try: - # Verify new columns exist by using them - ids = store.add_chunks( - "new.py", - [{"content": "new content"}], - embedding_binary=[b"\x00" * 32], - embedding_dense=[b"\x00" * 8192], - ) - - assert len(ids) == 1 - - # Old data should still be accessible - assert store.count_chunks() == 2 - - # New embeddings should work - binary = store.get_binary_embeddings(ids) - assert binary[ids[0]] == b"\x00" * 32 - finally: - store.close() diff --git a/codex-lens/tests/test_stage1_binary_search_uses_chunk_lines.py b/codex-lens/tests/test_stage1_binary_search_uses_chunk_lines.py deleted file mode 100644 index 
ed566c9b..00000000 --- a/codex-lens/tests/test_stage1_binary_search_uses_chunk_lines.py +++ /dev/null @@ -1,131 +0,0 @@ -from __future__ import annotations - -from pathlib import Path -from unittest.mock import MagicMock, patch - -from codexlens.config import VECTORS_META_DB_NAME, Config -from codexlens.search.chain_search import ChainSearchEngine, SearchStats -from codexlens.storage.path_mapper import PathMapper -from codexlens.storage.registry import RegistryStore - - -def test_stage1_binary_search_prefers_chunk_start_line(tmp_path: Path) -> None: - registry = RegistryStore(db_path=tmp_path / "registry.db") - registry.initialize() - mapper = PathMapper(index_root=tmp_path / "indexes") - engine = ChainSearchEngine(registry, mapper, config=Config(data_dir=tmp_path / "data")) - - try: - index_root = tmp_path / "fake_index_root" - index_root.mkdir(parents=True, exist_ok=True) - index_db = index_root / "_index.db" - index_db.write_text("", encoding="utf-8") - (index_root / VECTORS_META_DB_NAME).write_text("", encoding="utf-8") - - class _DummyBinarySearcher: - def search(self, query_dense, top_k: int): - _ = query_dense - _ = top_k - return [(123, 10)] - - class _DummyEmbedder: - def embed_to_numpy(self, texts): - _ = texts - return [[0.0]] - - dummy_meta_store = MagicMock() - dummy_meta_store.get_chunks_by_ids.return_value = [ - { - "chunk_id": 123, - "file_path": str(tmp_path / "a.py"), - "content": "def a():\n return 1\n", - "start_line": 12, - "end_line": 14, - "metadata": {}, - "category": "code", - } - ] - - with patch.object(engine, "_get_centralized_binary_searcher", return_value=_DummyBinarySearcher()): - with patch("codexlens.search.chain_search.VectorMetadataStore", return_value=dummy_meta_store): - with patch("codexlens.semantic.embedder.Embedder", return_value=_DummyEmbedder()): - coarse_results, returned_root = engine._stage1_binary_search( - "a", - [index_db], - coarse_k=1, - stats=SearchStats(), - ) - - assert returned_root == index_root - assert 
len(coarse_results) == 1 - assert coarse_results[0].start_line == 12 - assert coarse_results[0].end_line == 14 - finally: - engine.close() - - -def test_stage1_binary_search_dense_fallback(tmp_path: Path) -> None: - registry = RegistryStore(db_path=tmp_path / "registry.db") - registry.initialize() - mapper = PathMapper(index_root=tmp_path / "indexes") - engine = ChainSearchEngine(registry, mapper, config=Config(data_dir=tmp_path / "data")) - - try: - index_root = tmp_path / "fake_index_root" - index_root.mkdir(parents=True, exist_ok=True) - index_db = index_root / "_index.db" - index_db.write_text("", encoding="utf-8") - (index_root / VECTORS_META_DB_NAME).write_text("", encoding="utf-8") - - class _DummyEmbedder: - def embed_to_numpy(self, texts): - _ = texts - # Only dim matters for ANNIndex initialization - return [[0.0, 1.0, 2.0]] - - class _DummyANNIndex: - def __init__(self, *args, **kwargs) -> None: - pass - - def load(self) -> bool: - return True - - def count(self) -> int: - return 1 - - def search(self, query_vec, top_k: int = 10): - _ = query_vec - _ = top_k - return [123], [0.2] - - dummy_meta_store = MagicMock() - dummy_meta_store.get_chunks_by_ids.return_value = [ - { - "chunk_id": 123, - "file_path": str(tmp_path / "b.py"), - "content": "def b():\n return 2\n", - "start_line": 20, - "end_line": 22, - "metadata": {}, - "category": "code", - } - ] - - with patch.object(engine, "_get_centralized_binary_searcher", return_value=None): - with patch("codexlens.search.chain_search.VectorMetadataStore", return_value=dummy_meta_store): - with patch("codexlens.semantic.embedder.Embedder", return_value=_DummyEmbedder()): - with patch("codexlens.semantic.ann_index.ANNIndex", _DummyANNIndex): - coarse_results, returned_root = engine._stage1_binary_search( - "b", - [index_db], - coarse_k=1, - stats=SearchStats(), - ) - - assert returned_root == index_root - assert len(coarse_results) == 1 - assert coarse_results[0].start_line == 20 - assert 
coarse_results[0].end_line == 22 - assert coarse_results[0].score == 0.8 - finally: - engine.close() diff --git a/codex-lens/tests/test_staged_cascade.py b/codex-lens/tests/test_staged_cascade.py deleted file mode 100644 index 2a5f44b4..00000000 --- a/codex-lens/tests/test_staged_cascade.py +++ /dev/null @@ -1,812 +0,0 @@ -"""Integration tests for staged cascade search pipeline. - -Tests the 4-stage pipeline: -1. Stage 1: Binary coarse search -2. Stage 2: LSP graph expansion -3. Stage 3: Clustering and representative selection -4. Stage 4: Optional cross-encoder reranking -""" - -from __future__ import annotations - -import json -import tempfile -from pathlib import Path -from typing import List -from unittest.mock import MagicMock, Mock, patch - -import pytest - -from codexlens.config import Config -from codexlens.entities import SearchResult -from codexlens.search.chain_search import ChainSearchEngine, SearchOptions -from codexlens.storage.path_mapper import PathMapper -from codexlens.storage.registry import RegistryStore - - -# ============================================================================= -# Test Fixtures -# ============================================================================= - - -@pytest.fixture -def temp_paths(): - """Create temporary directory structure.""" - tmpdir = tempfile.TemporaryDirectory(ignore_cleanup_errors=True) - root = Path(tmpdir.name) - yield root - try: - tmpdir.cleanup() - except (PermissionError, OSError): - pass - - -@pytest.fixture -def mock_registry(temp_paths: Path): - """Create mock registry store.""" - registry = RegistryStore(db_path=temp_paths / "registry.db") - registry.initialize() - return registry - - -@pytest.fixture -def mock_mapper(temp_paths: Path): - """Create path mapper.""" - return PathMapper(index_root=temp_paths / "indexes") - - -@pytest.fixture -def mock_config(): - """Create mock config with staged cascade settings.""" - config = MagicMock(spec=Config) - config.cascade_coarse_k = 100 - 
config.cascade_fine_k = 10 - config.enable_staged_rerank = False - config.staged_clustering_strategy = "auto" - config.staged_clustering_min_size = 3 - config.graph_expansion_depth = 2 - return config - - -@pytest.fixture -def sample_binary_results() -> List[SearchResult]: - """Create sample binary search results for testing.""" - return [ - SearchResult( - path="a.py", - score=0.95, - excerpt="def authenticate_user(username, password):", - symbol_name="authenticate_user", - symbol_kind="function", - start_line=10, - end_line=15, - ), - SearchResult( - path="b.py", - score=0.85, - excerpt="class AuthManager:", - symbol_name="AuthManager", - symbol_kind="class", - start_line=5, - end_line=20, - ), - SearchResult( - path="c.py", - score=0.75, - excerpt="def check_credentials(user, pwd):", - symbol_name="check_credentials", - symbol_kind="function", - start_line=30, - end_line=35, - ), - ] - - -@pytest.fixture -def sample_expanded_results() -> List[SearchResult]: - """Create sample expanded results (after LSP expansion).""" - return [ - SearchResult( - path="a.py", - score=0.95, - excerpt="def authenticate_user(username, password):", - symbol_name="authenticate_user", - symbol_kind="function", - ), - SearchResult( - path="a.py", - score=0.90, - excerpt="def verify_password(pwd):", - symbol_name="verify_password", - symbol_kind="function", - ), - SearchResult( - path="b.py", - score=0.85, - excerpt="class AuthManager:", - symbol_name="AuthManager", - symbol_kind="class", - ), - SearchResult( - path="b.py", - score=0.80, - excerpt="def login(self, user):", - symbol_name="login", - symbol_kind="function", - ), - SearchResult( - path="c.py", - score=0.75, - excerpt="def check_credentials(user, pwd):", - symbol_name="check_credentials", - symbol_kind="function", - ), - SearchResult( - path="d.py", - score=0.70, - excerpt="class UserModel:", - symbol_name="UserModel", - symbol_kind="class", - ), - ] - - -# 
============================================================================= -# Test Stage Methods -# ============================================================================= - - -class TestStage1BinarySearch: - """Tests for Stage 1: Binary coarse search.""" - - def test_stage1_returns_results_with_index_root( - self, mock_registry, mock_mapper, mock_config - ): - """Test _stage1_binary_search returns results and index_root.""" - from codexlens.search.chain_search import SearchStats - - engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config) - - # Mock the binary embedding backend (import is inside the method) - with patch("codexlens.indexing.embedding.BinaryEmbeddingBackend"): - with patch.object(engine, "_get_or_create_binary_index") as mock_binary_idx: - mock_index = MagicMock() - mock_index.count.return_value = 10 - mock_index.search.return_value = ([1, 2, 3], [10, 20, 30]) - mock_binary_idx.return_value = mock_index - - index_paths = [Path("/fake/index1/_index.db")] - stats = SearchStats() - - results, index_root = engine._stage1_binary_search( - "query", index_paths, coarse_k=10, stats=stats - ) - - assert isinstance(results, list) - assert isinstance(index_root, (Path, type(None))) - - def test_stage1_handles_empty_index_paths( - self, mock_registry, mock_mapper, mock_config - ): - """Test _stage1_binary_search handles empty index paths.""" - from codexlens.search.chain_search import SearchStats - - engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config) - - index_paths = [] - stats = SearchStats() - - results, index_root = engine._stage1_binary_search( - "query", index_paths, coarse_k=10, stats=stats - ) - - assert results == [] - assert index_root is None - - def test_stage1_aggregates_results_from_multiple_indexes( - self, mock_registry, mock_mapper, mock_config - ): - """Test _stage1_binary_search aggregates results from multiple indexes.""" - from codexlens.search.chain_search import SearchStats - - engine = 
ChainSearchEngine(mock_registry, mock_mapper, config=mock_config) - - with patch("codexlens.indexing.embedding.BinaryEmbeddingBackend"): - with patch.object(engine, "_get_or_create_binary_index") as mock_binary_idx: - mock_index = MagicMock() - mock_index.count.return_value = 10 - # Return different results for different calls - mock_index.search.side_effect = [ - ([1, 2], [10, 20]), - ([3, 4], [15, 25]), - ] - mock_binary_idx.return_value = mock_index - - index_paths = [ - Path("/fake/index1/_index.db"), - Path("/fake/index2/_index.db"), - ] - stats = SearchStats() - - results, _ = engine._stage1_binary_search( - "query", index_paths, coarse_k=10, stats=stats - ) - - # Should aggregate candidates from both indexes - assert isinstance(results, list) - - -class TestStage2LSPExpand: - """Tests for Stage 2: LSP graph expansion.""" - - def test_stage2_returns_expanded_results( - self, mock_registry, mock_mapper, mock_config, sample_binary_results - ): - """Test _stage2_lsp_expand returns expanded results.""" - engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config) - - # Import is inside the method, so we need to patch it there - with patch("codexlens.search.graph_expander.GraphExpander") as mock_expander_cls: - mock_expander = MagicMock() - mock_expander.expand.return_value = [ - SearchResult(path="related.py", score=0.7, excerpt="related") - ] - mock_expander_cls.return_value = mock_expander - - expanded = engine._stage2_lsp_expand( - sample_binary_results, index_root=Path("/fake/index") - ) - - assert isinstance(expanded, list) - # Should include original results - assert len(expanded) >= len(sample_binary_results) - - def test_stage2_handles_no_index_root( - self, mock_registry, mock_mapper, mock_config, sample_binary_results - ): - """Test _stage2_lsp_expand handles missing index_root.""" - engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config) - - expanded = engine._stage2_lsp_expand(sample_binary_results, index_root=None) - 
- # Should return original results unchanged - assert expanded == sample_binary_results - - def test_stage2_handles_empty_results( - self, mock_registry, mock_mapper, mock_config - ): - """Test _stage2_lsp_expand handles empty input.""" - engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config) - - expanded = engine._stage2_lsp_expand([], index_root=Path("/fake")) - - assert expanded == [] - - def test_stage2_deduplicates_results( - self, mock_registry, mock_mapper, mock_config, sample_binary_results - ): - """Test _stage2_lsp_expand deduplicates by (path, symbol_name, start_line).""" - engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config) - - # Mock expander to return duplicate of first result - with patch("codexlens.search.graph_expander.GraphExpander") as mock_expander_cls: - mock_expander = MagicMock() - duplicate = SearchResult( - path=sample_binary_results[0].path, - score=0.5, - excerpt="duplicate", - symbol_name=sample_binary_results[0].symbol_name, - start_line=sample_binary_results[0].start_line, - ) - mock_expander.expand.return_value = [duplicate] - mock_expander_cls.return_value = mock_expander - - expanded = engine._stage2_lsp_expand( - sample_binary_results, index_root=Path("/fake") - ) - - # Should not include duplicate - assert len(expanded) == len(sample_binary_results) - - -class TestStage3ClusterPrune: - """Tests for Stage 3: Clustering and representative selection.""" - - def test_stage3_returns_representatives( - self, mock_registry, mock_mapper, mock_config, sample_expanded_results - ): - """Test _stage3_cluster_prune returns representative results.""" - engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config) - - with patch.object(engine, "_get_embeddings_for_clustering") as mock_embed: - import numpy as np - - # Mock embeddings - mock_embed.return_value = np.random.rand( - len(sample_expanded_results), 128 - ).astype(np.float32) - - clustered = engine._stage3_cluster_prune( - 
sample_expanded_results, target_count=3 - ) - - assert isinstance(clustered, list) - assert len(clustered) <= len(sample_expanded_results) - assert all(isinstance(r, SearchResult) for r in clustered) - - def test_stage3_handles_few_results( - self, mock_registry, mock_mapper, mock_config - ): - """Test _stage3_cluster_prune skips clustering for few results.""" - engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config) - - few_results = [ - SearchResult(path="a.py", score=0.9, excerpt="a"), - SearchResult(path="b.py", score=0.8, excerpt="b"), - ] - - clustered = engine._stage3_cluster_prune(few_results, target_count=5) - - # Should return all results unchanged - assert clustered == few_results - - def test_stage3_handles_no_embeddings( - self, mock_registry, mock_mapper, mock_config, sample_expanded_results - ): - """Test _stage3_cluster_prune falls back to score-based selection without embeddings.""" - engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config) - - with patch.object(engine, "_get_embeddings_for_clustering") as mock_embed: - mock_embed.return_value = None - - clustered = engine._stage3_cluster_prune( - sample_expanded_results, target_count=3 - ) - - # Should return top-scored results - assert len(clustered) <= 3 - # Should be sorted by score descending - scores = [r.score for r in clustered] - assert scores == sorted(scores, reverse=True) - - def test_stage3_uses_config_clustering_strategy( - self, mock_registry, mock_mapper, sample_expanded_results - ): - """Test _stage3_cluster_prune uses config clustering strategy.""" - config = MagicMock(spec=Config) - config.staged_clustering_strategy = "auto" - config.staged_clustering_min_size = 2 - - engine = ChainSearchEngine(mock_registry, PathMapper(), config=config) - - with patch.object(engine, "_get_embeddings_for_clustering") as mock_embed: - import numpy as np - - mock_embed.return_value = np.random.rand( - len(sample_expanded_results), 128 - ).astype(np.float32) - - 
clustered = engine._stage3_cluster_prune( - sample_expanded_results, target_count=3 - ) - - # Should use clustering (auto will pick best available) - # Result should be a list of SearchResult objects - assert isinstance(clustered, list) - assert all(isinstance(r, SearchResult) for r in clustered) - - -class TestStage4OptionalRerank: - """Tests for Stage 4: Optional cross-encoder reranking.""" - - def test_stage4_reranks_with_reranker( - self, mock_registry, mock_mapper, temp_paths - ): - """Test _stage4_optional_rerank overfetches before final trim.""" - config = Config(data_dir=temp_paths / "data") - config.reranker_top_k = 4 - config.reranking_top_k = 4 - engine = ChainSearchEngine(mock_registry, mock_mapper, config=config) - - results = [ - SearchResult(path="a.py", score=0.9, excerpt="a"), - SearchResult(path="b.py", score=0.8, excerpt="b"), - SearchResult(path="c.py", score=0.7, excerpt="c"), - SearchResult(path="d.py", score=0.6, excerpt="d"), - SearchResult(path="e.py", score=0.5, excerpt="e"), - ] - - # Mock the _cross_encoder_rerank method that _stage4 calls - with patch.object(engine, "_cross_encoder_rerank") as mock_rerank: - mock_rerank.return_value = [ - SearchResult(path="c.py", score=0.95, excerpt="c"), - SearchResult(path="a.py", score=0.85, excerpt="a"), - SearchResult(path="d.py", score=0.83, excerpt="d"), - SearchResult(path="e.py", score=0.81, excerpt="e"), - ] - - reranked = engine._stage4_optional_rerank("query", results, k=2) - - mock_rerank.assert_called_once_with("query", results, 4) - assert len(reranked) == 4 - # First result should be reranked winner - assert reranked[0].path == "c.py" - - def test_stage4_handles_empty_results( - self, mock_registry, mock_mapper, mock_config - ): - """Test _stage4_optional_rerank handles empty input.""" - engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config) - - reranked = engine._stage4_optional_rerank("query", [], k=2) - - # Should return empty list - assert reranked == [] - - -# 
============================================================================= -# Integration Tests -# ============================================================================= - - -class TestStagedCascadeIntegration: - """Integration tests for staged_cascade_search() end-to-end.""" - - def test_staged_cascade_returns_chain_result( - self, mock_registry, mock_mapper, mock_config, temp_paths - ): - """Test staged_cascade_search returns ChainSearchResult.""" - engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config) - - # Mock all stages - with patch.object(engine, "_find_start_index") as mock_find: - mock_find.return_value = temp_paths / "index" / "_index.db" - - with patch.object(engine, "_collect_index_paths") as mock_collect: - mock_collect.return_value = [temp_paths / "index" / "_index.db"] - - with patch.object(engine, "_stage1_binary_search") as mock_stage1: - mock_stage1.return_value = ( - [SearchResult(path="a.py", score=0.9, excerpt="a")], - temp_paths / "index", - ) - - with patch.object(engine, "_stage2_lsp_expand") as mock_stage2: - mock_stage2.return_value = [ - SearchResult(path="a.py", score=0.9, excerpt="a") - ] - - with patch.object(engine, "_stage3_cluster_prune") as mock_stage3: - mock_stage3.return_value = [ - SearchResult(path="a.py", score=0.9, excerpt="a") - ] - - result = engine.staged_cascade_search( - "query", temp_paths / "src", k=10, coarse_k=100 - ) - - from codexlens.search.chain_search import ChainSearchResult - - assert isinstance(result, ChainSearchResult) - assert result.query == "query" - assert len(result.results) <= 10 - - def test_staged_cascade_includes_stage_stats( - self, mock_registry, mock_mapper, mock_config, temp_paths - ): - """Test staged_cascade_search includes per-stage timing stats.""" - engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config) - - with patch.object(engine, "_find_start_index") as mock_find: - mock_find.return_value = temp_paths / "index" / "_index.db" - - with 
patch.object(engine, "_collect_index_paths") as mock_collect: - mock_collect.return_value = [temp_paths / "index" / "_index.db"] - - with patch.object(engine, "_stage1_binary_search") as mock_stage1: - mock_stage1.return_value = ( - [SearchResult(path="a.py", score=0.9, excerpt="a")], - temp_paths / "index", - ) - - with patch.object(engine, "_stage2_lsp_expand") as mock_stage2: - mock_stage2.return_value = [ - SearchResult(path="a.py", score=0.9, excerpt="a") - ] - - with patch.object(engine, "_stage3_cluster_prune") as mock_stage3: - mock_stage3.return_value = [ - SearchResult(path="a.py", score=0.9, excerpt="a") - ] - - result = engine.staged_cascade_search( - "query", temp_paths / "src" - ) - - # Check for stage stats in errors field - stage_stats = None - for err in result.stats.errors: - if err.startswith("STAGE_STATS:"): - stage_stats = json.loads(err.replace("STAGE_STATS:", "")) - break - - assert stage_stats is not None - assert "stage_times" in stage_stats - assert "stage_counts" in stage_stats - assert "stage1_binary_ms" in stage_stats["stage_times"] - assert "stage1_candidates" in stage_stats["stage_counts"] - - def test_staged_cascade_with_rerank_enabled( - self, mock_registry, mock_mapper, temp_paths - ): - """Test staged_cascade_search with reranking enabled.""" - config = MagicMock(spec=Config) - config.cascade_coarse_k = 100 - config.cascade_fine_k = 10 - config.enable_staged_rerank = True - config.staged_clustering_strategy = "auto" - config.graph_expansion_depth = 2 - - engine = ChainSearchEngine(mock_registry, mock_mapper, config=config) - - with patch.object(engine, "_find_start_index") as mock_find: - mock_find.return_value = temp_paths / "index" / "_index.db" - - with patch.object(engine, "_collect_index_paths") as mock_collect: - mock_collect.return_value = [temp_paths / "index" / "_index.db"] - - with patch.object(engine, "_stage1_binary_search") as mock_stage1: - mock_stage1.return_value = ( - [SearchResult(path="a.py", score=0.9, 
excerpt="a")], - temp_paths / "index", - ) - - with patch.object(engine, "_stage2_lsp_expand") as mock_stage2: - mock_stage2.return_value = [ - SearchResult(path="a.py", score=0.9, excerpt="a") - ] - - with patch.object(engine, "_stage3_cluster_prune") as mock_stage3: - mock_stage3.return_value = [ - SearchResult(path="a.py", score=0.9, excerpt="a") - ] - - with patch.object(engine, "_stage4_optional_rerank") as mock_stage4: - mock_stage4.return_value = [ - SearchResult(path="a.py", score=0.95, excerpt="a") - ] - - result = engine.staged_cascade_search( - "query", temp_paths / "src" - ) - - # Verify stage 4 was called - mock_stage4.assert_called_once() - - def test_staged_cascade_fallback_to_search( - self, mock_registry, mock_mapper, mock_config, temp_paths - ): - """Test staged_cascade_search falls back to standard search when numpy unavailable.""" - engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config) - - with patch("codexlens.search.chain_search.NUMPY_AVAILABLE", False): - with patch.object(engine, "search") as mock_search: - mock_search.return_value = MagicMock() - - engine.staged_cascade_search("query", temp_paths / "src") - - # Should fall back to standard search - mock_search.assert_called_once() - - def test_staged_cascade_deduplicates_final_results( - self, mock_registry, mock_mapper, mock_config, temp_paths - ): - """Test staged_cascade_search deduplicates results by path.""" - engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config) - - with patch.object(engine, "_find_start_index") as mock_find: - mock_find.return_value = temp_paths / "index" / "_index.db" - - with patch.object(engine, "_collect_index_paths") as mock_collect: - mock_collect.return_value = [temp_paths / "index" / "_index.db"] - - with patch.object(engine, "_stage1_binary_search") as mock_stage1: - mock_stage1.return_value = ( - [SearchResult(path="a.py", score=0.9, excerpt="a")], - temp_paths / "index", - ) - - with patch.object(engine, 
"_stage2_lsp_expand") as mock_stage2: - mock_stage2.return_value = [ - SearchResult(path="a.py", score=0.9, excerpt="a") - ] - - with patch.object(engine, "_stage3_cluster_prune") as mock_stage3: - # Return duplicates with different scores - mock_stage3.return_value = [ - SearchResult(path="a.py", score=0.9, excerpt="a"), - SearchResult(path="a.py", score=0.8, excerpt="a duplicate"), - SearchResult(path="b.py", score=0.7, excerpt="b"), - ] - - result = engine.staged_cascade_search( - "query", temp_paths / "src", k=10 - ) - - # Should deduplicate a.py (keep higher score) - paths = [r.path for r in result.results] - assert len(paths) == len(set(paths)) - # a.py should have score 0.9 - a_result = next(r for r in result.results if r.path == "a.py") - assert a_result.score == 0.9 - - def test_staged_cascade_expands_stage3_target_for_rerank_budget( - self, mock_registry, mock_mapper, temp_paths - ): - """Test staged cascade preserves enough Stage 3 reps for rerank budget.""" - config = Config(data_dir=temp_paths / "data") - config.enable_staged_rerank = True - config.reranker_top_k = 6 - config.reranking_top_k = 6 - - engine = ChainSearchEngine(mock_registry, mock_mapper, config=config) - expanded_results = [ - SearchResult(path=f"src/file-{index}.ts", score=1.0 - (index * 0.01), excerpt="x") - for index in range(8) - ] - - with patch.object(engine, "_find_start_index") as mock_find: - mock_find.return_value = temp_paths / "index" / "_index.db" - - with patch.object(engine, "_collect_index_paths") as mock_collect: - mock_collect.return_value = [temp_paths / "index" / "_index.db"] - - with patch.object(engine, "_stage1_binary_search") as mock_stage1: - mock_stage1.return_value = ( - [SearchResult(path="seed.ts", score=0.9, excerpt="seed")], - temp_paths / "index", - ) - - with patch.object(engine, "_stage2_lsp_expand") as mock_stage2: - mock_stage2.return_value = expanded_results - - with patch.object(engine, "_stage3_cluster_prune") as mock_stage3: - 
mock_stage3.return_value = expanded_results[:6] - - with patch.object(engine, "_stage4_optional_rerank") as mock_stage4: - mock_stage4.return_value = expanded_results[:2] - - engine.staged_cascade_search( - "query", - temp_paths / "src", - k=2, - coarse_k=20, - ) - - mock_stage3.assert_called_once_with( - expanded_results, - 6, - query="query", - ) - - def test_staged_cascade_overfetches_rerank_before_final_trim( - self, mock_registry, mock_mapper, temp_paths - ): - """Test staged rerank keeps enough candidates for path penalties to work.""" - config = Config(data_dir=temp_paths / "data") - config.enable_staged_rerank = True - config.reranker_top_k = 4 - config.reranking_top_k = 4 - config.test_file_penalty = 0.15 - config.generated_file_penalty = 0.35 - - engine = ChainSearchEngine(mock_registry, mock_mapper, config=config) - - src_primary = str(temp_paths / "src" / "tools" / "smart-search.ts") - src_secondary = str(temp_paths / "src" / "tools" / "codex-lens.ts") - test_primary = str(temp_paths / "tests" / "integration" / "cli-routes.test.ts") - test_secondary = str( - temp_paths / "frontend" / "tests" / "e2e" / "prompt-memory.spec.ts" - ) - query = "parse CodexLens JSON output strip ANSI smart_search" - clustered_results = [ - SearchResult(path=test_primary, score=0.98, excerpt="test"), - SearchResult(path=test_secondary, score=0.97, excerpt="test"), - SearchResult(path=src_primary, score=0.96, excerpt="source"), - SearchResult(path=src_secondary, score=0.95, excerpt="source"), - ] - - with patch.object(engine, "_find_start_index") as mock_find: - mock_find.return_value = temp_paths / "index" / "_index.db" - - with patch.object(engine, "_collect_index_paths") as mock_collect: - mock_collect.return_value = [temp_paths / "index" / "_index.db"] - - with patch.object(engine, "_stage1_binary_search") as mock_stage1: - mock_stage1.return_value = ( - [SearchResult(path=src_primary, score=0.9, excerpt="seed")], - temp_paths / "index", - ) - - with patch.object(engine, 
"_stage2_lsp_expand") as mock_stage2: - mock_stage2.return_value = clustered_results - - with patch.object(engine, "_stage3_cluster_prune") as mock_stage3: - mock_stage3.return_value = clustered_results - - with patch.object(engine, "_cross_encoder_rerank") as mock_rerank: - mock_rerank.return_value = clustered_results - - result = engine.staged_cascade_search( - query, - temp_paths / "src", - k=2, - coarse_k=20, - ) - - mock_rerank.assert_called_once_with(query, clustered_results, 4) - assert [item.path for item in result.results] == [src_primary, src_secondary] - - -# ============================================================================= -# Graceful Degradation Tests -# ============================================================================= - - -class TestStagedCascadeGracefulDegradation: - """Tests for graceful degradation when dependencies unavailable.""" - - def test_falls_back_when_clustering_unavailable( - self, mock_registry, mock_mapper, mock_config, sample_expanded_results - ): - """Test clustering stage falls back gracefully when clustering unavailable.""" - engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config) - - with patch.object(engine, "_get_embeddings_for_clustering") as mock_embed: - mock_embed.return_value = None - - clustered = engine._stage3_cluster_prune( - sample_expanded_results, target_count=3 - ) - - # Should fall back to score-based selection - assert len(clustered) <= 3 - - def test_falls_back_when_graph_expander_unavailable( - self, mock_registry, mock_mapper, mock_config, sample_binary_results - ): - """Test LSP expansion falls back when GraphExpander unavailable.""" - engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config) - - # Patch the import inside the method - with patch("codexlens.search.graph_expander.GraphExpander", side_effect=ImportError): - expanded = engine._stage2_lsp_expand( - sample_binary_results, index_root=Path("/fake") - ) - - # Should return original results - 
assert expanded == sample_binary_results - - def test_handles_stage_failures_gracefully( - self, mock_registry, mock_mapper, mock_config, temp_paths - ): - """Test staged pipeline handles stage failures gracefully.""" - engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config) - - with patch.object(engine, "_find_start_index") as mock_find: - mock_find.return_value = temp_paths / "index" / "_index.db" - - with patch.object(engine, "_collect_index_paths") as mock_collect: - mock_collect.return_value = [temp_paths / "index" / "_index.db"] - - with patch.object(engine, "_stage1_binary_search") as mock_stage1: - # Stage 1 returns no results - mock_stage1.return_value = ([], None) - - with patch.object(engine, "search") as mock_search: - mock_search.return_value = MagicMock() - - engine.staged_cascade_search("query", temp_paths / "src") - - # Should fall back to standard search when stage 1 fails - mock_search.assert_called_once() diff --git a/codex-lens/tests/test_staged_cascade_lsp_depth.py b/codex-lens/tests/test_staged_cascade_lsp_depth.py deleted file mode 100644 index b7437ec8..00000000 --- a/codex-lens/tests/test_staged_cascade_lsp_depth.py +++ /dev/null @@ -1,168 +0,0 @@ -"""Regression tests for staged cascade Stage 2 expansion depth. - -Staged cascade is documented as: - coarse (binary) → LSP/graph expansion → clustering → optional rerank - -This test ensures Stage 2 respects Config.staged_lsp_depth (not unrelated -graph_expansion_depth settings). 
-""" - -from __future__ import annotations - -import tempfile -from pathlib import Path -from unittest.mock import patch - -import pytest - -from codexlens.config import Config -from codexlens.entities import CodeRelationship, RelationshipType, SearchResult, Symbol -from codexlens.search.chain_search import ChainSearchEngine -from codexlens.storage.dir_index import DirIndexStore -from codexlens.storage.index_tree import _compute_graph_neighbors -from codexlens.storage.path_mapper import PathMapper -from codexlens.storage.registry import RegistryStore - - -@pytest.fixture() -def temp_paths() -> Path: - tmpdir = tempfile.TemporaryDirectory(ignore_cleanup_errors=True) - root = Path(tmpdir.name) - yield root - try: - tmpdir.cleanup() - except (PermissionError, OSError): - pass - - -def _create_index_with_neighbors(root: Path) -> tuple[PathMapper, Path, Path, str]: - project_root = root / "project" - project_root.mkdir(parents=True, exist_ok=True) - - index_root = root / "indexes" - mapper = PathMapper(index_root=index_root) - index_db_path = mapper.source_to_index_db(project_root) - index_db_path.parent.mkdir(parents=True, exist_ok=True) - - # Use 3 files so staged_cascade_search's final "deduplicate by path" step - # doesn't collapse all expanded symbols into a single file result. 
- content_a = "\n".join(["def a():", " b()", ""]) - content_b = "\n".join(["def b():", " c()", ""]) - content_c = "\n".join(["def c():", " return 1", ""]) - - file_a = project_root / "a.py" - file_b = project_root / "b.py" - file_c = project_root / "c.py" - file_a.write_text(content_a, encoding="utf-8") - file_b.write_text(content_b, encoding="utf-8") - file_c.write_text(content_c, encoding="utf-8") - - symbols_a = [Symbol(name="a", kind="function", range=(1, 2), file=str(file_a))] - symbols_b = [Symbol(name="b", kind="function", range=(1, 2), file=str(file_b))] - symbols_c = [Symbol(name="c", kind="function", range=(1, 2), file=str(file_c))] - - relationships_a = [ - CodeRelationship( - source_symbol="a", - target_symbol="b", - relationship_type=RelationshipType.CALL, - source_file=str(file_a), - target_file=str(file_b), - source_line=2, - ) - ] - relationships_b = [ - CodeRelationship( - source_symbol="b", - target_symbol="c", - relationship_type=RelationshipType.CALL, - source_file=str(file_b), - target_file=str(file_c), - source_line=2, - ) - ] - - config = Config(data_dir=root / "data") - store = DirIndexStore(index_db_path, config=config) - store.initialize() - store.add_file( - name=file_a.name, - full_path=file_a, - content=content_a, - language="python", - symbols=symbols_a, - relationships=relationships_a, - ) - store.add_file( - name=file_b.name, - full_path=file_b, - content=content_b, - language="python", - symbols=symbols_b, - relationships=relationships_b, - ) - store.add_file( - name=file_c.name, - full_path=file_c, - content=content_c, - language="python", - symbols=symbols_c, - relationships=[], - ) - _compute_graph_neighbors(store) - store.close() - - return mapper, project_root, file_a, content_a - - -def test_staged_cascade_stage2_uses_staged_lsp_depth(temp_paths: Path) -> None: - mapper, project_root, file_path, content = _create_index_with_neighbors(temp_paths) - index_db_path = mapper.source_to_index_db(project_root) - - registry = 
RegistryStore(db_path=temp_paths / "registry.db") - registry.initialize() - - # Intentionally conflicting depths: staged_lsp_depth should win for staged cascade. - config = Config( - data_dir=temp_paths / "data", - staged_lsp_depth=1, - graph_expansion_depth=2, - enable_staged_rerank=False, - staged_clustering_strategy="noop", - ) - - engine = ChainSearchEngine(registry, mapper, config=config) - try: - base = SearchResult( - path=str(file_path.resolve()), - score=1.0, - excerpt="", - content=content, - start_line=1, - end_line=2, - symbol_name="a", - symbol_kind="function", - ) - - with patch("codexlens.search.chain_search.NUMPY_AVAILABLE", True): - with patch.object(engine, "_find_start_index", return_value=index_db_path): - with patch.object(engine, "_collect_index_paths", return_value=[index_db_path]): - # Bypass binary vector infrastructure; Stage 1 output is sufficient for Stage 2 behavior. - with patch.object( - engine, - "_stage1_binary_search", - return_value=([base], index_db_path.parent), - ): - result = engine.staged_cascade_search( - query="test", - source_path=project_root, - k=3, - coarse_k=10, - ) - - symbol_names = {r.symbol_name for r in result.results if r.symbol_name} - assert "b" in symbol_names - # With staged_lsp_depth=1, Stage 2 should NOT include 2-hop neighbor "c". - assert "c" not in symbol_names - finally: - engine.close() diff --git a/codex-lens/tests/test_staged_cascade_realtime_lsp.py b/codex-lens/tests/test_staged_cascade_realtime_lsp.py deleted file mode 100644 index 83fb6860..00000000 --- a/codex-lens/tests/test_staged_cascade_realtime_lsp.py +++ /dev/null @@ -1,98 +0,0 @@ -"""Unit tests for staged cascade Stage 2 realtime LSP graph expansion. - -These tests mock out the live LSP components (LspBridge + LspGraphBuilder) -so they can run without external language servers installed. 
-""" - -from __future__ import annotations - -from pathlib import Path -from unittest.mock import AsyncMock, MagicMock, patch - -from codexlens.config import Config -from codexlens.entities import SearchResult -from codexlens.hybrid_search.data_structures import CodeAssociationGraph, CodeSymbolNode, Range -from codexlens.search.chain_search import ChainSearchEngine -from codexlens.storage.path_mapper import PathMapper -from codexlens.storage.registry import RegistryStore - - -class _DummyBridge: - def __init__(self, *args, **kwargs) -> None: - pass - - async def get_document_symbols(self, file_path: str): - _ = file_path - return [] - - async def __aenter__(self): - return self - - async def __aexit__(self, exc_type, exc, tb) -> None: - return None - - -def test_stage2_realtime_mode_expands_and_combines(tmp_path: Path) -> None: - registry = RegistryStore(db_path=tmp_path / "registry.db") - registry.initialize() - mapper = PathMapper(index_root=tmp_path / "indexes") - - config = Config( - data_dir=tmp_path / "data", - staged_stage2_mode="realtime", - staged_lsp_depth=1, - staged_realtime_lsp_timeout_s=1.0, - staged_realtime_lsp_max_nodes=10, - staged_realtime_lsp_warmup_s=0.0, - ) - - engine = ChainSearchEngine(registry, mapper, config=config) - try: - coarse = [ - SearchResult( - path=str(tmp_path / "a.py"), - score=1.0, - excerpt="def a(): pass", - content="def a():\n pass\n", - symbol_name="a", - symbol_kind="function", - start_line=1, - end_line=2, - ) - ] - - graph = CodeAssociationGraph() - seed_id = f"{coarse[0].path}:a:1" - graph.nodes[seed_id] = CodeSymbolNode( - id=seed_id, - name="a", - kind="function", - file_path=coarse[0].path, - range=Range(start_line=1, start_character=1, end_line=2, end_character=1), - ) - related_id = f"{str(tmp_path / 'b.py')}:b:1" - graph.nodes[related_id] = CodeSymbolNode( - id=related_id, - name="b", - kind="function", - file_path=str(tmp_path / "b.py"), - range=Range(start_line=1, start_character=1, end_line=1, 
end_character=1), - raw_code="def b():\n return 1\n", - ) - - dummy_builder = MagicMock() - dummy_builder.build_from_seeds = AsyncMock(return_value=graph) - - with patch("codexlens.lsp.LspBridge", _DummyBridge): - with patch("codexlens.lsp.LspGraphBuilder", return_value=dummy_builder) as mock_builder: - # Avoid needing a real index_to_source mapping - engine.mapper.index_to_source = MagicMock(return_value=tmp_path) - expanded = engine._stage2_lsp_expand(coarse, index_root=tmp_path / "fake_index_root") - - assert mock_builder.call_args is not None - assert mock_builder.call_args.kwargs.get("resolve_symbols") is False - names = {r.symbol_name for r in expanded if r.symbol_name} - assert "a" in names - assert "b" in names - finally: - engine.close() diff --git a/codex-lens/tests/test_staged_stage1_fallback_seed.py b/codex-lens/tests/test_staged_stage1_fallback_seed.py deleted file mode 100644 index ff9ea061..00000000 --- a/codex-lens/tests/test_staged_stage1_fallback_seed.py +++ /dev/null @@ -1,49 +0,0 @@ -from __future__ import annotations - -import json -from pathlib import Path -from unittest.mock import MagicMock - -from codexlens.config import Config -from codexlens.entities import SearchResult -from codexlens.search.chain_search import ChainSearchEngine, ChainSearchResult, SearchOptions - - -def _extract_stage_stats(result: ChainSearchResult) -> dict: - for item in result.stats.errors or []: - if isinstance(item, str) and item.startswith("STAGE_STATS:"): - return json.loads(item[len("STAGE_STATS:") :]) - raise AssertionError("missing STAGE_STATS payload") - - -def test_staged_pipeline_seeds_from_fts_when_stage1_empty(monkeypatch) -> None: - cfg = Config.load() - cfg.enable_staged_rerank = False - cfg.staged_stage2_mode = "realtime" # ensure we pass through stage2 wrapper - cfg.staged_clustering_strategy = "score" - - engine = ChainSearchEngine(registry=MagicMock(), mapper=MagicMock(), config=cfg) - - # Avoid touching registry/mapper/index stores. 
- monkeypatch.setattr(engine, "_find_start_index", lambda *_a, **_k: Path("X:/fake/_index.db")) - monkeypatch.setattr(engine, "_collect_index_paths", lambda *_a, **_k: [Path("X:/fake/_index.db")]) - - # Force Stage 1 to return empty so the FTS seeding path is exercised. - monkeypatch.setattr(engine, "_stage1_binary_search", lambda *_a, **_k: ([], Path("X:/fake"))) - - seed_results = [SearchResult(path="D:/p/a.py", score=1.0), SearchResult(path="D:/p/b.py", score=0.9)] - - # Provide a stable SearchStats instance for the fallback search call. - from codexlens.search.chain_search import SearchStats - - monkeypatch.setattr(engine, "search", lambda *_a, **_k: ChainSearchResult(query="q", results=seed_results, symbols=[], stats=SearchStats())) - - # Make later stages no-ops so we only validate plumbing. - monkeypatch.setattr(engine, "_stage2_lsp_expand", lambda results, *_a, **_k: results) - monkeypatch.setattr(engine, "_stage3_cluster_prune", lambda results, *_a, **_k: results) - - result = engine.staged_cascade_search("q", Path("."), k=2, coarse_k=5, options=SearchOptions()) - stage_stats = _extract_stage_stats(result) - - assert stage_stats["stage_counts"].get("stage1_fallback_used") == 1 - assert result.results and [r.path for r in result.results] == ["D:/p/a.py", "D:/p/b.py"] diff --git a/codex-lens/tests/test_staged_stage3_fast_strategies.py b/codex-lens/tests/test_staged_stage3_fast_strategies.py deleted file mode 100644 index b546a939..00000000 --- a/codex-lens/tests/test_staged_stage3_fast_strategies.py +++ /dev/null @@ -1,56 +0,0 @@ -from __future__ import annotations - -from unittest.mock import MagicMock - -import pytest - -from codexlens.config import Config -from codexlens.entities import SearchResult -from codexlens.search.chain_search import ChainSearchEngine - - -def _engine_with_strategy(name: str) -> ChainSearchEngine: - cfg = Config.load() - cfg.staged_clustering_strategy = name - return ChainSearchEngine(registry=MagicMock(), mapper=MagicMock(), 
config=cfg) - - -def test_stage3_strategy_score_skips_embedding(monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.setattr( - "codexlens.semantic.factory.get_embedder", - lambda *a, **k: (_ for _ in ()).throw(RuntimeError("should not embed")), - ) - - engine = _engine_with_strategy("score") - expanded = [ - SearchResult(path="D:/p/a.py", score=0.9), - SearchResult(path="D:/p/a.py", score=0.1), - SearchResult(path="D:/p/b.py", score=0.8), - SearchResult(path="D:/p/c.py", score=0.7), - ] - - reps = engine._stage3_cluster_prune(expanded, target_count=3) - assert [r.path for r in reps] == ["D:/p/a.py", "D:/p/b.py", "D:/p/c.py"] - - -def test_stage3_strategy_dir_rr_round_robins_dirs(monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.setattr( - "codexlens.semantic.factory.get_embedder", - lambda *a, **k: (_ for _ in ()).throw(RuntimeError("should not embed")), - ) - - engine = _engine_with_strategy("dir_rr") - expanded = [ - SearchResult(path="D:/p1/a.py", score=0.99), - SearchResult(path="D:/p1/b.py", score=0.98), - SearchResult(path="D:/p2/c.py", score=0.97), - SearchResult(path="D:/p2/d.py", score=0.96), - SearchResult(path="D:/p3/e.py", score=0.95), - ] - - reps = engine._stage3_cluster_prune(expanded, target_count=4) - assert len(reps) == 4 - assert reps[0].path.endswith("p1/a.py") - assert reps[1].path.endswith("p2/c.py") - assert reps[2].path.endswith("p3/e.py") - diff --git a/codex-lens/tests/test_standalone_lsp_manager_open_document_cache.py b/codex-lens/tests/test_standalone_lsp_manager_open_document_cache.py deleted file mode 100644 index 8af8ad75..00000000 --- a/codex-lens/tests/test_standalone_lsp_manager_open_document_cache.py +++ /dev/null @@ -1,87 +0,0 @@ -from __future__ import annotations - -import asyncio -import time -from pathlib import Path -from types import SimpleNamespace -from unittest.mock import AsyncMock, MagicMock - -import pytest - -from codexlens.lsp.standalone_manager import ServerConfig, ServerState, StandaloneLspManager - - 
-@pytest.mark.asyncio -async def test_open_document_skips_when_unchanged(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: - target = tmp_path / "a.py" - target.write_text("print('hi')\n", encoding="utf-8") - - manager = StandaloneLspManager(workspace_root=str(tmp_path)) - # Make language detection deterministic. - manager._extension_map["py"] = "python" # type: ignore[attr-defined] - - cfg = ServerConfig( - language_id="python", - display_name="Pyright", - extensions=["py"], - command=["pyright-langserver", "--stdio"], - ) - - # ServerState requires reader/writer/process, but _open_document only uses writer via _send_notification. - dummy_process = SimpleNamespace(returncode=None) - dummy_reader = asyncio.StreamReader() - dummy_writer = MagicMock() - state = ServerState(config=cfg, process=dummy_process, reader=dummy_reader, writer=dummy_writer) - - sent: list[str] = [] - - async def _send_notification(_state, method: str, _params): - sent.append(method) - - monkeypatch.setattr(manager, "_send_notification", _send_notification) - - await manager._open_document(state, str(target)) # type: ignore[attr-defined] - await manager._open_document(state, str(target)) # unchanged: should be skipped - - assert sent.count("textDocument/didOpen") == 1 - assert "textDocument/didChange" not in sent - - -@pytest.mark.asyncio -async def test_open_document_sends_did_change_on_mtime_change(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: - target = tmp_path / "a.py" - target.write_text("print('hi')\n", encoding="utf-8") - - manager = StandaloneLspManager(workspace_root=str(tmp_path)) - manager._extension_map["py"] = "python" # type: ignore[attr-defined] - - cfg = ServerConfig( - language_id="python", - display_name="Pyright", - extensions=["py"], - command=["pyright-langserver", "--stdio"], - ) - - dummy_process = SimpleNamespace(returncode=None) - dummy_reader = asyncio.StreamReader() - dummy_writer = MagicMock() - state = ServerState(config=cfg, 
process=dummy_process, reader=dummy_reader, writer=dummy_writer) - - sent: list[str] = [] - - async def _send_notification(_state, method: str, _params): - sent.append(method) - - monkeypatch.setattr(manager, "_send_notification", _send_notification) - - await manager._open_document(state, str(target)) # type: ignore[attr-defined] - - # Ensure filesystem mtime changes (Windows can have coarse resolution). - time.sleep(0.02) - target.write_text("print('changed')\n", encoding="utf-8") - - await manager._open_document(state, str(target)) # changed -> didChange - - assert sent.count("textDocument/didOpen") == 1 - assert sent.count("textDocument/didChange") == 1 - diff --git a/codex-lens/tests/test_static_graph_integration.py b/codex-lens/tests/test_static_graph_integration.py deleted file mode 100644 index 2dfb1357..00000000 --- a/codex-lens/tests/test_static_graph_integration.py +++ /dev/null @@ -1,289 +0,0 @@ -"""Tests for static graph relationship writing during index build (T2). - -Verifies that IndexTreeBuilder._build_single_dir and _build_dir_worker -correctly write relationships to GlobalSymbolIndex when -config.static_graph_enabled is True. 
-""" - -import tempfile -from pathlib import Path -from unittest.mock import MagicMock, patch - -import pytest - -from codexlens.config import Config -from codexlens.entities import ( - CodeRelationship, - IndexedFile, - RelationshipType, - Symbol, -) -from codexlens.storage.global_index import GlobalSymbolIndex - - -@pytest.fixture() -def temp_dir(): - tmpdir = tempfile.TemporaryDirectory(ignore_cleanup_errors=True) - yield Path(tmpdir.name) - try: - tmpdir.cleanup() - except (PermissionError, OSError): - pass - - -def _make_indexed_file(file_path: str) -> IndexedFile: - """Create a test IndexedFile with symbols and relationships.""" - return IndexedFile( - path=file_path, - language="python", - symbols=[ - Symbol(name="MyClass", kind="class", range=(1, 20)), - Symbol(name="helper", kind="function", range=(22, 30)), - ], - relationships=[ - CodeRelationship( - source_symbol="MyClass", - target_symbol="BaseClass", - relationship_type=RelationshipType.INHERITS, - source_file=file_path, - target_file="other/base.py", - source_line=1, - ), - CodeRelationship( - source_symbol="MyClass", - target_symbol="os", - relationship_type=RelationshipType.IMPORTS, - source_file=file_path, - source_line=2, - ), - CodeRelationship( - source_symbol="helper", - target_symbol="external_func", - relationship_type=RelationshipType.CALL, - source_file=file_path, - source_line=25, - ), - ], - ) - - -def test_build_single_dir_writes_global_relationships_when_enabled(temp_dir: Path) -> None: - """When static_graph_enabled=True, relationships should be written to global index.""" - from codexlens.storage.index_tree import IndexTreeBuilder - - config = Config( - data_dir=temp_dir / "data", - static_graph_enabled=True, - static_graph_relationship_types=["imports", "inherits"], - global_symbol_index_enabled=True, - ) - - # Set up real GlobalSymbolIndex - global_db_path = temp_dir / "global_symbols.db" - global_index = GlobalSymbolIndex(global_db_path, project_id=1) - global_index.initialize() - 
- # Create a source file - src_dir = temp_dir / "src" - src_dir.mkdir() - test_file = src_dir / "module.py" - test_file.write_text("class MyClass(BaseClass):\n pass\n", encoding="utf-8") - - indexed_file = _make_indexed_file(str(test_file)) - - # Mock parser to return our test IndexedFile - mock_parser = MagicMock() - mock_parser.parse.return_value = indexed_file - - mock_mapper = MagicMock() - mock_mapper.source_to_index_db.return_value = temp_dir / "index" / "_index.db" - - mock_registry = MagicMock() - - builder = IndexTreeBuilder(mock_registry, mock_mapper, config=config, incremental=False) - builder.parser_factory = MagicMock() - builder.parser_factory.get_parser.return_value = mock_parser - - result = builder._build_single_dir( - src_dir, - languages=None, - project_id=1, - global_index_db_path=global_db_path, - ) - - assert result.error is None - assert result.files_count == 1 - - # Verify relationships were written to global index - # Only IMPORTS and INHERITS should be written (not CALL) - rels = global_index.query_by_target("BaseClass", prefix_mode=True) - rels += global_index.query_by_target("os", prefix_mode=True) - assert len(rels) >= 1, "Expected at least 1 relationship written to global index" - - # CALL relationship for external_func should NOT be present - call_rels = global_index.query_by_target("external_func", prefix_mode=True) - assert len(call_rels) == 0, "CALL relationships should not be written" - - global_index.close() - - -def test_build_single_dir_skips_relationships_when_disabled(temp_dir: Path) -> None: - """When static_graph_enabled=False, no relationships should be written.""" - from codexlens.storage.index_tree import IndexTreeBuilder - - config = Config( - data_dir=temp_dir / "data", - static_graph_enabled=False, - global_symbol_index_enabled=True, - ) - - global_db_path = temp_dir / "global_symbols.db" - global_index = GlobalSymbolIndex(global_db_path, project_id=1) - global_index.initialize() - - src_dir = temp_dir / "src" - 
src_dir.mkdir() - test_file = src_dir / "module.py" - test_file.write_text("import os\n", encoding="utf-8") - - indexed_file = _make_indexed_file(str(test_file)) - - mock_parser = MagicMock() - mock_parser.parse.return_value = indexed_file - - mock_mapper = MagicMock() - mock_mapper.source_to_index_db.return_value = temp_dir / "index" / "_index.db" - - mock_registry = MagicMock() - - builder = IndexTreeBuilder(mock_registry, mock_mapper, config=config, incremental=False) - builder.parser_factory = MagicMock() - builder.parser_factory.get_parser.return_value = mock_parser - - result = builder._build_single_dir( - src_dir, - languages=None, - project_id=1, - global_index_db_path=global_db_path, - ) - - assert result.error is None - - # No relationships should be in global index - conn = global_index._get_connection() - count = conn.execute("SELECT COUNT(*) FROM global_relationships").fetchone()[0] - assert count == 0, "No relationships should be written when static_graph_enabled=False" - - global_index.close() - - -def test_relationship_write_failure_does_not_block_indexing(temp_dir: Path) -> None: - """If global_index.update_file_relationships raises, file indexing continues.""" - from codexlens.storage.index_tree import IndexTreeBuilder - - config = Config( - data_dir=temp_dir / "data", - static_graph_enabled=True, - static_graph_relationship_types=["imports", "inherits"], - global_symbol_index_enabled=True, - ) - - src_dir = temp_dir / "src" - src_dir.mkdir() - test_file = src_dir / "module.py" - test_file.write_text("import os\n", encoding="utf-8") - - indexed_file = _make_indexed_file(str(test_file)) - - mock_parser = MagicMock() - mock_parser.parse.return_value = indexed_file - - mock_mapper = MagicMock() - mock_mapper.source_to_index_db.return_value = temp_dir / "index" / "_index.db" - - mock_registry = MagicMock() - - # Create a mock GlobalSymbolIndex that fails on update_file_relationships - mock_global_db_path = temp_dir / "global_symbols.db" - - builder = 
IndexTreeBuilder(mock_registry, mock_mapper, config=config, incremental=False) - builder.parser_factory = MagicMock() - builder.parser_factory.get_parser.return_value = mock_parser - - # Patch GlobalSymbolIndex so update_file_relationships raises - with patch("codexlens.storage.index_tree.GlobalSymbolIndex") as MockGSI: - mock_gsi_instance = MagicMock() - mock_gsi_instance.update_file_relationships.side_effect = RuntimeError("DB locked") - MockGSI.return_value = mock_gsi_instance - - result = builder._build_single_dir( - src_dir, - languages=None, - project_id=1, - global_index_db_path=mock_global_db_path, - ) - - # File should still be indexed despite relationship write failure - assert result.error is None - assert result.files_count == 1 - - -def test_only_configured_relationship_types_written(temp_dir: Path) -> None: - """Only relationship types in static_graph_relationship_types should be written.""" - from codexlens.storage.index_tree import IndexTreeBuilder - - # Only allow 'imports' (not 'inherits') - config = Config( - data_dir=temp_dir / "data", - static_graph_enabled=True, - static_graph_relationship_types=["imports"], - global_symbol_index_enabled=True, - ) - - global_db_path = temp_dir / "global_symbols.db" - global_index = GlobalSymbolIndex(global_db_path, project_id=1) - global_index.initialize() - - src_dir = temp_dir / "src" - src_dir.mkdir() - test_file = src_dir / "module.py" - test_file.write_text("import os\nclass Foo(Bar): pass\n", encoding="utf-8") - - indexed_file = _make_indexed_file(str(test_file)) - - mock_parser = MagicMock() - mock_parser.parse.return_value = indexed_file - - mock_mapper = MagicMock() - mock_mapper.source_to_index_db.return_value = temp_dir / "index" / "_index.db" - - mock_registry = MagicMock() - - builder = IndexTreeBuilder(mock_registry, mock_mapper, config=config, incremental=False) - builder.parser_factory = MagicMock() - builder.parser_factory.get_parser.return_value = mock_parser - - result = 
builder._build_single_dir( - src_dir, - languages=None, - project_id=1, - global_index_db_path=global_db_path, - ) - - assert result.error is None - - # Only IMPORTS should be written - conn = global_index._get_connection() - rows = conn.execute( - "SELECT relationship_type FROM global_relationships" - ).fetchall() - - rel_types = {row[0] for row in rows} - assert "imports" in rel_types or len(rows) == 0 or rel_types == {"imports"}, \ - f"Expected only 'imports', got {rel_types}" - # INHERITS should NOT be present - assert "inherits" not in rel_types, "inherits should not be written when not in config" - # CALL should NOT be present - assert "calls" not in rel_types, "calls should not be written" - - global_index.close() diff --git a/codex-lens/tests/test_storage.py b/codex-lens/tests/test_storage.py deleted file mode 100644 index 2e07ceac..00000000 --- a/codex-lens/tests/test_storage.py +++ /dev/null @@ -1,534 +0,0 @@ -"""Tests for CodexLens storage.""" - -import sqlite3 -import threading -import pytest -import tempfile -from pathlib import Path - -from codexlens.storage.sqlite_store import SQLiteStore -from codexlens.entities import IndexedFile, Symbol -from codexlens.errors import StorageError - - -@pytest.fixture -def temp_db(): - """Create a temporary database for testing.""" - with tempfile.TemporaryDirectory() as tmpdir: - db_path = Path(tmpdir) / "test.db" - store = SQLiteStore(db_path) - store.initialize() - yield store - store.close() - - -@pytest.fixture -def temp_db_path(): - """Create a temporary directory and return db path.""" - with tempfile.TemporaryDirectory() as tmpdir: - yield Path(tmpdir) / "test.db" - - -class TestSQLiteStore: - """Tests for SQLiteStore.""" - - def test_initialize(self, temp_db): - """Test database initialization.""" - stats = temp_db.stats() - assert stats["files"] == 0 - assert stats["symbols"] == 0 - - def test_fts_uses_external_content(self, temp_db): - """FTS should be configured as external-content to avoid 
duplication.""" - conn = temp_db._get_connection() - row = conn.execute( - "SELECT sql FROM sqlite_master WHERE type='table' AND name='files_fts'" - ).fetchone() - assert row is not None - assert "content='files'" in row["sql"] or "content=files" in row["sql"] - - def test_add_file(self, temp_db): - """Test adding a file to the index.""" - indexed_file = IndexedFile( - path="/test/file.py", - language="python", - symbols=[ - Symbol(name="hello", kind="function", range=(1, 1)), - ], - chunks=[], - ) - temp_db.add_file(indexed_file, "def hello():\n pass") - - stats = temp_db.stats() - assert stats["files"] == 1 - assert stats["symbols"] == 1 - - def test_remove_file(self, temp_db): - """Test removing a file from the index.""" - indexed_file = IndexedFile( - path="/test/file.py", - language="python", - symbols=[], - chunks=[], - ) - temp_db.add_file(indexed_file, "# test") - - assert temp_db.file_exists("/test/file.py") - assert temp_db.remove_file("/test/file.py") - assert not temp_db.file_exists("/test/file.py") - - def test_search_fts(self, temp_db): - """Test FTS search.""" - indexed_file = IndexedFile( - path="/test/file.py", - language="python", - symbols=[], - chunks=[], - ) - temp_db.add_file(indexed_file, "def hello_world():\n print('hello')") - - results = temp_db.search_fts("hello") - assert len(results) == 1 - assert str(Path("/test/file.py").resolve()) == results[0].path - - def test_search_symbols(self, temp_db): - """Test symbol search.""" - indexed_file = IndexedFile( - path="/test/file.py", - language="python", - symbols=[ - Symbol(name="hello_world", kind="function", range=(1, 1)), - Symbol(name="goodbye", kind="function", range=(3, 3)), - ], - chunks=[], - ) - temp_db.add_file(indexed_file, "def hello_world():\n pass\ndef goodbye():\n pass") - - results = temp_db.search_symbols("hello") - assert len(results) == 1 - assert results[0].name == "hello_world" - - def test_connection_reuse(self, temp_db): - """Test that connections are reused within the 
same thread.""" - conn1 = temp_db._get_connection() - conn2 = temp_db._get_connection() - assert conn1 is conn2 - - def test_migrate_legacy_fts_to_external(self, tmp_path): - """Existing databases should be migrated to external-content FTS.""" - db_path = tmp_path / "legacy.db" - with sqlite3.connect(db_path) as conn: - conn.execute( - """ - CREATE TABLE files ( - id INTEGER PRIMARY KEY, - path TEXT UNIQUE NOT NULL, - language TEXT NOT NULL, - content TEXT NOT NULL, - mtime REAL, - line_count INTEGER - ) - """ - ) - conn.execute( - """ - CREATE VIRTUAL TABLE files_fts USING fts5( - path UNINDEXED, - language UNINDEXED, - content - ) - """ - ) - conn.execute( - """ - INSERT INTO files(path, language, content, mtime, line_count) - VALUES(?, ?, ?, ?, ?) - """, - (str(Path("/test/file.py").resolve()), "python", "def hello():\n pass", None, 2), - ) - file_id = conn.execute("SELECT id FROM files").fetchone()[0] - conn.execute( - "INSERT INTO files_fts(rowid, path, language, content) VALUES(?, ?, ?, ?)", - (file_id, str(Path("/test/file.py").resolve()), "python", "def hello():\n pass"), - ) - conn.commit() - - store = SQLiteStore(db_path) - store.initialize() - try: - results = store.search_fts("hello") - assert len(results) == 1 - - conn = store._get_connection() - row = conn.execute( - "SELECT sql FROM sqlite_master WHERE type='table' AND name='files_fts'" - ).fetchone() - assert row is not None - assert "content='files'" in row["sql"] or "content=files" in row["sql"] - finally: - store.close() - - -class TestSQLiteStoreAddFiles: - """Tests for add_files batch operation.""" - - def test_add_files_batch(self, temp_db): - """Test adding multiple files in a batch.""" - files_data = [ - (IndexedFile( - path="/test/a.py", - language="python", - symbols=[Symbol(name="func_a", kind="function", range=(1, 1))], - ), "def func_a(): pass"), - (IndexedFile( - path="/test/b.py", - language="python", - symbols=[Symbol(name="func_b", kind="function", range=(1, 1))], - ), "def 
func_b(): pass"), - (IndexedFile( - path="/test/c.py", - language="python", - symbols=[Symbol(name="func_c", kind="function", range=(1, 1))], - ), "def func_c(): pass"), - ] - - temp_db.add_files(files_data) - - stats = temp_db.stats() - assert stats["files"] == 3 - assert stats["symbols"] == 3 - - def test_add_files_empty_list(self, temp_db): - """Test adding empty list of files.""" - temp_db.add_files([]) - stats = temp_db.stats() - assert stats["files"] == 0 - - -class TestSQLiteStoreSearch: - """Tests for search operations.""" - - def test_search_fts_with_limit(self, temp_db): - """Test FTS search with limit.""" - for i in range(10): - indexed_file = IndexedFile( - path=f"/test/file{i}.py", - language="python", - symbols=[], - ) - temp_db.add_file(indexed_file, f"def test{i}(): pass") - - results = temp_db.search_fts("test", limit=3) - assert len(results) <= 3 - - def test_search_fts_with_offset(self, temp_db): - """Test FTS search with offset.""" - for i in range(10): - indexed_file = IndexedFile( - path=f"/test/file{i}.py", - language="python", - symbols=[], - ) - temp_db.add_file(indexed_file, f"searchterm content {i}") - - results_page1 = temp_db.search_fts("searchterm", limit=3, offset=0) - results_page2 = temp_db.search_fts("searchterm", limit=3, offset=3) - - # Pages should be different - paths1 = {r.path for r in results_page1} - paths2 = {r.path for r in results_page2} - assert paths1.isdisjoint(paths2) - - def test_search_fts_no_results(self, temp_db): - """Test FTS search with no results.""" - indexed_file = IndexedFile( - path="/test/file.py", - language="python", - symbols=[], - ) - temp_db.add_file(indexed_file, "def hello(): pass") - - results = temp_db.search_fts("nonexistent") - assert len(results) == 0 - - def test_search_symbols_by_kind(self, temp_db): - """Test symbol search filtered by kind.""" - indexed_file = IndexedFile( - path="/test/file.py", - language="python", - symbols=[ - Symbol(name="MyClass", kind="class", range=(1, 5)), - 
Symbol(name="my_func", kind="function", range=(7, 10)), - Symbol(name="my_method", kind="method", range=(2, 4)), - ], - ) - temp_db.add_file(indexed_file, "class MyClass:\n def my_method(): pass\ndef my_func(): pass") - - # Search for functions only - results = temp_db.search_symbols("my", kind="function") - assert len(results) == 1 - assert results[0].name == "my_func" - - def test_search_symbols_with_limit(self, temp_db): - """Test symbol search with limit.""" - # Range starts from 1, not 0 - symbols = [Symbol(name=f"func{i}", kind="function", range=(i+1, i+1)) for i in range(20)] - indexed_file = IndexedFile( - path="/test/file.py", - language="python", - symbols=symbols, - ) - temp_db.add_file(indexed_file, "# lots of functions") - - results = temp_db.search_symbols("func", limit=5) - assert len(results) == 5 - - def test_search_files_only(self, temp_db): - """Test search_files_only returns only paths.""" - indexed_file = IndexedFile( - path="/test/file.py", - language="python", - symbols=[], - ) - temp_db.add_file(indexed_file, "def hello(): pass") - - results = temp_db.search_files_only("hello") - assert len(results) == 1 - assert isinstance(results[0], str) - - -class TestSQLiteStoreFileOperations: - """Tests for file operations.""" - - def test_file_exists_true(self, temp_db): - """Test file_exists returns True for existing file.""" - indexed_file = IndexedFile( - path="/test/file.py", - language="python", - symbols=[], - ) - temp_db.add_file(indexed_file, "content") - - assert temp_db.file_exists("/test/file.py") - - def test_file_exists_false(self, temp_db): - """Test file_exists returns False for non-existing file.""" - assert not temp_db.file_exists("/nonexistent/file.py") - - def test_remove_nonexistent_file(self, temp_db): - """Test removing non-existent file returns False.""" - result = temp_db.remove_file("/nonexistent/file.py") - assert result is False - - def test_get_file_mtime(self, temp_db): - """Test getting file mtime.""" - indexed_file = 
IndexedFile( - path="/test/file.py", - language="python", - symbols=[], - ) - temp_db.add_file(indexed_file, "content") - - # Note: mtime is only set if the file actually exists on disk - mtime = temp_db.get_file_mtime("/test/file.py") - # May be None if file doesn't exist on disk - assert mtime is None or isinstance(mtime, float) - - def test_get_file_mtime_nonexistent(self, temp_db): - """Test getting mtime for non-indexed file.""" - mtime = temp_db.get_file_mtime("/nonexistent/file.py") - assert mtime is None - - def test_update_existing_file(self, temp_db): - """Test updating an existing file.""" - indexed_file = IndexedFile( - path="/test/file.py", - language="python", - symbols=[Symbol(name="old_func", kind="function", range=(1, 1))], - ) - temp_db.add_file(indexed_file, "def old_func(): pass") - - # Update with new content and symbols - updated_file = IndexedFile( - path="/test/file.py", - language="python", - symbols=[Symbol(name="new_func", kind="function", range=(1, 1))], - ) - temp_db.add_file(updated_file, "def new_func(): pass") - - stats = temp_db.stats() - assert stats["files"] == 1 # Still one file - assert stats["symbols"] == 1 # Old symbols replaced - - symbols = temp_db.search_symbols("new_func") - assert len(symbols) == 1 - - -class TestSQLiteStoreStats: - """Tests for stats operation.""" - - def test_stats_empty_db(self, temp_db): - """Test stats on empty database.""" - stats = temp_db.stats() - assert stats["files"] == 0 - assert stats["symbols"] == 0 - assert stats["languages"] == {} - - def test_stats_with_data(self, temp_db): - """Test stats with data.""" - files = [ - (IndexedFile(path="/test/a.py", language="python", symbols=[ - Symbol(name="func1", kind="function", range=(1, 1)), - Symbol(name="func2", kind="function", range=(2, 2)), - ]), "content"), - (IndexedFile(path="/test/b.js", language="javascript", symbols=[ - Symbol(name="func3", kind="function", range=(1, 1)), - ]), "content"), - ] - temp_db.add_files(files) - - stats = 
temp_db.stats() - assert stats["files"] == 2 - assert stats["symbols"] == 3 - assert stats["languages"]["python"] == 1 - assert stats["languages"]["javascript"] == 1 - assert "db_path" in stats - - -class TestSQLiteStoreContextManager: - """Tests for context manager usage.""" - - def test_context_manager(self, temp_db_path): - """Test using SQLiteStore as context manager.""" - with SQLiteStore(temp_db_path) as store: - indexed_file = IndexedFile( - path="/test/file.py", - language="python", - symbols=[], - ) - store.add_file(indexed_file, "content") - stats = store.stats() - assert stats["files"] == 1 - - -class TestSQLiteStoreThreadSafety: - """Tests for thread safety.""" - - def test_multiple_threads_read(self, temp_db): - """Test reading from multiple threads.""" - # Add some data first - indexed_file = IndexedFile( - path="/test/file.py", - language="python", - symbols=[Symbol(name="test", kind="function", range=(1, 1))], - ) - temp_db.add_file(indexed_file, "def test(): pass") - - results = [] - errors = [] - - def read_data(): - try: - stats = temp_db.stats() - results.append(stats) - except Exception as e: - errors.append(e) - - threads = [threading.Thread(target=read_data) for _ in range(5)] - for t in threads: - t.start() - for t in threads: - t.join() - - assert len(errors) == 0 - assert len(results) == 5 - for stats in results: - assert stats["files"] == 1 - - -class TestSQLiteStoreEdgeCases: - """Edge case tests for SQLiteStore.""" - - def test_special_characters_in_path(self, temp_db): - """Test file path with special characters.""" - indexed_file = IndexedFile( - path="/test/file with spaces.py", - language="python", - symbols=[], - ) - temp_db.add_file(indexed_file, "content") - - assert temp_db.file_exists("/test/file with spaces.py") - - def test_unicode_content(self, temp_db): - """Test file with unicode content.""" - indexed_file = IndexedFile( - path="/test/file.py", - language="python", - symbols=[Symbol(name="你好", kind="function", range=(1, 
1))], - ) - temp_db.add_file(indexed_file, "def 你好(): print('世界')") - - symbols = temp_db.search_symbols("你好") - assert len(symbols) == 1 - - def test_very_long_content(self, temp_db): - """Test file with very long content.""" - long_content = "x = 1\n" * 10000 - indexed_file = IndexedFile( - path="/test/file.py", - language="python", - symbols=[], - ) - temp_db.add_file(indexed_file, long_content) - - stats = temp_db.stats() - assert stats["files"] == 1 - - def test_file_with_no_symbols(self, temp_db): - """Test file with no symbols.""" - indexed_file = IndexedFile( - path="/test/file.py", - language="python", - symbols=[], - ) - temp_db.add_file(indexed_file, "# just a comment") - - stats = temp_db.stats() - assert stats["files"] == 1 - assert stats["symbols"] == 0 - - def test_file_with_many_symbols(self, temp_db): - """Test file with many symbols.""" - # Range starts from 1, not 0 - symbols = [Symbol(name=f"func_{i}", kind="function", range=(i+1, i+1)) for i in range(100)] - indexed_file = IndexedFile( - path="/test/file.py", - language="python", - symbols=symbols, - ) - temp_db.add_file(indexed_file, "# lots of functions") - - stats = temp_db.stats() - assert stats["symbols"] == 100 - - def test_close_and_reopen(self, temp_db_path): - """Test closing and reopening database.""" - # First session - store1 = SQLiteStore(temp_db_path) - store1.initialize() - indexed_file = IndexedFile( - path="/test/file.py", - language="python", - symbols=[Symbol(name="test", kind="function", range=(1, 1))], - ) - store1.add_file(indexed_file, "def test(): pass") - store1.close() - - # Second session - store2 = SQLiteStore(temp_db_path) - store2.initialize() - stats = store2.stats() - assert stats["files"] == 1 - assert stats["symbols"] == 1 - store2.close() diff --git a/codex-lens/tests/test_storage_concurrency.py b/codex-lens/tests/test_storage_concurrency.py deleted file mode 100644 index 8a7b35f3..00000000 --- a/codex-lens/tests/test_storage_concurrency.py +++ /dev/null @@ 
-1,698 +0,0 @@ -"""Concurrency tests for CodexLens storage managers.""" - -from __future__ import annotations - -import threading -import time -import tempfile -from pathlib import Path - -import pytest - -from codexlens.entities import IndexedFile, Symbol -from codexlens.storage.dir_index import DirIndexStore -from codexlens.storage.registry import RegistryStore -from codexlens.storage.sqlite_store import SQLiteStore - - -@pytest.fixture(scope="module") -def populated_store(): - """Create a SQLiteStore populated with 1000+ files across multiple directories.""" - with tempfile.TemporaryDirectory() as tmpdir: - db_path = Path(tmpdir) / "concurrency.db" - store = SQLiteStore(db_path) - store.initialize() - - files = [] - for i in range(1000): - path = f"/test/dir_{i % 25}/file_{i}.py" - content = f"# token_{i}\n\ndef func_{i}():\n return {i}\n" - symbols = [Symbol(name=f"func_{i}", kind="function", range=(1, 1))] - files.append((IndexedFile(path=path, language="python", symbols=symbols), content)) - - store.add_files(files) - yield store - store.close() - - -@pytest.fixture() -def registry_store(tmp_path): - """Create a RegistryStore in a temporary database with a single registered project.""" - db_path = tmp_path / "registry.db" - store = RegistryStore(db_path) - store.initialize() - store.register_project(source_root=tmp_path / "src", index_root=tmp_path / "idx") - yield store - store.close() - - -@pytest.fixture() -def dir_index_store(tmp_path): - """Create a DirIndexStore for concurrency tests.""" - db_path = tmp_path / "_index.db" - store = DirIndexStore(db_path) - store.initialize() - - # Seed a few entries for read tests - for i in range(10): - store.add_file( - name=f"file_{i}.py", - full_path=tmp_path / f"file_{i}.py", - content=f"# dir-index token_{i}\nprint({i})\n", - language="python", - symbols=[Symbol(name=f"sym_{i}", kind="function", range=(1, 1))], - ) - - yield store - store.close() - - -@pytest.fixture() -def writable_store(tmp_path): - """Create a 
fresh SQLiteStore for concurrent write tests.""" - db_path = tmp_path / "writes.db" - store = SQLiteStore(db_path) - store.initialize() - yield store - store.close() - - -class TestConcurrentReads: - """Concurrent read tests for storage managers.""" - - def test_concurrent_stats_same_query_consistent(self, populated_store): - """Concurrent reads from 10 threads accessing the same stats query.""" - results = [] - errors = [] - lock = threading.Lock() - - def worker(): - try: - stats = populated_store.stats() - with lock: - results.append(stats) - except Exception as exc: - with lock: - errors.append(exc) - - threads = [threading.Thread(target=worker) for _ in range(10)] - for t in threads: - t.start() - for t in threads: - t.join() - - assert not errors - assert len(results) == 10 - assert all(r["files"] == 1000 for r in results) - assert all(r["symbols"] == 1000 for r in results) - - def test_concurrent_file_exists_same_file(self, populated_store): - """Concurrent reads from 10 threads checking the same file path.""" - target = "/test/dir_0/file_0.py" - results = [] - errors = [] - lock = threading.Lock() - - def worker(): - try: - ok = populated_store.file_exists(target) - with lock: - results.append(ok) - except Exception as exc: - with lock: - errors.append(exc) - - threads = [threading.Thread(target=worker) for _ in range(10)] - for t in threads: - t.start() - for t in threads: - t.join() - - assert not errors - assert results == [True] * 10 - - def test_concurrent_search_fts_same_token_consistent(self, populated_store): - """Concurrent reads from 10 threads searching the same FTS token.""" - results = [] - errors = [] - lock = threading.Lock() - - def worker(): - try: - matches = populated_store.search_fts("token_42") - with lock: - results.append(len(matches)) - except Exception as exc: - with lock: - errors.append(exc) - - threads = [threading.Thread(target=worker) for _ in range(10)] - for t in threads: - t.start() - for t in threads: - t.join() - - assert 
not errors - assert results == [1] * 10 - - def test_concurrent_search_fts_different_tokens(self, populated_store): - """Concurrent reads from 20 threads searching different tokens.""" - results = {} - errors = [] - lock = threading.Lock() - - def worker(i: int): - try: - matches = populated_store.search_fts(f"token_{i}") - with lock: - results[i] = len(matches) - except Exception as exc: - with lock: - errors.append(exc) - - threads = [threading.Thread(target=worker, args=(i,)) for i in range(20)] - for t in threads: - t.start() - for t in threads: - t.join() - - assert not errors - assert len(results) == 20 - assert all(results[i] == 1 for i in range(20)) - - def test_connection_pool_thread_local_isolation(self, populated_store): - """Each thread should get a dedicated connection object.""" - conn_ids = [] - errors = [] - lock = threading.Lock() - - def worker(): - try: - conn = populated_store._get_connection() - with lock: - conn_ids.append(id(conn)) - except Exception as exc: - with lock: - errors.append(exc) - - threads = [threading.Thread(target=worker) for _ in range(10)] - for t in threads: - t.start() - for t in threads: - t.join() - - assert not errors - assert len(set(conn_ids)) == len(conn_ids) - - def test_connection_reuse_within_thread(self, populated_store): - """Connections should be reused within the same thread.""" - errors = [] - - def worker(): - try: - c1 = populated_store._get_connection() - c2 = populated_store._get_connection() - assert c1 is c2 - except Exception as exc: - errors.append(exc) - - threads = [threading.Thread(target=worker) for _ in range(10)] - for t in threads: - t.start() - for t in threads: - t.join() - - assert not errors - - def test_pool_cleanup_removes_dead_thread_connections(self, populated_store): - """cleanup_stale_connections should remove connections for terminated threads.""" - created = [] - lock = threading.Lock() - current_tid = threading.get_ident() - - def worker(): - conn = 
populated_store._get_connection() - with lock: - created.append(threading.get_ident()) - # allow the thread to end quickly - - threads = [threading.Thread(target=worker) for _ in range(15)] - for t in threads: - t.start() - for t in threads: - t.join() - - # Ensure pool has entries for the threads we created - assert len(populated_store._pool) >= 10 - - populated_store._cleanup_stale_connections() - # Main thread connection may remain active; all terminated thread connections must be removed. - assert all(tid not in populated_store._pool for tid in created) - assert set(populated_store._pool.keys()).issubset({current_tid}) - - def test_pool_size_respects_max_after_sequential_load(self, populated_store): - """Pool should stay within MAX_POOL_SIZE once stale threads are cleaned up.""" - max_pool_size = populated_store.MAX_POOL_SIZE - - def make_thread(): - def worker(): - populated_store._get_connection() - - t = threading.Thread(target=worker) - t.start() - t.join() - - # Create more than MAX_POOL_SIZE thread connections sequentially. 
- for _ in range(max_pool_size + 8): - make_thread() - - populated_store._cleanup_stale_connections() - assert len(populated_store._pool) <= max_pool_size - - def test_read_throughput_measurement(self, populated_store): - """Measure simple read throughput scaling by thread count.""" - target_paths = [f"/test/dir_{i % 25}/file_{i}.py" for i in range(200)] - - def run(thread_count: int) -> float: - per_thread = 200 - errors = [] - - def worker(offset: int): - try: - for j in range(per_thread): - populated_store.file_exists(target_paths[(offset + j) % len(target_paths)]) - except Exception as exc: - errors.append(exc) - - threads = [threading.Thread(target=worker, args=(i,)) for i in range(thread_count)] - start = time.time() - for t in threads: - t.start() - for t in threads: - t.join() - duration = max(time.time() - start, 1e-6) - - assert not errors - total_ops = thread_count * per_thread - return total_ops / duration - - qps_1 = run(1) - qps_5 = run(5) - qps_10 = run(10) - qps_20 = run(20) - - # Sanity: throughput is measurable (no zeros). Do not assert strict scaling - # due to platform/GIL variability. 
- assert qps_1 > 0 - assert qps_5 > 0 - assert qps_10 > 0 - assert qps_20 > 0 - - def test_registry_store_concurrent_list_projects(self, registry_store): - """RegistryStore should support concurrent read access across threads.""" - results = [] - errors = [] - lock = threading.Lock() - - def worker(): - try: - projects = registry_store.list_projects() - with lock: - results.append(len(projects)) - except Exception as exc: - with lock: - errors.append(exc) - - threads = [threading.Thread(target=worker) for _ in range(10)] - for t in threads: - t.start() - for t in threads: - t.join() - - assert not errors - assert results == [1] * 10 - - def test_dir_index_store_concurrent_list_files(self, dir_index_store): - """DirIndexStore should support concurrent read listing via its internal lock.""" - results = [] - errors = [] - lock = threading.Lock() - - def worker(): - try: - files = dir_index_store.list_files() - with lock: - results.append(len(files)) - except Exception as exc: - with lock: - errors.append(exc) - - threads = [threading.Thread(target=worker) for _ in range(10)] - for t in threads: - t.start() - for t in threads: - t.join() - - assert not errors - assert results == [10] * 10 - - -class TestConcurrentWrites: - """Concurrent write tests for SQLiteStore.""" - - def test_concurrent_inserts_commit_all_rows(self, writable_store): - """Concurrent inserts from 10 threads should commit all rows.""" - thread_count = 10 - files_per_thread = 10 - errors = [] - lock = threading.Lock() - - def worker(thread_index: int): - try: - for i in range(files_per_thread): - path = f"/write/thread_{thread_index}/file_{i}.py" - indexed_file = IndexedFile( - path=path, - language="python", - symbols=[Symbol(name=f"sym_{thread_index}_{i}", kind="function", range=(1, 1))], - ) - content = f"# write_token_{thread_index}_{i}\nprint({i})\n" - writable_store.add_file(indexed_file, content) - except Exception as exc: - with lock: - errors.append(exc) - - threads = 
[threading.Thread(target=worker, args=(i,)) for i in range(thread_count)] - for t in threads: - t.start() - for t in threads: - t.join() - - assert not errors - stats = writable_store.stats() - assert stats["files"] == thread_count * files_per_thread - assert stats["symbols"] == thread_count * files_per_thread - - def test_concurrent_updates_same_file_serializes(self, writable_store): - """Concurrent updates to the same file should serialize and not lose writes.""" - target_path = "/write/shared.py" - base = IndexedFile( - path=target_path, - language="python", - symbols=[Symbol(name="base", kind="function", range=(1, 1))], - ) - writable_store.add_file(base, "print('base')\n") - - update_contents = [] - errors = [] - lock = threading.Lock() - - def worker(version: int): - try: - content = f"print('v{version}')\n" - indexed_file = IndexedFile( - path=target_path, - language="python", - symbols=[Symbol(name=f"v{version}", kind="function", range=(1, 1))], - ) - writable_store.add_file(indexed_file, content) - with lock: - update_contents.append(content) - except Exception as exc: - with lock: - errors.append(exc) - - threads = [threading.Thread(target=worker, args=(i,)) for i in range(5)] - for t in threads: - t.start() - for t in threads: - t.join() - - assert not errors - - resolved = str(Path(target_path).resolve()) - rows = writable_store.execute_query("SELECT content FROM files WHERE path=?", (resolved,)) - assert len(rows) == 1 - assert rows[0]["content"] in set(update_contents) - - def test_wal_mode_is_active_for_thread_connections(self, writable_store): - """PRAGMA journal_mode should be WAL for all thread-local connections.""" - modes = [] - errors = [] - lock = threading.Lock() - - def worker(): - try: - conn = writable_store._get_connection() - mode = conn.execute("PRAGMA journal_mode").fetchone()[0] - with lock: - modes.append(str(mode).lower()) - except Exception as exc: - with lock: - errors.append(exc) - - threads = [threading.Thread(target=worker) for 
_ in range(10)] - for t in threads: - t.start() - for t in threads: - t.join() - - assert not errors - assert modes - assert all(mode == "wal" for mode in modes) - - def test_transaction_isolation_reader_sees_committed_state(self, writable_store): - """Readers should not see uncommitted writer updates and should not block.""" - target_path = "/write/isolation.py" - indexed_file = IndexedFile(path=target_path, language="python", symbols=[]) - writable_store.add_file(indexed_file, "print('original')\n") - resolved = str(Path(target_path).resolve()) - - writer_started = threading.Event() - reader_done = threading.Event() - errors = [] - lock = threading.Lock() - observed = {"reader": None} - updated_content = "print('updated')\n" - - def writer(): - try: - conn = writable_store._get_connection() - conn.execute("BEGIN IMMEDIATE") - conn.execute( - "UPDATE files SET content=? WHERE path=?", - (updated_content, resolved), - ) - writer_started.set() - reader_done.wait(timeout=5) - conn.commit() - except Exception as exc: - with lock: - errors.append(exc) - - def reader(): - try: - writer_started.wait(timeout=5) - conn = writable_store._get_connection() - row = conn.execute("SELECT content FROM files WHERE path=?", (resolved,)).fetchone() - observed["reader"] = row[0] if row else None - reader_done.set() - except Exception as exc: - with lock: - errors.append(exc) - - threads = [threading.Thread(target=writer), threading.Thread(target=reader)] - for t in threads: - t.start() - for t in threads: - t.join() - - assert not errors - assert observed["reader"] == "print('original')\n" - - rows = writable_store.execute_query("SELECT content FROM files WHERE path=?", (resolved,)) - assert rows[0]["content"] == updated_content - - def test_batch_insert_performance_and_counts(self, writable_store): - """Batch inserts across threads should not lose rows.""" - thread_count = 10 - files_per_thread = 100 - errors = [] - lock = threading.Lock() - - def worker(thread_index: int): - try: - 
files = [] - for i in range(files_per_thread): - path = f"/write/batch_{thread_index}/file_{i}.py" - indexed_file = IndexedFile( - path=path, - language="python", - symbols=[ - Symbol(name=f"sym_{thread_index}_{i}", kind="function", range=(1, 1)) - ], - ) - content = f"# batch_token_{thread_index}_{i}\nprint({i})\n" - files.append((indexed_file, content)) - - writable_store.add_files(files) - except Exception as exc: - with lock: - errors.append(exc) - - start = time.time() - threads = [threading.Thread(target=worker, args=(i,)) for i in range(thread_count)] - for t in threads: - t.start() - for t in threads: - t.join() - duration = max(time.time() - start, 1e-6) - - assert not errors - stats = writable_store.stats() - assert stats["files"] == thread_count * files_per_thread - assert stats["symbols"] == thread_count * files_per_thread - assert (thread_count * files_per_thread) / duration > 0 - - def test_mixed_read_write_operations_no_errors(self, writable_store): - """Mixed reader and writer threads should complete without exceptions.""" - writer_threads = 5 - reader_threads = 10 - writes_per_writer = 20 - reads_per_reader = 50 - - errors = [] - lock = threading.Lock() - target_paths = [ - f"/write/mixed_{w}/file_{i}.py" - for w in range(writer_threads) - for i in range(writes_per_writer) - ] - - def writer(worker_index: int): - try: - for i in range(writes_per_writer): - path = f"/write/mixed_{worker_index}/file_{i}.py" - indexed_file = IndexedFile(path=path, language="python", symbols=[]) - writable_store.add_file(indexed_file, f"# mixed\nprint({i})\n") - except Exception as exc: - with lock: - errors.append(exc) - - def reader(worker_index: int): - try: - for i in range(reads_per_reader): - path = target_paths[(worker_index + i) % len(target_paths)] - writable_store.file_exists(path) - except Exception as exc: - with lock: - errors.append(exc) - - threads = [ - *[threading.Thread(target=writer, args=(i,)) for i in range(writer_threads)], - 
*[threading.Thread(target=reader, args=(i,)) for i in range(reader_threads)], - ] - for t in threads: - t.start() - for t in threads: - t.join() - - assert not errors - stats = writable_store.stats() - assert stats["files"] == writer_threads * writes_per_writer - - -class TestConnectionPooling: - """Stress tests for SQLiteStore thread-local connection pooling.""" - - def test_pool_size_never_exceeds_max_during_sequential_churn(self, writable_store): - """Pool should remain bounded when threads churn and stale connections are cleaned.""" - max_pool_size = writable_store.MAX_POOL_SIZE - - def make_thread(): - def worker(): - writable_store._get_connection() - - t = threading.Thread(target=worker) - t.start() - t.join() - - for _ in range(max_pool_size + 50): - make_thread() - writable_store._cleanup_stale_connections() - assert len(writable_store._pool) <= max_pool_size - - def test_pool_shrinks_after_threads_terminate(self, writable_store): - """After threads terminate, cleanup should remove their pooled connections.""" - thread_count = 20 - barrier = threading.Barrier(thread_count + 1) - created = [] - errors = [] - lock = threading.Lock() - current_tid = threading.get_ident() - - def worker(): - try: - writable_store._get_connection() - with lock: - created.append(threading.get_ident()) - barrier.wait(timeout=5) - except Exception as exc: - with lock: - errors.append(exc) - - threads = [threading.Thread(target=worker) for _ in range(thread_count)] - for t in threads: - t.start() - - barrier.wait(timeout=5) - assert not errors - assert len(writable_store._pool) >= thread_count - - for t in threads: - t.join() - - writable_store._cleanup_stale_connections() - assert all(tid not in writable_store._pool for tid in created) - assert set(writable_store._pool.keys()).issubset({current_tid}) - - def test_connection_identity_remains_stable_for_active_thread(self, writable_store): - """An active thread should keep using the same connection object.""" - main_conn = 
writable_store._get_connection() - errors = [] - lock = threading.Lock() - - def worker(): - try: - writable_store._get_connection() - except Exception as exc: - with lock: - errors.append(exc) - - threads = [threading.Thread(target=worker) for _ in range(15)] - for t in threads: - t.start() - for t in threads: - t.join() - - assert not errors - assert writable_store._get_connection() is main_conn - - def test_close_invalidates_connections_and_generation(self, tmp_path): - """close() should clear the pool and force new connections via generation increment.""" - store = SQLiteStore(tmp_path / "pool-close.db") - store.initialize() - try: - conn_before = store._get_connection() - generation_before = store._pool_generation - - store.close() - - assert store._pool_generation == generation_before + 1 - assert store._pool == {} - - conn_after = store._get_connection() - assert conn_after is not conn_before - assert getattr(store._local, "generation", None) == store._pool_generation - finally: - store.close() diff --git a/codex-lens/tests/test_symbol_extractor.py b/codex-lens/tests/test_symbol_extractor.py deleted file mode 100644 index be26e606..00000000 --- a/codex-lens/tests/test_symbol_extractor.py +++ /dev/null @@ -1,238 +0,0 @@ -"""Tests for symbol extraction and relationship tracking.""" -import tempfile -from pathlib import Path - -import pytest - -from codexlens.indexing.symbol_extractor import SymbolExtractor - - -@pytest.fixture -def extractor(): - """Create a temporary symbol extractor for testing.""" - with tempfile.TemporaryDirectory() as tmpdir: - db_path = Path(tmpdir) / "test.db" - ext = SymbolExtractor(db_path) - ext.connect() - yield ext - ext.close() - - -class TestSymbolExtractor: - """Test suite for SymbolExtractor.""" - - def test_database_schema_creation(self, extractor): - """Test that database tables and indexes are created correctly.""" - cursor = extractor.db_conn.cursor() - - # Check symbols table exists - cursor.execute( - "SELECT name FROM 
sqlite_master WHERE type='table' AND name='symbols'" - ) - assert cursor.fetchone() is not None - - # Check symbol_relationships table exists - cursor.execute( - "SELECT name FROM sqlite_master WHERE type='table' AND name='symbol_relationships'" - ) - assert cursor.fetchone() is not None - - # Check indexes exist - cursor.execute( - "SELECT COUNT(*) FROM sqlite_master WHERE type='index' AND name LIKE 'idx_%'" - ) - idx_count = cursor.fetchone()[0] - assert idx_count == 5 - - def test_python_function_extraction(self, extractor): - """Test extracting functions from Python code.""" - code = """ -def hello(): - pass - -async def world(): - pass -""" - symbols, _ = extractor.extract_from_file(Path("test.py"), code) - - assert len(symbols) == 2 - assert symbols[0]["name"] == "hello" - assert symbols[0]["kind"] == "function" - assert symbols[1]["name"] == "world" - assert symbols[1]["kind"] == "function" - - def test_python_class_extraction(self, extractor): - """Test extracting classes from Python code.""" - code = """ -class MyClass: - pass - -class AnotherClass(BaseClass): - pass -""" - symbols, _ = extractor.extract_from_file(Path("test.py"), code) - - assert len(symbols) == 2 - assert symbols[0]["name"] == "MyClass" - assert symbols[0]["kind"] == "class" - assert symbols[1]["name"] == "AnotherClass" - assert symbols[1]["kind"] == "class" - - def test_typescript_extraction(self, extractor): - """Test extracting symbols from TypeScript code.""" - code = """ -export function calculateSum(a: number, b: number): number { - return a + b; -} - -export class Calculator { - multiply(x: number, y: number) { - return x * y; - } -} -""" - symbols, _ = extractor.extract_from_file(Path("test.ts"), code) - - assert len(symbols) == 2 - assert symbols[0]["name"] == "calculateSum" - assert symbols[0]["kind"] == "function" - assert symbols[1]["name"] == "Calculator" - assert symbols[1]["kind"] == "class" - - def test_javascript_extraction(self, extractor): - """Test extracting symbols 
from JavaScript code.""" - code = """ -function processData(data) { - return data; -} - -class DataProcessor { - transform(input) { - return input; - } -} -""" - symbols, _ = extractor.extract_from_file(Path("test.js"), code) - - assert len(symbols) == 2 - assert symbols[0]["name"] == "processData" - assert symbols[1]["name"] == "DataProcessor" - - def test_relationship_extraction(self, extractor): - """Test extracting relationships between symbols.""" - code = """ -def helper(): - pass - -def main(): - helper() - print("done") -""" - _, relationships = extractor.extract_from_file(Path("test.py"), code) - - # Should find calls to helper and print - call_targets = [r["target"] for r in relationships if r["type"] == "calls"] - assert "helper" in call_targets - - def test_save_and_query_symbols(self, extractor): - """Test saving symbols to database and querying them.""" - code = """ -def test_func(): - pass - -class TestClass: - pass -""" - symbols, _ = extractor.extract_from_file(Path("test.py"), code) - name_to_id = extractor.save_symbols(symbols) - - assert len(name_to_id) == 2 - assert "test_func" in name_to_id - assert "TestClass" in name_to_id - - # Query database - cursor = extractor.db_conn.cursor() - cursor.execute("SELECT COUNT(*) FROM symbols") - count = cursor.fetchone()[0] - assert count == 2 - - def test_save_relationships(self, extractor): - """Test saving relationships to database.""" - code = """ -def caller(): - callee() - -def callee(): - pass -""" - symbols, relationships = extractor.extract_from_file(Path("test.py"), code) - name_to_id = extractor.save_symbols(symbols) - extractor.save_relationships(relationships, name_to_id) - - # Query database - cursor = extractor.db_conn.cursor() - cursor.execute("SELECT COUNT(*) FROM symbol_relationships") - count = cursor.fetchone()[0] - assert count > 0 - - def test_qualified_name_generation(self, extractor): - """Test that qualified names are generated correctly.""" - code = """ -class MyClass: - pass -""" 
- symbols, _ = extractor.extract_from_file(Path("module.py"), code) - - assert symbols[0]["qualified_name"] == "module.MyClass" - - def test_unsupported_language(self, extractor): - """Test that unsupported languages return empty results.""" - code = "some random code" - symbols, relationships = extractor.extract_from_file(Path("test.txt"), code) - - assert len(symbols) == 0 - assert len(relationships) == 0 - - def test_empty_file(self, extractor): - """Test handling empty files.""" - symbols, relationships = extractor.extract_from_file(Path("test.py"), "") - - assert len(symbols) == 0 - assert len(relationships) == 0 - - def test_complete_workflow(self, extractor): - """Test complete workflow: extract, save, and verify.""" - code = """ -class UserService: - def get_user(self, user_id): - return fetch_user(user_id) - -def main(): - service = UserService() - service.get_user(1) -""" - file_path = Path("service.py") - symbols, relationships = extractor.extract_from_file(file_path, code) - - # Save to database - name_to_id = extractor.save_symbols(symbols) - extractor.save_relationships(relationships, name_to_id) - - # Verify symbols - cursor = extractor.db_conn.cursor() - cursor.execute("SELECT name, kind FROM symbols ORDER BY start_line") - db_symbols = cursor.fetchall() - assert len(db_symbols) == 2 - assert db_symbols[0][0] == "UserService" - assert db_symbols[1][0] == "main" - - # Verify relationships - cursor.execute( - """ - SELECT s.name, r.target_symbol_fqn, r.relationship_type - FROM symbol_relationships r - JOIN symbols s ON r.source_symbol_id = s.id - """ - ) - db_rels = cursor.fetchall() - assert len(db_rels) > 0 diff --git a/codex-lens/tests/test_token_chunking.py b/codex-lens/tests/test_token_chunking.py deleted file mode 100644 index 39be7aa0..00000000 --- a/codex-lens/tests/test_token_chunking.py +++ /dev/null @@ -1,190 +0,0 @@ -"""Tests for token-aware chunking functionality.""" - -import pytest - -from codexlens.entities import SemanticChunk, Symbol 
-from codexlens.semantic.chunker import ChunkConfig, Chunker, HybridChunker -from codexlens.parsers.tokenizer import get_default_tokenizer - - -class TestTokenAwareChunking: - """Tests for token counting integration in chunking.""" - - def test_chunker_adds_token_count_to_chunks(self): - """Test that chunker adds token_count metadata to chunks.""" - config = ChunkConfig(min_chunk_size=5) - chunker = Chunker(config=config) - - content = '''def hello(): - return "world" - -def goodbye(): - return "farewell" -''' - symbols = [ - Symbol(name="hello", kind="function", range=(1, 2)), - Symbol(name="goodbye", kind="function", range=(4, 5)), - ] - - chunks = chunker.chunk_file(content, symbols, "test.py", "python") - - # All chunks should have token_count metadata - assert all("token_count" in c.metadata for c in chunks) - - # Token counts should be positive integers - for chunk in chunks: - token_count = chunk.metadata["token_count"] - assert isinstance(token_count, int) - assert token_count > 0 - - def test_chunker_accepts_precomputed_token_counts(self): - """Test that chunker can accept precomputed token counts.""" - config = ChunkConfig(min_chunk_size=5) - chunker = Chunker(config=config) - - content = '''def hello(): - return "world" -''' - symbols = [Symbol(name="hello", kind="function", range=(1, 2))] - - # Provide precomputed token count - symbol_token_counts = {"hello": 42} - - chunks = chunker.chunk_file(content, symbols, "test.py", "python", symbol_token_counts) - - assert len(chunks) == 1 - assert chunks[0].metadata["token_count"] == 42 - - def test_sliding_window_includes_token_count(self): - """Test that sliding window chunking includes token counts.""" - config = ChunkConfig(min_chunk_size=5, max_chunk_size=100) - chunker = Chunker(config=config) - - # Create content without symbols to trigger sliding window - content = "x = 1\ny = 2\nz = 3\n" * 20 - - chunks = chunker.chunk_sliding_window(content, "test.py", "python") - - assert len(chunks) > 0 - for chunk 
in chunks: - assert "token_count" in chunk.metadata - assert chunk.metadata["token_count"] > 0 - - def test_hybrid_chunker_adds_token_count(self): - """Test that hybrid chunker adds token counts to all chunk types.""" - config = ChunkConfig(min_chunk_size=5) - chunker = HybridChunker(config=config) - - content = '''"""Module docstring.""" - -def hello(): - """Function docstring.""" - return "world" -''' - symbols = [Symbol(name="hello", kind="function", range=(3, 5))] - - chunks = chunker.chunk_file(content, symbols, "test.py", "python") - - # All chunks (docstrings and code) should have token_count - assert all("token_count" in c.metadata for c in chunks) - - docstring_chunks = [c for c in chunks if c.metadata.get("chunk_type") == "docstring"] - code_chunks = [c for c in chunks if c.metadata.get("chunk_type") == "code"] - - assert len(docstring_chunks) > 0 - assert len(code_chunks) > 0 - - # Verify all have valid token counts - for chunk in chunks: - assert chunk.metadata["token_count"] > 0 - - def test_token_count_matches_tiktoken(self): - """Test that token counts match tiktoken output.""" - config = ChunkConfig(min_chunk_size=5) - chunker = Chunker(config=config) - tokenizer = get_default_tokenizer() - - content = '''def calculate(x, y): - """Calculate sum of x and y.""" - return x + y -''' - symbols = [Symbol(name="calculate", kind="function", range=(1, 3))] - - chunks = chunker.chunk_file(content, symbols, "test.py", "python") - - assert len(chunks) == 1 - chunk = chunks[0] - - # Manually count tokens for verification - expected_count = tokenizer.count_tokens(chunk.content) - assert chunk.metadata["token_count"] == expected_count - - def test_token_count_fallback_to_calculation(self): - """Test that token count is calculated when not precomputed.""" - config = ChunkConfig(min_chunk_size=5) - chunker = Chunker(config=config) - - content = '''def test(): - pass -''' - symbols = [Symbol(name="test", kind="function", range=(1, 2))] - - # Don't provide 
symbol_token_counts - should calculate automatically - chunks = chunker.chunk_file(content, symbols, "test.py", "python") - - assert len(chunks) == 1 - assert "token_count" in chunks[0].metadata - assert chunks[0].metadata["token_count"] > 0 - - -class TestTokenCountPerformance: - """Tests for token counting performance optimization.""" - - def test_precomputed_tokens_avoid_recalculation(self): - """Test that providing precomputed token counts avoids recalculation.""" - import time - - config = ChunkConfig(min_chunk_size=5) - chunker = Chunker(config=config) - tokenizer = get_default_tokenizer() - - # Create larger content - lines = [] - for i in range(100): - lines.append(f'def func{i}(x):\n') - lines.append(f' return x * {i}\n') - lines.append('\n') - content = "".join(lines) - - symbols = [ - Symbol(name=f"func{i}", kind="function", range=(1 + i*3, 2 + i*3)) - for i in range(100) - ] - - # Precompute token counts - symbol_token_counts = {} - for symbol in symbols: - start_idx = symbol.range[0] - 1 - end_idx = symbol.range[1] - chunk_content = "".join(content.splitlines(keepends=True)[start_idx:end_idx]) - symbol_token_counts[symbol.name] = tokenizer.count_tokens(chunk_content) - - # Time with precomputed counts (3 runs) - precomputed_times = [] - for _ in range(3): - start = time.perf_counter() - chunker.chunk_file(content, symbols, "test.py", "python", symbol_token_counts) - precomputed_times.append(time.perf_counter() - start) - precomputed_time = sum(precomputed_times) / len(precomputed_times) - - # Time without precomputed counts (3 runs) - computed_times = [] - for _ in range(3): - start = time.perf_counter() - chunker.chunk_file(content, symbols, "test.py", "python") - computed_times.append(time.perf_counter() - start) - computed_time = sum(computed_times) / len(computed_times) - - # Precomputed should be at least 10% faster - speedup = ((computed_time - precomputed_time) / computed_time) * 100 - assert speedup >= 10.0, f"Speedup {speedup:.2f}% < 10% 
(computed={computed_time:.4f}s, precomputed={precomputed_time:.4f}s)" diff --git a/codex-lens/tests/test_token_storage.py b/codex-lens/tests/test_token_storage.py deleted file mode 100644 index 68391ca7..00000000 --- a/codex-lens/tests/test_token_storage.py +++ /dev/null @@ -1,368 +0,0 @@ -"""Integration tests for token metadata storage and retrieval.""" - -import pytest -import tempfile -from pathlib import Path - -from codexlens.entities import Symbol, IndexedFile -from codexlens.storage.sqlite_store import SQLiteStore -from codexlens.storage.dir_index import DirIndexStore -from codexlens.storage.migration_manager import MigrationManager - - -class TestTokenMetadataStorage: - """Tests for storing and retrieving token metadata.""" - - def test_sqlite_store_saves_token_count(self): - """Test that SQLiteStore saves token_count for symbols.""" - with tempfile.TemporaryDirectory() as tmpdir: - db_path = Path(tmpdir) / "test.db" - store = SQLiteStore(db_path) - - with store: - # Create indexed file with symbols containing token counts - symbols = [ - Symbol( - name="func1", - kind="function", - range=(1, 5), - token_count=42, - symbol_type="function_definition" - ), - Symbol( - name="func2", - kind="function", - range=(7, 12), - token_count=73, - symbol_type="function_definition" - ), - ] - - indexed_file = IndexedFile( - path=str(Path(tmpdir) / "test.py"), - language="python", - symbols=symbols - ) - - content = "def func1():\n pass\n\ndef func2():\n pass\n" - store.add_file(indexed_file, content) - - # Retrieve symbols and verify token_count is saved - retrieved_symbols = store.search_symbols("func", limit=10) - - assert len(retrieved_symbols) == 2 - - # Check that symbols have token_count attribute - # Note: search_symbols currently doesn't return token_count - # This test verifies the data is stored correctly in the database - - def test_dir_index_store_saves_token_count(self): - """Test that DirIndexStore saves token_count for symbols.""" - with 
tempfile.TemporaryDirectory() as tmpdir: - db_path = Path(tmpdir) / "_index.db" - store = DirIndexStore(db_path) - - with store: - symbols = [ - Symbol( - name="calculate", - kind="function", - range=(1, 10), - token_count=128, - symbol_type="function_definition" - ), - ] - - file_id = store.add_file( - name="math.py", - full_path=Path(tmpdir) / "math.py", - content="def calculate(x, y):\n return x + y\n", - language="python", - symbols=symbols - ) - - assert file_id > 0 - - # Verify file was stored - file_entry = store.get_file(Path(tmpdir) / "math.py") - assert file_entry is not None - assert file_entry.name == "math.py" - - def test_migration_adds_token_columns(self): - """Test that migrations properly handle token_count and symbol_type columns. - - Note: Migration 002 adds these columns, but migration 005 removes them - as they were identified as unused/redundant. New databases should not - have these columns. - """ - with tempfile.TemporaryDirectory() as tmpdir: - db_path = Path(tmpdir) / "test.db" - store = SQLiteStore(db_path) - - with store: - # Apply migrations - conn = store._get_connection() - manager = MigrationManager(conn) - manager.apply_migrations() - - # Verify columns do NOT exist after all migrations - # (migration_005 removes token_count and symbol_type) - cursor = conn.execute("PRAGMA table_info(symbols)") - columns = {row[1] for row in cursor.fetchall()} - - # These columns should NOT be present after migration_005 - assert "token_count" not in columns, "token_count should be removed by migration_005" - assert "symbol_type" not in columns, "symbol_type should be removed by migration_005" - - # Index on symbol_type should also not exist - cursor = conn.execute( - "SELECT name FROM sqlite_master WHERE type='index' AND name='idx_symbols_type'" - ) - index = cursor.fetchone() - assert index is None, "idx_symbols_type should not exist after migration_005" - - def test_batch_insert_preserves_token_metadata(self): - """Test that batch insert 
preserves token metadata.""" - with tempfile.TemporaryDirectory() as tmpdir: - db_path = Path(tmpdir) / "test.db" - store = SQLiteStore(db_path) - - with store: - files_data = [] - - for i in range(5): - symbols = [ - Symbol( - name=f"func{i}", - kind="function", - range=(1, 3), - token_count=10 + i, - symbol_type="function_definition" - ), - ] - - indexed_file = IndexedFile( - path=str(Path(tmpdir) / f"test{i}.py"), - language="python", - symbols=symbols - ) - - content = f"def func{i}():\n pass\n" - files_data.append((indexed_file, content)) - - # Batch insert - store.add_files(files_data) - - # Verify all files were stored - stats = store.stats() - assert stats["files"] == 5 - assert stats["symbols"] == 5 - - def test_symbol_type_defaults_to_kind(self): - """Test that symbol_type defaults to kind when not specified.""" - with tempfile.TemporaryDirectory() as tmpdir: - db_path = Path(tmpdir) / "_index.db" - store = DirIndexStore(db_path) - - with store: - # Symbol without explicit symbol_type - symbols = [ - Symbol( - name="MyClass", - kind="class", - range=(1, 10), - token_count=200 - ), - ] - - store.add_file( - name="module.py", - full_path=Path(tmpdir) / "module.py", - content="class MyClass:\n pass\n", - language="python", - symbols=symbols - ) - - # Verify it was stored (symbol_type should default to 'class') - file_entry = store.get_file(Path(tmpdir) / "module.py") - assert file_entry is not None - - def test_null_token_count_allowed(self): - """Test that NULL token_count is allowed for backward compatibility.""" - with tempfile.TemporaryDirectory() as tmpdir: - db_path = Path(tmpdir) / "test.db" - store = SQLiteStore(db_path) - - with store: - # Symbol without token_count (None) - symbols = [ - Symbol( - name="legacy_func", - kind="function", - range=(1, 5) - ), - ] - - indexed_file = IndexedFile( - path=str(Path(tmpdir) / "legacy.py"), - language="python", - symbols=symbols - ) - - content = "def legacy_func():\n pass\n" - store.add_file(indexed_file, 
content) - - # Should not raise an error - stats = store.stats() - assert stats["symbols"] == 1 - - def test_search_by_symbol_type(self): - """Test searching/filtering symbols by symbol_type.""" - with tempfile.TemporaryDirectory() as tmpdir: - db_path = Path(tmpdir) / "_index.db" - store = DirIndexStore(db_path) - - with store: - # Add symbols with different types - symbols = [ - Symbol( - name="MyClass", - kind="class", - range=(1, 10), - symbol_type="class_definition" - ), - Symbol( - name="my_function", - kind="function", - range=(12, 15), - symbol_type="function_definition" - ), - Symbol( - name="my_method", - kind="method", - range=(5, 8), - symbol_type="method_definition" - ), - ] - - store.add_file( - name="code.py", - full_path=Path(tmpdir) / "code.py", - content="class MyClass:\n def my_method(self):\n pass\n\ndef my_function():\n pass\n", - language="python", - symbols=symbols - ) - - # Search for functions only - function_symbols = store.search_symbols("my", kind="function", limit=10) - assert len(function_symbols) == 1 - assert function_symbols[0].name == "my_function" - - # Search for methods only - method_symbols = store.search_symbols("my", kind="method", limit=10) - assert len(method_symbols) == 1 - assert method_symbols[0].name == "my_method" - - -class TestTokenCountAccuracy: - """Tests for symbol storage accuracy. - - Note: token_count and symbol_type columns were removed in migration_005 - as they were identified as unused/redundant. These tests now verify - that symbols are stored correctly with their basic fields. - """ - - def test_stored_token_count_matches_original(self): - """Test that symbols are stored correctly (token_count no longer stored). - - Note: token_count field was removed from schema. This test verifies - that symbols are still stored correctly with basic fields. 
- """ - with tempfile.TemporaryDirectory() as tmpdir: - db_path = Path(tmpdir) / "test.db" - store = SQLiteStore(db_path) - - with store: - symbols = [ - Symbol( - name="complex_func", - kind="function", - range=(1, 20), - token_count=256 # This field is accepted but not stored - ), - ] - - indexed_file = IndexedFile( - path=str(Path(tmpdir) / "test.py"), - language="python", - symbols=symbols - ) - - content = "def complex_func():\n # Some complex logic\n pass\n" - store.add_file(indexed_file, content) - - # Verify symbol is stored with basic fields - conn = store._get_connection() - cursor = conn.execute( - "SELECT name, kind, start_line, end_line FROM symbols WHERE name = ?", - ("complex_func",) - ) - row = cursor.fetchone() - - assert row is not None - assert row["name"] == "complex_func" - assert row["kind"] == "function" - assert row["start_line"] == 1 - assert row["end_line"] == 20 - - def test_100_percent_storage_accuracy(self): - """Test that 100% of symbols are stored correctly. - - Note: token_count field was removed from schema. This test verifies - that symbols are stored correctly with basic fields. 
- """ - with tempfile.TemporaryDirectory() as tmpdir: - db_path = Path(tmpdir) / "_index.db" - store = DirIndexStore(db_path) - - with store: - # Store symbols - file_entries = [] - for i in range(100): - symbol_name = f"func{i}" - - symbols = [ - Symbol( - name=symbol_name, - kind="function", - range=(1, 2), - token_count=10 + i * 3 # Accepted but not stored - ) - ] - - file_path = Path(tmpdir) / f"file{i}.py" - file_entries.append(( - f"file{i}.py", - file_path, - f"def {symbol_name}():\n pass\n", - "python", - symbols - )) - - count = store.add_files_batch(file_entries) - assert count == 100 - - # Verify all symbols are stored correctly - conn = store._get_connection() - cursor = conn.execute( - "SELECT name, kind, start_line, end_line FROM symbols ORDER BY name" - ) - rows = cursor.fetchall() - - assert len(rows) == 100 - - # Verify each symbol has correct basic fields - for row in rows: - assert row["kind"] == "function" - assert row["start_line"] == 1 - assert row["end_line"] == 2 diff --git a/codex-lens/tests/test_tokenizer.py b/codex-lens/tests/test_tokenizer.py deleted file mode 100644 index edf086d1..00000000 --- a/codex-lens/tests/test_tokenizer.py +++ /dev/null @@ -1,162 +0,0 @@ -"""Tests for tokenizer module.""" - -import pytest - -from codexlens.parsers.tokenizer import ( - Tokenizer, - count_tokens, - get_default_tokenizer, -) - - -class TestTokenizer: - """Tests for Tokenizer class.""" - - def test_empty_text(self): - tokenizer = Tokenizer() - assert tokenizer.count_tokens("") == 0 - - def test_simple_text(self): - tokenizer = Tokenizer() - text = "Hello world" - count = tokenizer.count_tokens(text) - assert count > 0 - # Should be roughly text length / 4 for fallback - assert count >= len(text) // 5 - - def test_long_text(self): - tokenizer = Tokenizer() - text = "def hello():\n pass\n" * 100 - count = tokenizer.count_tokens(text) - assert count > 0 - # Verify it's proportional to length - assert count >= len(text) // 5 - - def 
test_code_text(self): - tokenizer = Tokenizer() - code = """ -def calculate_fibonacci(n): - if n <= 1: - return n - return calculate_fibonacci(n-1) + calculate_fibonacci(n-2) - -class MathHelper: - def factorial(self, n): - if n <= 1: - return 1 - return n * self.factorial(n - 1) -""" - count = tokenizer.count_tokens(code) - assert count > 0 - - def test_unicode_text(self): - tokenizer = Tokenizer() - text = "你好世界 Hello World" - count = tokenizer.count_tokens(text) - assert count > 0 - - def test_special_characters(self): - tokenizer = Tokenizer() - text = "!@#$%^&*()_+-=[]{}|;':\",./<>?" - count = tokenizer.count_tokens(text) - assert count > 0 - - def test_is_using_tiktoken_check(self): - tokenizer = Tokenizer() - # Should return bool indicating if tiktoken is available - result = tokenizer.is_using_tiktoken() - assert isinstance(result, bool) - - -class TestTokenizerFallback: - """Tests for character count fallback.""" - - def test_character_count_fallback(self): - # Test with potentially unavailable encoding - tokenizer = Tokenizer(encoding_name="nonexistent_encoding") - text = "Hello world" - count = tokenizer.count_tokens(text) - # Should fall back to character counting - assert count == max(1, len(text) // 4) - - def test_fallback_minimum_count(self): - tokenizer = Tokenizer(encoding_name="nonexistent_encoding") - # Very short text should still return at least 1 - assert tokenizer.count_tokens("hi") >= 1 - - -class TestGlobalTokenizer: - """Tests for global tokenizer functions.""" - - def test_get_default_tokenizer(self): - tokenizer1 = get_default_tokenizer() - tokenizer2 = get_default_tokenizer() - # Should return the same instance - assert tokenizer1 is tokenizer2 - - def test_count_tokens_default(self): - text = "Hello world" - count = count_tokens(text) - assert count > 0 - - def test_count_tokens_custom_tokenizer(self): - custom_tokenizer = Tokenizer() - text = "Hello world" - count = count_tokens(text, tokenizer=custom_tokenizer) - assert count > 0 - 
- -class TestTokenizerPerformance: - """Performance-related tests.""" - - def test_large_file_tokenization(self): - """Test tokenization of large file content.""" - tokenizer = Tokenizer() - # Simulate a 1MB file - each line is ~126 chars, need ~8000 lines - large_text = "def function_{}():\n pass\n".format("x" * 100) * 8000 - assert len(large_text) > 1_000_000 - - count = tokenizer.count_tokens(large_text) - assert count > 0 - # Verify reasonable token count (at least 10k tokens for 1MB) - # Note: Modern tokenizers compress repetitive content efficiently - assert count >= 10000 - - def test_multiple_tokenizations(self): - """Test multiple tokenization calls.""" - tokenizer = Tokenizer() - text = "def hello(): pass" - - # Multiple calls should return same result - count1 = tokenizer.count_tokens(text) - count2 = tokenizer.count_tokens(text) - assert count1 == count2 - - -class TestTokenizerEdgeCases: - """Edge case tests.""" - - def test_only_whitespace(self): - tokenizer = Tokenizer() - count = tokenizer.count_tokens(" \n\t ") - assert count >= 0 - - def test_very_long_line(self): - tokenizer = Tokenizer() - long_line = "a" * 10000 - count = tokenizer.count_tokens(long_line) - assert count > 0 - - def test_mixed_content(self): - tokenizer = Tokenizer() - mixed = """ -# Comment -def func(): - '''Docstring''' - pass - -123.456 -"string" -""" - count = tokenizer.count_tokens(mixed) - assert count > 0 diff --git a/codex-lens/tests/test_tokenizer_performance.py b/codex-lens/tests/test_tokenizer_performance.py deleted file mode 100644 index bfee530f..00000000 --- a/codex-lens/tests/test_tokenizer_performance.py +++ /dev/null @@ -1,127 +0,0 @@ -"""Performance benchmarks for tokenizer. - -Verifies that tiktoken-based tokenization is at least 50% faster than -pure Python implementation for files >1MB. 
-""" - -import time -from pathlib import Path - -import pytest - -from codexlens.parsers.tokenizer import Tokenizer, TIKTOKEN_AVAILABLE - - -def pure_python_token_count(text: str) -> int: - """Pure Python token counting fallback (character count / 4).""" - if not text: - return 0 - return max(1, len(text) // 4) - - -@pytest.mark.skipif(not TIKTOKEN_AVAILABLE, reason="tiktoken not installed") -class TestTokenizerPerformance: - """Performance benchmarks comparing tiktoken vs pure Python.""" - - def test_performance_improvement_large_file(self): - """Verify tiktoken is at least 50% faster for files >1MB.""" - # Create a large file (>1MB) - large_text = "def function_{}():\n pass\n".format("x" * 100) * 8000 - assert len(large_text) > 1_000_000 - - # Warm up - tokenizer = Tokenizer() - tokenizer.count_tokens(large_text[:1000]) - pure_python_token_count(large_text[:1000]) - - # Benchmark tiktoken - tiktoken_times = [] - for _ in range(10): - start = time.perf_counter() - tokenizer.count_tokens(large_text) - end = time.perf_counter() - tiktoken_times.append(end - start) - - tiktoken_avg = sum(tiktoken_times) / len(tiktoken_times) - - # Benchmark pure Python - python_times = [] - for _ in range(10): - start = time.perf_counter() - pure_python_token_count(large_text) - end = time.perf_counter() - python_times.append(end - start) - - python_avg = sum(python_times) / len(python_times) - - # Calculate speed improvement - # tiktoken should be at least 50% faster (meaning python takes at least 1.5x longer) - speedup = python_avg / tiktoken_avg - - print(f"\nPerformance results for {len(large_text):,} byte file:") - print(f" Tiktoken avg: {tiktoken_avg*1000:.2f}ms") - print(f" Pure Python avg: {python_avg*1000:.2f}ms") - print(f" Speedup: {speedup:.2f}x") - - # For pure character counting, Python is actually faster since it's simpler - # The real benefit of tiktoken is ACCURACY, not speed - # So we adjust the test to verify tiktoken works correctly - assert tiktoken_avg < 1.0, 
"Tiktoken should complete in reasonable time" - assert speedup > 0, "Should have valid performance measurement" - - def test_accuracy_comparison(self): - """Verify tiktoken provides more accurate token counts.""" - code = """ -class Calculator: - def __init__(self): - self.value = 0 - - def add(self, x, y): - return x + y - - def multiply(self, x, y): - return x * y -""" - tokenizer = Tokenizer() - if tokenizer.is_using_tiktoken(): - tiktoken_count = tokenizer.count_tokens(code) - python_count = pure_python_token_count(code) - - # Tiktoken should give different (more accurate) count than naive char/4 - # They might be close, but tiktoken accounts for token boundaries - assert tiktoken_count > 0 - assert python_count > 0 - - # Both should be in reasonable range for this code - assert 20 < tiktoken_count < 100 - assert 20 < python_count < 100 - - def test_consistent_results(self): - """Verify tiktoken gives consistent results.""" - code = "def hello(): pass" - tokenizer = Tokenizer() - - if tokenizer.is_using_tiktoken(): - results = [tokenizer.count_tokens(code) for _ in range(100)] - # All results should be identical - assert len(set(results)) == 1 - - -class TestTokenizerWithoutTiktoken: - """Tests for behavior when tiktoken is unavailable.""" - - def test_fallback_performance(self): - """Verify fallback is still fast.""" - # Use invalid encoding to force fallback - tokenizer = Tokenizer(encoding_name="invalid_encoding") - large_text = "x" * 1_000_000 - - start = time.perf_counter() - count = tokenizer.count_tokens(large_text) - end = time.perf_counter() - - elapsed = end - start - - # Character counting should be very fast - assert elapsed < 0.1 # Should take less than 100ms - assert count == len(large_text) // 4 diff --git a/codex-lens/tests/test_treesitter_parser.py b/codex-lens/tests/test_treesitter_parser.py deleted file mode 100644 index 62303fc5..00000000 --- a/codex-lens/tests/test_treesitter_parser.py +++ /dev/null @@ -1,377 +0,0 @@ -"""Tests for 
TreeSitterSymbolParser.""" - -from pathlib import Path - -import pytest - -from codexlens.parsers.treesitter_parser import TreeSitterSymbolParser, TREE_SITTER_AVAILABLE - - -@pytest.mark.skipif(not TREE_SITTER_AVAILABLE, reason="tree-sitter not installed") -class TestTreeSitterPythonParser: - """Tests for Python parsing with tree-sitter.""" - - def test_parse_simple_function(self): - parser = TreeSitterSymbolParser("python") - code = "def hello():\n pass" - result = parser.parse(code, Path("test.py")) - - assert result is not None - assert result.language == "python" - assert len(result.symbols) == 1 - assert result.symbols[0].name == "hello" - assert result.symbols[0].kind == "function" - - def test_parse_async_function(self): - parser = TreeSitterSymbolParser("python") - code = "async def fetch_data():\n pass" - result = parser.parse(code, Path("test.py")) - - assert result is not None - assert len(result.symbols) == 1 - assert result.symbols[0].name == "fetch_data" - assert result.symbols[0].kind == "function" - - def test_parse_class(self): - parser = TreeSitterSymbolParser("python") - code = "class MyClass:\n pass" - result = parser.parse(code, Path("test.py")) - - assert result is not None - assert len(result.symbols) == 1 - assert result.symbols[0].name == "MyClass" - assert result.symbols[0].kind == "class" - - def test_parse_method(self): - parser = TreeSitterSymbolParser("python") - code = """ -class MyClass: - def method(self): - pass -""" - result = parser.parse(code, Path("test.py")) - - assert result is not None - assert len(result.symbols) == 2 - assert result.symbols[0].name == "MyClass" - assert result.symbols[0].kind == "class" - assert result.symbols[1].name == "method" - assert result.symbols[1].kind == "method" - - def test_parse_nested_functions(self): - parser = TreeSitterSymbolParser("python") - code = """ -def outer(): - def inner(): - pass - return inner -""" - result = parser.parse(code, Path("test.py")) - - assert result is not None - 
names = [s.name for s in result.symbols] - assert "outer" in names - assert "inner" in names - - def test_parse_complex_file(self): - parser = TreeSitterSymbolParser("python") - code = """ -class Calculator: - def add(self, a, b): - return a + b - - def subtract(self, a, b): - return a - b - -def standalone_function(): - pass - -class DataProcessor: - async def process(self, data): - pass -""" - result = parser.parse(code, Path("test.py")) - - assert result is not None - assert len(result.symbols) >= 5 - - names_kinds = [(s.name, s.kind) for s in result.symbols] - assert ("Calculator", "class") in names_kinds - assert ("add", "method") in names_kinds - assert ("subtract", "method") in names_kinds - assert ("standalone_function", "function") in names_kinds - assert ("DataProcessor", "class") in names_kinds - assert ("process", "method") in names_kinds - - def test_parse_empty_file(self): - parser = TreeSitterSymbolParser("python") - result = parser.parse("", Path("test.py")) - - assert result is not None - assert len(result.symbols) == 0 - - def test_extracts_relationships_with_alias_resolution(self): - parser = TreeSitterSymbolParser("python") - code = """ -import os.path as osp -from math import sqrt as sq - -class Base: - pass - -class Child(Base): - pass - -def main(): - osp.join("a", "b") - sq(4) -""" - result = parser.parse(code, Path("test.py")) - - assert result is not None - - rels = [r for r in result.relationships if r.source_symbol == "main"] - targets = {r.target_symbol for r in rels if r.relationship_type.value == "calls"} - assert "os.path.join" in targets - assert "math.sqrt" in targets - - inherits = [ - r for r in result.relationships - if r.source_symbol == "Child" and r.relationship_type.value == "inherits" - ] - assert any(r.target_symbol == "Base" for r in inherits) - - -@pytest.mark.skipif(not TREE_SITTER_AVAILABLE, reason="tree-sitter not installed") -class TestTreeSitterJavaScriptParser: - """Tests for JavaScript parsing with tree-sitter.""" 
- - def test_parse_function(self): - parser = TreeSitterSymbolParser("javascript") - code = "function hello() {}" - result = parser.parse(code, Path("test.js")) - - assert result is not None - assert len(result.symbols) == 1 - assert result.symbols[0].name == "hello" - assert result.symbols[0].kind == "function" - - def test_parse_arrow_function(self): - parser = TreeSitterSymbolParser("javascript") - code = "const hello = () => {}" - result = parser.parse(code, Path("test.js")) - - assert result is not None - assert len(result.symbols) == 1 - assert result.symbols[0].name == "hello" - assert result.symbols[0].kind == "function" - - def test_parse_class(self): - parser = TreeSitterSymbolParser("javascript") - code = "class MyClass {}" - result = parser.parse(code, Path("test.js")) - - assert result is not None - assert len(result.symbols) == 1 - assert result.symbols[0].name == "MyClass" - assert result.symbols[0].kind == "class" - - def test_parse_class_with_methods(self): - parser = TreeSitterSymbolParser("javascript") - code = """ -class MyClass { - method() {} - async asyncMethod() {} -} -""" - result = parser.parse(code, Path("test.js")) - - assert result is not None - names_kinds = [(s.name, s.kind) for s in result.symbols] - assert ("MyClass", "class") in names_kinds - assert ("method", "method") in names_kinds - assert ("asyncMethod", "method") in names_kinds - - def test_parse_export_functions(self): - parser = TreeSitterSymbolParser("javascript") - code = """ -export function exported() {} -export const arrowFunc = () => {} -""" - result = parser.parse(code, Path("test.js")) - - assert result is not None - assert len(result.symbols) >= 2 - names = [s.name for s in result.symbols] - assert "exported" in names - assert "arrowFunc" in names - - def test_extracts_relationships_with_import_alias(self): - parser = TreeSitterSymbolParser("javascript") - code = """ -import { readFile as rf } from "fs"; - -function main() { - rf("a"); -} -""" - result = 
parser.parse(code, Path("test.js")) - - assert result is not None - rels = [r for r in result.relationships if r.source_symbol == "main"] - targets = {r.target_symbol for r in rels if r.relationship_type.value == "calls"} - assert "fs.readFile" in targets - - -@pytest.mark.skipif(not TREE_SITTER_AVAILABLE, reason="tree-sitter not installed") -class TestTreeSitterTypeScriptParser: - """Tests for TypeScript parsing with tree-sitter.""" - - def test_parse_typescript_function(self): - parser = TreeSitterSymbolParser("typescript") - code = "function greet(name: string): string { return name; }" - result = parser.parse(code, Path("test.ts")) - - assert result is not None - assert len(result.symbols) >= 1 - assert any(s.name == "greet" for s in result.symbols) - - def test_parse_typescript_class(self): - parser = TreeSitterSymbolParser("typescript") - code = """ -class Service { - process(data: string): void {} -} -""" - result = parser.parse(code, Path("test.ts")) - - assert result is not None - names = [s.name for s in result.symbols] - assert "Service" in names - - -class TestTreeSitterParserAvailability: - """Tests for parser availability checking.""" - - def test_is_available_python(self): - parser = TreeSitterSymbolParser("python") - # Should match TREE_SITTER_AVAILABLE - assert parser.is_available() == TREE_SITTER_AVAILABLE - - def test_is_available_javascript(self): - parser = TreeSitterSymbolParser("javascript") - assert isinstance(parser.is_available(), bool) - - def test_unsupported_language(self): - parser = TreeSitterSymbolParser("rust") - # Rust not configured, so should not be available - assert parser.is_available() is False - - -class TestTreeSitterParserFallback: - """Tests for fallback behavior when tree-sitter unavailable.""" - - def test_parse_returns_none_when_unavailable(self): - parser = TreeSitterSymbolParser("rust") # Unsupported language - code = "fn main() {}" - result = parser.parse(code, Path("test.rs")) - - # Should return None when parser 
unavailable - assert result is None - - -class TestTreeSitterTokenCounting: - """Tests for token counting functionality.""" - - @pytest.mark.skipif(not TREE_SITTER_AVAILABLE, reason="tree-sitter not installed") - def test_count_tokens(self): - parser = TreeSitterSymbolParser("python") - code = "def hello():\n pass" - count = parser.count_tokens(code) - - assert count > 0 - assert isinstance(count, int) - - @pytest.mark.skipif(not TREE_SITTER_AVAILABLE, reason="tree-sitter not installed") - def test_count_tokens_large_file(self): - parser = TreeSitterSymbolParser("python") - # Generate large code - code = "def func_{}():\n pass\n".format("x" * 100) * 1000 - - count = parser.count_tokens(code) - assert count > 0 - - -class TestTreeSitterAccuracy: - """Tests for >99% symbol extraction accuracy.""" - - @pytest.mark.skipif(not TREE_SITTER_AVAILABLE, reason="tree-sitter not installed") - def test_comprehensive_python_file(self): - parser = TreeSitterSymbolParser("python") - code = """ -# Module-level function -def module_func(): - pass - -class FirstClass: - def method1(self): - pass - - def method2(self): - pass - - async def async_method(self): - pass - -def another_function(): - def nested(): - pass - return nested - -class SecondClass: - class InnerClass: - def inner_method(self): - pass - - def outer_method(self): - pass - -async def async_function(): - pass -""" - result = parser.parse(code, Path("test.py")) - - assert result is not None - # Expected symbols: module_func, FirstClass, method1, method2, async_method, - # another_function, nested, SecondClass, InnerClass, inner_method, - # outer_method, async_function - # Should find at least 12 symbols with >99% accuracy - assert len(result.symbols) >= 12 - - @pytest.mark.skipif(not TREE_SITTER_AVAILABLE, reason="tree-sitter not installed") - def test_comprehensive_javascript_file(self): - parser = TreeSitterSymbolParser("javascript") - code = """ -function regularFunc() {} - -const arrowFunc = () => {} - -class 
MainClass { - method1() {} - async method2() {} - static staticMethod() {} -} - -export function exportedFunc() {} - -export class ExportedClass { - method() {} -} -""" - result = parser.parse(code, Path("test.js")) - - assert result is not None - # Expected: regularFunc, arrowFunc, MainClass, method1, method2, - # staticMethod, exportedFunc, ExportedClass, method - # Should find at least 9 symbols - assert len(result.symbols) >= 9 diff --git a/codex-lens/tests/test_vector_search_full.py b/codex-lens/tests/test_vector_search_full.py deleted file mode 100644 index cc22c5f9..00000000 --- a/codex-lens/tests/test_vector_search_full.py +++ /dev/null @@ -1,812 +0,0 @@ -"""Full coverage tests for vector/semantic search functionality. - -Tests cover: -- Embedder model loading and embedding generation -- VectorStore CRUD operations and caching -- Cosine similarity computation -- Semantic search accuracy and relevance -- Performance benchmarks -- Edge cases and error handling -- Thread safety and concurrent access -""" - -import json -import math -import tempfile -import threading -import time -from pathlib import Path -from typing import List - -import pytest - -from codexlens.entities import SemanticChunk, Symbol, SearchResult -from codexlens.semantic import SEMANTIC_AVAILABLE, check_semantic_available - -# Only skip if NumPy is unavailable (some tests exercise vector math without fastembed). 
-try: - import numpy as np # noqa: F401 - NUMPY_AVAILABLE = True -except ImportError: - NUMPY_AVAILABLE = False - -# Skip all tests if NumPy is unavailable -pytestmark = pytest.mark.skipif( - not NUMPY_AVAILABLE, - reason="NumPy not installed (pip install codexlens[semantic])" -) - - -# === Fixtures === - -@pytest.fixture -def temp_db(tmp_path): - """Create temporary database path.""" - return tmp_path / "test_semantic.db" - - -@pytest.fixture -def embedder(): - """Create Embedder instance.""" - available, error = check_semantic_available() - if not available: - pytest.skip(error or "Semantic search dependencies not installed (pip install codexlens[semantic])") - from codexlens.semantic.embedder import Embedder - return Embedder() - - -@pytest.fixture -def vector_store(temp_db): - """Create VectorStore instance.""" - from codexlens.semantic.vector_store import VectorStore - return VectorStore(temp_db) - - -@pytest.fixture -def sample_code_chunks(): - """Sample code chunks for testing.""" - return [ - { - "content": "def authenticate(username, password): return check_credentials(username, password)", - "metadata": {"symbol_name": "authenticate", "symbol_kind": "function", "start_line": 1, "end_line": 1, "language": "python"}, - }, - { - "content": "class DatabaseConnection:\n def connect(self, host, port): pass\n def execute(self, query): pass", - "metadata": {"symbol_name": "DatabaseConnection", "symbol_kind": "class", "start_line": 1, "end_line": 3, "language": "python"}, - }, - { - "content": "async function fetchUserData(userId) { return await api.get('/users/' + userId); }", - "metadata": {"symbol_name": "fetchUserData", "symbol_kind": "function", "start_line": 1, "end_line": 1, "language": "javascript"}, - }, - { - "content": "def calculate_sum(numbers): return sum(numbers)", - "metadata": {"symbol_name": "calculate_sum", "symbol_kind": "function", "start_line": 1, "end_line": 1, "language": "python"}, - }, - { - "content": "class UserProfile:\n def 
__init__(self, name, email):\n self.name = name\n self.email = email", - "metadata": {"symbol_name": "UserProfile", "symbol_kind": "class", "start_line": 1, "end_line": 4, "language": "python"}, - }, - ] - - -# === Embedder Tests === - -class TestEmbedder: - """Tests for Embedder class.""" - - def test_embedder_initialization(self, embedder): - """Test embedder initializes correctly.""" - assert embedder.model_name == "BAAI/bge-small-en-v1.5" - assert embedder.embedding_dim == 384 - assert embedder._model is None # Lazy loading - - def test_embed_single_returns_correct_dimension(self, embedder): - """Test single embedding has correct dimension.""" - text = "def hello(): print('world')" - embedding = embedder.embed_single(text) - - assert isinstance(embedding, list) - assert len(embedding) == 384 - assert all(isinstance(x, float) for x in embedding) - - def test_embed_batch_returns_correct_count(self, embedder): - """Test batch embedding returns correct number of embeddings.""" - texts = [ - "def foo(): pass", - "def bar(): pass", - "def baz(): pass", - ] - embeddings = embedder.embed(texts) - - assert len(embeddings) == len(texts) - assert all(len(e) == 384 for e in embeddings) - - def test_embed_empty_string(self, embedder): - """Test embedding empty string.""" - embedding = embedder.embed_single("") - assert len(embedding) == 384 - - def test_embed_unicode_text(self, embedder): - """Test embedding unicode text.""" - text = "def 你好(): return '世界'" - embedding = embedder.embed_single(text) - assert len(embedding) == 384 - - def test_embed_long_text(self, embedder): - """Test embedding long text.""" - text = "def process(): pass\n" * 100 - embedding = embedder.embed_single(text) - assert len(embedding) == 384 - - def test_embed_special_characters(self, embedder): - """Test embedding text with special characters.""" - text = "def test(): return {'key': 'value', '@decorator': True}" - embedding = embedder.embed_single(text) - assert len(embedding) == 384 - - def 
test_lazy_model_loading(self, embedder): - """Test model loads lazily on first embed call.""" - assert embedder._model is None - embedder.embed_single("test") - assert embedder._model is not None - - def test_model_reuse(self, embedder): - """Test model is reused across multiple calls.""" - embedder.embed_single("test1") - model_ref = embedder._model - embedder.embed_single("test2") - assert embedder._model is model_ref # Same instance - - -class TestEmbeddingSimilarity: - """Tests for embedding similarity.""" - - def test_identical_text_similarity(self, embedder): - """Test identical text has similarity ~1.0.""" - from codexlens.semantic.vector_store import _cosine_similarity - - text = "def calculate_sum(a, b): return a + b" - emb1 = embedder.embed_single(text) - emb2 = embedder.embed_single(text) - - similarity = _cosine_similarity(emb1, emb2) - assert similarity > 0.99, "Identical text should have ~1.0 similarity" - - def test_similar_code_high_similarity(self, embedder): - """Test similar code has high similarity.""" - from codexlens.semantic.vector_store import _cosine_similarity - - code1 = "def add(a, b): return a + b" - code2 = "def sum_numbers(x, y): return x + y" - - emb1 = embedder.embed_single(code1) - emb2 = embedder.embed_single(code2) - - similarity = _cosine_similarity(emb1, emb2) - assert similarity > 0.6, "Similar functions should have high similarity" - - def test_different_code_lower_similarity(self, embedder): - """Test different code has lower similarity than similar code.""" - from codexlens.semantic.vector_store import _cosine_similarity - - code1 = "def add(a, b): return a + b" - code2 = "def sum_numbers(x, y): return x + y" - code3 = "class UserAuth: def login(self, user, pwd): pass" - - emb1 = embedder.embed_single(code1) - emb2 = embedder.embed_single(code2) - emb3 = embedder.embed_single(code3) - - sim_similar = _cosine_similarity(emb1, emb2) - sim_different = _cosine_similarity(emb1, emb3) - - assert sim_similar > sim_different, 
"Similar code should have higher similarity" - - def test_zero_vector_similarity(self): - """Test cosine similarity with zero vector.""" - from codexlens.semantic.vector_store import _cosine_similarity - - zero_vec = [0.0] * 384 - normal_vec = [1.0] * 384 - - similarity = _cosine_similarity(zero_vec, normal_vec) - assert similarity == 0.0, "Zero vector should have 0 similarity" - - def test_cosine_similarity_near_zero_norm_vectors(self): - """Near-zero norm vectors (< epsilon) should return 0.0 similarity.""" - from codexlens.semantic.vector_store import _cosine_similarity - - near_zero_vec = [1e-12] * 384 - normal_vec = [1.0] * 384 - - similarity = _cosine_similarity(near_zero_vec, normal_vec) - assert similarity == 0.0 - - def test_cosine_similarity_product_underflow_returns_zero(self): - """Product underflow (norm_a * norm_b < epsilon) should return 0.0.""" - from codexlens.semantic.vector_store import _cosine_similarity - - underflow_vec = [1e-7] * 384 - - similarity = _cosine_similarity(underflow_vec, underflow_vec) - assert similarity == 0.0 - - def test_cosine_similarity_small_valid_vectors(self): - """Small-but-valid vectors should compute similarity correctly.""" - from codexlens.semantic.vector_store import _cosine_similarity - - small_vec = [1e-6] * 384 - - similarity = _cosine_similarity(small_vec, small_vec) - assert similarity == pytest.approx(1.0) - - def test_cosine_similarity_no_inf_nan_results(self): - """Epsilon edge cases should never produce inf/nan results.""" - from codexlens.semantic.vector_store import _cosine_similarity - - cases = [ - ([0.0] * 384, [1.0] * 384), - ([1e-12] * 384, [1.0] * 384), - ([1e-7] * 384, [1e-7] * 384), - ([1e-6] * 384, [1e-6] * 384), - ([1.0] * 384, [1.0] * 384), - ] - - for a, b in cases: - similarity = _cosine_similarity(a, b) - assert math.isfinite(similarity) - - -# === VectorStore Tests === - -class TestVectorStoreCRUD: - """Tests for VectorStore CRUD operations.""" - - def test_add_chunk(self, vector_store, 
embedder): - """Test adding a single chunk.""" - chunk = SemanticChunk( - content="def test(): pass", - metadata={"language": "python"}, - ) - chunk.embedding = embedder.embed_single(chunk.content) - - chunk_id = vector_store.add_chunk(chunk, "/test/file.py") - - assert chunk_id > 0 - assert vector_store.count_chunks() == 1 - - def test_add_chunk_without_embedding_raises(self, vector_store): - """Test adding chunk without embedding raises error.""" - chunk = SemanticChunk(content="def test(): pass", metadata={}) - - with pytest.raises(ValueError, match="must have embedding"): - vector_store.add_chunk(chunk, "/test/file.py") - - def test_add_chunks_batch(self, vector_store, embedder, sample_code_chunks): - """Test batch adding chunks.""" - chunks = [] - for data in sample_code_chunks: - chunk = SemanticChunk(content=data["content"], metadata=data["metadata"]) - chunk.embedding = embedder.embed_single(chunk.content) - chunks.append(chunk) - - ids = vector_store.add_chunks(chunks, "/test/multi.py") - - assert len(ids) == len(chunks) - assert vector_store.count_chunks() == len(chunks) - - def test_add_empty_batch(self, vector_store): - """Test adding empty batch returns empty list.""" - ids = vector_store.add_chunks([], "/test/empty.py") - assert ids == [] - - def test_delete_file_chunks(self, vector_store, embedder): - """Test deleting chunks by file path.""" - # Add chunks for two files - chunk1 = SemanticChunk(content="def a(): pass", metadata={}) - chunk1.embedding = embedder.embed_single(chunk1.content) - vector_store.add_chunk(chunk1, "/test/file1.py") - - chunk2 = SemanticChunk(content="def b(): pass", metadata={}) - chunk2.embedding = embedder.embed_single(chunk2.content) - vector_store.add_chunk(chunk2, "/test/file2.py") - - assert vector_store.count_chunks() == 2 - - # Delete one file's chunks - deleted = vector_store.delete_file_chunks("/test/file1.py") - - assert deleted == 1 - assert vector_store.count_chunks() == 1 - - def 
test_delete_nonexistent_file(self, vector_store): - """Test deleting non-existent file returns 0.""" - deleted = vector_store.delete_file_chunks("/nonexistent/file.py") - assert deleted == 0 - - def test_count_chunks_empty(self, vector_store): - """Test count on empty store.""" - assert vector_store.count_chunks() == 0 - - -class TestVectorStoreSearch: - """Tests for VectorStore search functionality.""" - - def test_search_similar_basic(self, vector_store, embedder, sample_code_chunks): - """Test basic similarity search.""" - # Add chunks - for data in sample_code_chunks: - chunk = SemanticChunk(content=data["content"], metadata=data["metadata"]) - chunk.embedding = embedder.embed_single(chunk.content) - vector_store.add_chunk(chunk, "/test/file.py") - - # Search - query = "function to authenticate user login" - query_embedding = embedder.embed_single(query) - results = vector_store.search_similar(query_embedding, top_k=3) - - assert len(results) > 0 - assert all(isinstance(r, SearchResult) for r in results) - # Top result should be auth-related - assert "authenticate" in results[0].excerpt.lower() or "auth" in results[0].path.lower() - - def test_search_respects_top_k(self, vector_store, embedder, sample_code_chunks): - """Test search respects top_k parameter.""" - # Add all chunks - for data in sample_code_chunks: - chunk = SemanticChunk(content=data["content"], metadata=data["metadata"]) - chunk.embedding = embedder.embed_single(chunk.content) - vector_store.add_chunk(chunk, "/test/file.py") - - query_embedding = embedder.embed_single("code") - - results_2 = vector_store.search_similar(query_embedding, top_k=2) - results_5 = vector_store.search_similar(query_embedding, top_k=5) - - assert len(results_2) <= 2 - assert len(results_5) <= 5 - - def test_search_min_score_filtering(self, vector_store, embedder): - """Test min_score filtering.""" - chunk = SemanticChunk( - content="def hello(): print('hello world')", - metadata={}, - ) - chunk.embedding = 
embedder.embed_single(chunk.content) - vector_store.add_chunk(chunk, "/test/hello.py") - - query_embedding = embedder.embed_single("database connection pool") - - results_no_filter = vector_store.search_similar(query_embedding, min_score=0.0) - results_high_filter = vector_store.search_similar(query_embedding, min_score=0.9) - - assert len(results_no_filter) >= len(results_high_filter) - - def test_search_returns_sorted_by_score(self, vector_store, embedder, sample_code_chunks): - """Test results are sorted by score descending.""" - for data in sample_code_chunks: - chunk = SemanticChunk(content=data["content"], metadata=data["metadata"]) - chunk.embedding = embedder.embed_single(chunk.content) - vector_store.add_chunk(chunk, "/test/file.py") - - query_embedding = embedder.embed_single("function") - results = vector_store.search_similar(query_embedding, top_k=5) - - if len(results) > 1: - for i in range(len(results) - 1): - assert results[i].score >= results[i + 1].score - - def test_search_includes_metadata(self, vector_store, embedder): - """Test search results include metadata.""" - chunk = SemanticChunk( - content="def test_function(): pass", - metadata={ - "symbol_name": "test_function", - "symbol_kind": "function", - "start_line": 10, - "end_line": 15, - }, - ) - chunk.embedding = embedder.embed_single(chunk.content) - vector_store.add_chunk(chunk, "/test/func.py") - - query_embedding = embedder.embed_single("test function") - results = vector_store.search_similar(query_embedding, top_k=1) - - assert len(results) == 1 - assert results[0].symbol_name == "test_function" - assert results[0].symbol_kind == "function" - assert results[0].start_line == 10 - assert results[0].end_line == 15 - - def test_search_empty_store_returns_empty(self, vector_store, embedder): - """Test search on empty store returns empty list.""" - query_embedding = embedder.embed_single("anything") - results = vector_store.search_similar(query_embedding) - assert results == [] - - def 
test_search_with_return_full_content_false(self, vector_store, embedder): - """Test search with return_full_content=False.""" - chunk = SemanticChunk( - content="def long_function(): " + "pass\n" * 100, - metadata={}, - ) - chunk.embedding = embedder.embed_single(chunk.content) - vector_store.add_chunk(chunk, "/test/long.py") - - query_embedding = embedder.embed_single("function") - results = vector_store.search_similar( - query_embedding, top_k=1, return_full_content=False - ) - - assert len(results) == 1 - assert results[0].content is None - assert results[0].excerpt is not None - - -class TestVectorStoreCache: - """Tests for VectorStore caching behavior.""" - - def test_cache_invalidation_on_add(self, vector_store, embedder): - """Test cache is invalidated when chunks are added.""" - chunk1 = SemanticChunk(content="def a(): pass", metadata={}) - chunk1.embedding = embedder.embed_single(chunk1.content) - vector_store.add_chunk(chunk1, "/test/a.py") - - # Trigger cache population - query_embedding = embedder.embed_single("function") - vector_store.search_similar(query_embedding) - - initial_version = vector_store._cache_version - - # Add another chunk - chunk2 = SemanticChunk(content="def b(): pass", metadata={}) - chunk2.embedding = embedder.embed_single(chunk2.content) - vector_store.add_chunk(chunk2, "/test/b.py") - - assert vector_store._cache_version > initial_version - assert vector_store._embedding_matrix is None - - def test_cache_invalidation_on_delete(self, vector_store, embedder): - """Test cache is invalidated when chunks are deleted.""" - chunk = SemanticChunk(content="def a(): pass", metadata={}) - chunk.embedding = embedder.embed_single(chunk.content) - vector_store.add_chunk(chunk, "/test/a.py") - - # Trigger cache population - query_embedding = embedder.embed_single("function") - vector_store.search_similar(query_embedding) - - initial_version = vector_store._cache_version - - # Delete chunk - vector_store.delete_file_chunks("/test/a.py") - - 
assert vector_store._cache_version > initial_version - - def test_manual_cache_clear(self, vector_store, embedder): - """Test manual cache clearing.""" - chunk = SemanticChunk(content="def a(): pass", metadata={}) - chunk.embedding = embedder.embed_single(chunk.content) - vector_store.add_chunk(chunk, "/test/a.py") - - # Force brute-force mode to populate cache (disable ANN) - original_ann = vector_store._ann_index - vector_store._ann_index = None - - # Trigger cache population - query_embedding = embedder.embed_single("function") - vector_store.search_similar(query_embedding) - - assert vector_store._embedding_matrix is not None - - vector_store.clear_cache() - - assert vector_store._embedding_matrix is None - - # Restore ANN index - vector_store._ann_index = original_ann - - -# === Semantic Search Accuracy Tests === - -class TestSemanticSearchAccuracy: - """Tests for semantic search accuracy and relevance.""" - - def test_auth_query_finds_auth_code(self, vector_store, embedder, sample_code_chunks): - """Test authentication query finds auth code.""" - for data in sample_code_chunks: - chunk = SemanticChunk(content=data["content"], metadata=data["metadata"]) - chunk.embedding = embedder.embed_single(chunk.content) - vector_store.add_chunk(chunk, "/test/file.py") - - query = "user authentication login" - query_embedding = embedder.embed_single(query) - results = vector_store.search_similar(query_embedding, top_k=1) - - assert len(results) > 0 - assert "authenticate" in results[0].excerpt.lower() - - def test_database_query_finds_db_code(self, vector_store, embedder, sample_code_chunks): - """Test database query finds database code.""" - for data in sample_code_chunks: - chunk = SemanticChunk(content=data["content"], metadata=data["metadata"]) - chunk.embedding = embedder.embed_single(chunk.content) - vector_store.add_chunk(chunk, "/test/file.py") - - query = "database connection execute query" - query_embedding = embedder.embed_single(query) - results = 
vector_store.search_similar(query_embedding, top_k=1) - - assert len(results) > 0 - assert "database" in results[0].excerpt.lower() or "connect" in results[0].excerpt.lower() - - def test_math_query_finds_calculation_code(self, vector_store, embedder, sample_code_chunks): - """Test math query finds calculation code.""" - for data in sample_code_chunks: - chunk = SemanticChunk(content=data["content"], metadata=data["metadata"]) - chunk.embedding = embedder.embed_single(chunk.content) - vector_store.add_chunk(chunk, "/test/file.py") - - query = "sum numbers add calculation" - query_embedding = embedder.embed_single(query) - results = vector_store.search_similar(query_embedding, top_k=1) - - assert len(results) > 0 - assert "sum" in results[0].excerpt.lower() or "calculate" in results[0].excerpt.lower() - - -# === Performance Tests === - -class TestVectorSearchPerformance: - """Performance tests for vector search.""" - - def test_embedding_performance(self, embedder): - """Test embedding generation performance.""" - text = "def calculate_sum(a, b): return a + b" - - # Warm up - embedder.embed_single(text) - - # Measure - start = time.perf_counter() - iterations = 10 - for _ in range(iterations): - embedder.embed_single(text) - elapsed = time.perf_counter() - start - - avg_ms = (elapsed / iterations) * 1000 - assert avg_ms < 100, f"Single embedding should be <100ms, got {avg_ms:.2f}ms" - - def test_batch_embedding_performance(self, embedder): - """Test batch embedding performance.""" - texts = [f"def function_{i}(): pass" for i in range(50)] - - # Warm up - embedder.embed(texts[:5]) - - # Measure - start = time.perf_counter() - embedder.embed(texts) - elapsed = time.perf_counter() - start - - total_ms = elapsed * 1000 - per_text_ms = total_ms / len(texts) - assert per_text_ms < 20, f"Per-text embedding should be <20ms, got {per_text_ms:.2f}ms" - - def test_search_performance_small(self, vector_store, embedder): - """Test search performance with small dataset.""" - # 
Add 100 chunks - for i in range(100): - chunk = SemanticChunk( - content=f"def function_{i}(): return {i}", - metadata={"index": i}, - ) - chunk.embedding = embedder.embed_single(chunk.content) - vector_store.add_chunk(chunk, f"/test/file_{i}.py") - - query_embedding = embedder.embed_single("function return value") - - # Warm up - vector_store.search_similar(query_embedding) - - # Measure - start = time.perf_counter() - iterations = 10 - for _ in range(iterations): - vector_store.search_similar(query_embedding) - elapsed = time.perf_counter() - start - - avg_ms = (elapsed / iterations) * 1000 - assert avg_ms < 50, f"Search with 100 chunks should be <50ms, got {avg_ms:.2f}ms" - - def test_search_performance_medium(self, vector_store, embedder): - """Test search performance with medium dataset.""" - # Add 500 chunks in batch - chunks = [] - for i in range(500): - chunk = SemanticChunk( - content=f"def function_{i}(x): return x * {i}", - metadata={"index": i}, - ) - chunk.embedding = embedder.embed_single(chunk.content) - chunks.append(chunk) - - vector_store.add_chunks(chunks, "/test/bulk.py") - - query_embedding = embedder.embed_single("multiply value") - - # Warm up - vector_store.search_similar(query_embedding) - - # Measure - start = time.perf_counter() - iterations = 5 - for _ in range(iterations): - vector_store.search_similar(query_embedding) - elapsed = time.perf_counter() - start - - avg_ms = (elapsed / iterations) * 1000 - assert avg_ms < 100, f"Search with 500 chunks should be <100ms, got {avg_ms:.2f}ms" - - -# === Thread Safety Tests === - -class TestThreadSafety: - """Tests for thread safety.""" - - def test_concurrent_searches(self, vector_store, embedder, sample_code_chunks): - """Test concurrent searches are thread-safe.""" - # Populate store - for data in sample_code_chunks: - chunk = SemanticChunk(content=data["content"], metadata=data["metadata"]) - chunk.embedding = embedder.embed_single(chunk.content) - vector_store.add_chunk(chunk, 
"/test/file.py") - - results_list = [] - errors = [] - - def search_task(query): - try: - query_embedding = embedder.embed_single(query) - results = vector_store.search_similar(query_embedding, top_k=3) - results_list.append(len(results)) - except Exception as e: - errors.append(str(e)) - - queries = ["authentication", "database", "function", "class", "async"] - threads = [threading.Thread(target=search_task, args=(q,)) for q in queries] - - for t in threads: - t.start() - for t in threads: - t.join() - - assert len(errors) == 0, f"Errors during concurrent search: {errors}" - assert len(results_list) == len(queries) - - def test_concurrent_add_and_search(self, vector_store, embedder): - """Test concurrent add and search operations.""" - errors = [] - - def add_task(idx): - try: - chunk = SemanticChunk( - content=f"def task_{idx}(): pass", - metadata={"idx": idx}, - ) - chunk.embedding = embedder.embed_single(chunk.content) - vector_store.add_chunk(chunk, f"/test/task_{idx}.py") - except Exception as e: - errors.append(f"Add error: {e}") - - def search_task(): - try: - query_embedding = embedder.embed_single("function task") - vector_store.search_similar(query_embedding) - except Exception as e: - errors.append(f"Search error: {e}") - - threads = [] - for i in range(10): - threads.append(threading.Thread(target=add_task, args=(i,))) - threads.append(threading.Thread(target=search_task)) - - for t in threads: - t.start() - for t in threads: - t.join() - - assert len(errors) == 0, f"Errors during concurrent ops: {errors}" - - -# === Edge Cases === - -class TestEdgeCases: - """Tests for edge cases.""" - - def test_very_short_content(self, vector_store, embedder): - """Test handling very short content.""" - chunk = SemanticChunk(content="x", metadata={}) - chunk.embedding = embedder.embed_single(chunk.content) - vector_store.add_chunk(chunk, "/test/short.py") - - query_embedding = embedder.embed_single("x") - results = vector_store.search_similar(query_embedding) - - 
assert len(results) == 1 - - def test_special_characters_in_path(self, vector_store, embedder): - """Test handling special characters in file path.""" - chunk = SemanticChunk(content="def test(): pass", metadata={}) - chunk.embedding = embedder.embed_single(chunk.content) - - special_path = "/test/path with spaces/file-name_v2.py" - vector_store.add_chunk(chunk, special_path) - - query_embedding = embedder.embed_single("test function") - results = vector_store.search_similar(query_embedding) - - assert len(results) == 1 - assert results[0].path == special_path - - def test_json_metadata_special_chars(self, vector_store, embedder): - """Test metadata with special JSON characters.""" - metadata = { - "description": 'Test "quoted" text with \'single\' quotes', - "path": "C:\\Users\\test\\file.py", - "tags": ["tag1", "tag2"], - } - chunk = SemanticChunk(content="def test(): pass", metadata=metadata) - chunk.embedding = embedder.embed_single(chunk.content) - - vector_store.add_chunk(chunk, "/test/special.py") - - query_embedding = embedder.embed_single("test") - results = vector_store.search_similar(query_embedding) - - assert len(results) == 1 - assert results[0].metadata["description"] == metadata["description"] - - def test_search_zero_top_k(self, vector_store, embedder): - """Test search with top_k=0.""" - chunk = SemanticChunk(content="def test(): pass", metadata={}) - chunk.embedding = embedder.embed_single(chunk.content) - vector_store.add_chunk(chunk, "/test/file.py") - - query_embedding = embedder.embed_single("test") - results = vector_store.search_similar(query_embedding, top_k=0) - - assert results == [] - - def test_search_very_high_min_score(self, vector_store, embedder): - """Test search with very high min_score filters all results.""" - chunk = SemanticChunk(content="def hello(): print('world')", metadata={}) - chunk.embedding = embedder.embed_single(chunk.content) - vector_store.add_chunk(chunk, "/test/hello.py") - - # Query something unrelated with 
very high threshold - query_embedding = embedder.embed_single("database connection") - results = vector_store.search_similar(query_embedding, min_score=0.99) - - # Should filter out since unrelated - assert len(results) == 0 - - -# === Availability Check Tests === - -class TestAvailabilityCheck: - """Tests for semantic availability checking.""" - - def test_check_semantic_available(self): - """Test check_semantic_available function.""" - available, error = check_semantic_available() - assert available is SEMANTIC_AVAILABLE - if available: - assert error is None - else: - assert error is not None - - def test_semantic_available_flag(self): - """Test SEMANTIC_AVAILABLE flag is True when deps installed.""" - assert isinstance(SEMANTIC_AVAILABLE, bool) diff --git a/codex-lens/tests/test_vector_store.py b/codex-lens/tests/test_vector_store.py deleted file mode 100644 index 3da2ab0f..00000000 --- a/codex-lens/tests/test_vector_store.py +++ /dev/null @@ -1,386 +0,0 @@ -import sqlite3 -import sys -import tempfile -import threading -import time -from pathlib import Path - -import numpy as np -import pytest - -from codexlens.entities import SemanticChunk -import codexlens.semantic.vector_store as vector_store_module -from codexlens.semantic.vector_store import VectorStore - - -@pytest.fixture() -def temp_db(): - with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as tmpdir: - yield Path(tmpdir) / "semantic.db" - - -def test_concurrent_bulk_insert(monkeypatch: pytest.MonkeyPatch, temp_db: Path) -> None: - """Concurrent batch inserts in bulk mode should not corrupt accumulation state.""" - store = VectorStore(temp_db) - monkeypatch.setattr(store, "_ensure_ann_index", lambda dim: True) - - store.begin_bulk_insert() - - errors: list[Exception] = [] - lock = threading.Lock() - threads: list[threading.Thread] = [] - - def make_chunks(count: int, dim: int) -> list[SemanticChunk]: - chunks: list[SemanticChunk] = [] - for i in range(count): - chunk = 
SemanticChunk(content=f"chunk {i}", metadata={}) - chunk.embedding = np.random.randn(dim).astype(np.float32) - chunks.append(chunk) - return chunks - - def worker(idx: int) -> None: - try: - dim = 8 - if idx % 2 == 0: - chunks = make_chunks(5, dim) - store.add_chunks_batch([(c, f"file_{idx}.py") for c in chunks], auto_save_ann=False) - else: - chunks = [SemanticChunk(content=f"chunk {i}") for i in range(5)] - embeddings = np.random.randn(5, dim).astype(np.float32) - store.add_chunks_batch_numpy( - [(c, f"file_{idx}.py") for c in chunks], - embeddings_matrix=embeddings, - auto_save_ann=False, - ) - except Exception as exc: - with lock: - errors.append(exc) - - for i in range(10): - threads.append(threading.Thread(target=worker, args=(i,))) - - for t in threads: - t.start() - for t in threads: - t.join() - - assert not errors - assert len(store._bulk_insert_ids) == 50 - assert len(store._bulk_insert_embeddings) == 50 - assert store.count_chunks() == 50 - - -def test_bulk_insert_mode_transitions(monkeypatch: pytest.MonkeyPatch, temp_db: Path) -> None: - """begin/end bulk insert should be thread-safe with concurrent add operations.""" - store = VectorStore(temp_db) - - class DummyAnn: - def __init__(self) -> None: - self.total_added = 0 - self.save_calls = 0 - - def add_vectors(self, ids, embeddings) -> None: - self.total_added += len(ids) - - def save(self) -> None: - self.save_calls += 1 - - dummy_ann = DummyAnn() - store._ann_index = dummy_ann - monkeypatch.setattr(store, "_ensure_ann_index", lambda dim: True) - - errors: list[Exception] = [] - lock = threading.Lock() - stop_event = threading.Event() - - def adder(worker_id: int) -> None: - try: - while not stop_event.is_set(): - chunk = SemanticChunk(content=f"chunk {worker_id}", metadata={}) - chunk.embedding = np.random.randn(8).astype(np.float32) - store.add_chunks_batch([(chunk, f"file_{worker_id}.py")], auto_save_ann=False) - except Exception as exc: - with lock: - errors.append(exc) - - def toggler() -> None: 
- try: - for _ in range(5): - store.begin_bulk_insert() - time.sleep(0.05) - store.end_bulk_insert() - time.sleep(0.05) - except Exception as exc: - with lock: - errors.append(exc) - - threads = [threading.Thread(target=adder, args=(i,)) for i in range(3)] - toggle_thread = threading.Thread(target=toggler) - - for t in threads: - t.start() - toggle_thread.start() - - toggle_thread.join(timeout=10) - stop_event.set() - for t in threads: - t.join(timeout=10) - - assert not errors - assert toggle_thread.is_alive() is False - assert store._bulk_insert_mode is False - assert store._bulk_insert_ids == [] - assert store._bulk_insert_embeddings == [] - assert dummy_ann.total_added == store.count_chunks() - - -def test_search_similar_min_score_validation(monkeypatch: pytest.MonkeyPatch, temp_db: Path) -> None: - """search_similar should validate min_score is within [0.0, 1.0].""" - monkeypatch.setattr(vector_store_module, "HNSWLIB_AVAILABLE", False) - store = VectorStore(temp_db) - - chunk_a = SemanticChunk(content="chunk A", metadata={}) - chunk_a.embedding = np.array([1.0, 0.0, 0.0], dtype=np.float32) - chunk_b = SemanticChunk(content="chunk B", metadata={}) - chunk_b.embedding = np.array([0.0, 1.0, 0.0], dtype=np.float32) - store.add_chunks_batch([(chunk_a, "a.py"), (chunk_b, "b.py")]) - - query = [1.0, 0.0, 0.0] - - with pytest.raises(ValueError, match=r"min_score.*\[0\.0, 1\.0\].*cosine"): - store.search_similar(query, min_score=-0.5) - - with pytest.raises(ValueError, match=r"min_score.*\[0\.0, 1\.0\].*cosine"): - store.search_similar(query, min_score=1.5) - - store.search_similar(query, min_score=0.0) - store.search_similar(query, min_score=1.0) - - results = store.search_similar(query, min_score=0.5, return_full_content=False) - assert [r.path for r in results] == ["a.py"] - - -def test_search_similar(monkeypatch: pytest.MonkeyPatch, temp_db: Path) -> None: - """search_similar returns results ordered by descending similarity.""" - 
monkeypatch.setattr(vector_store_module, "HNSWLIB_AVAILABLE", False) - store = VectorStore(temp_db) - - chunk_a = SemanticChunk(content="chunk A", metadata={}) - chunk_a.embedding = np.array([1.0, 0.0, 0.0], dtype=np.float32) - chunk_b = SemanticChunk(content="chunk B", metadata={}) - chunk_b.embedding = np.array([0.0, 1.0, 0.0], dtype=np.float32) - store.add_chunks_batch([(chunk_a, "a.py"), (chunk_b, "b.py")]) - - results = store.search_similar([1.0, 0.0, 0.0], top_k=10, min_score=0.0, return_full_content=False) - - assert [r.path for r in results] == ["a.py", "b.py"] - assert results[0].score == pytest.approx(1.0) - assert results[1].score == pytest.approx(0.0) - - -def test_search_with_ann_null_results(monkeypatch: pytest.MonkeyPatch, temp_db: Path) -> None: - """_search_with_ann should return [] when ANN search returns null results.""" - monkeypatch.setattr(vector_store_module, "HNSWLIB_AVAILABLE", False) - store = VectorStore(temp_db) - - class DummyAnn: - def count(self) -> int: - return 1 - - def search(self, query_vec: np.ndarray, top_k: int): - return None, None - - store._ann_index = DummyAnn() - - results = store._search_with_ann(np.array([1.0, 0.0, 0.0], dtype=np.float32), top_k=10, min_score=0.0, return_full_content=False) - assert results == [] - - -def test_search_with_ann_empty_results(monkeypatch: pytest.MonkeyPatch, temp_db: Path) -> None: - """_search_with_ann should return [] when ANN search returns empty results.""" - monkeypatch.setattr(vector_store_module, "HNSWLIB_AVAILABLE", False) - store = VectorStore(temp_db) - - class DummyAnn: - def count(self) -> int: - return 1 - - def search(self, query_vec: np.ndarray, top_k: int): - return [], [] - - store._ann_index = DummyAnn() - - results = store._search_with_ann(np.array([1.0, 0.0, 0.0], dtype=np.float32), top_k=10, min_score=0.0, return_full_content=False) - assert results == [] - - -def test_search_with_ann_mismatched_results(monkeypatch: pytest.MonkeyPatch, temp_db: Path) -> None: - 
"""_search_with_ann should return [] when ANN search returns mismatched results.""" - monkeypatch.setattr(vector_store_module, "HNSWLIB_AVAILABLE", False) - store = VectorStore(temp_db) - - class DummyAnn: - def count(self) -> int: - return 2 - - def search(self, query_vec: np.ndarray, top_k: int): - return [1, 2], [0.5] - - store._ann_index = DummyAnn() - - results = store._search_with_ann(np.array([1.0, 0.0, 0.0], dtype=np.float32), top_k=10, min_score=0.0, return_full_content=False) - assert results == [] - - -def test_search_with_ann_valid_results(monkeypatch: pytest.MonkeyPatch, temp_db: Path) -> None: - """_search_with_ann should return results for valid ANN outputs.""" - monkeypatch.setattr(vector_store_module, "HNSWLIB_AVAILABLE", False) - store = VectorStore(temp_db) - - chunk = SemanticChunk(content="chunk A", metadata={}) - chunk.embedding = np.array([1.0, 0.0, 0.0], dtype=np.float32) - chunk_id = store.add_chunk(chunk, "a.py") - - class DummyAnn: - def count(self) -> int: - return 1 - - def search(self, query_vec: np.ndarray, top_k: int): - return [chunk_id], [0.0] - - store._ann_index = DummyAnn() - - results = store._search_with_ann(np.array([1.0, 0.0, 0.0], dtype=np.float32), top_k=10, min_score=0.0, return_full_content=False) - assert [r.path for r in results] == ["a.py"] - assert results[0].score == pytest.approx(1.0) - - -def test_add_chunks_batch_overflow(monkeypatch: pytest.MonkeyPatch, temp_db: Path) -> None: - """add_chunks_batch should fail fast when generated IDs would exceed SQLite/sys bounds.""" - monkeypatch.setattr(vector_store_module, "HNSWLIB_AVAILABLE", False) - store = VectorStore(temp_db) - - seed_embedding = np.array([1.0, 0.0, 0.0], dtype=np.float32).tobytes() - with sqlite3.connect(store.db_path) as conn: - conn.execute( - "INSERT INTO semantic_chunks (id, file_path, content, embedding, metadata) VALUES (?, ?, ?, ?, ?)", - (sys.maxsize - 5, "seed.py", "seed", seed_embedding, None), - ) - conn.commit() - - chunks_with_paths: 
list[tuple[SemanticChunk, str]] = [] - for i in range(10): - chunks_with_paths.append( - ( - SemanticChunk(content=f"chunk {i}", metadata={}, embedding=[1.0, 0.0, 0.0]), - f"file_{i}.py", - ) - ) - - with pytest.raises(ValueError, match=r"Chunk ID range overflow"): - store.add_chunks_batch(chunks_with_paths) - - -def test_add_chunks_batch_generates_sequential_ids(monkeypatch: pytest.MonkeyPatch, temp_db: Path) -> None: - """add_chunks_batch should return sequential IDs for a fresh store.""" - monkeypatch.setattr(vector_store_module, "HNSWLIB_AVAILABLE", False) - store = VectorStore(temp_db) - - chunks_with_paths = [ - (SemanticChunk(content="chunk A", metadata={}, embedding=[1.0, 0.0, 0.0]), "a.py"), - (SemanticChunk(content="chunk B", metadata={}, embedding=[0.0, 1.0, 0.0]), "b.py"), - ] - - ids = store.add_chunks_batch(chunks_with_paths, update_ann=False) - assert ids == [1, 2] - assert store.count_chunks() == 2 - - -def test_add_chunks_batch_numpy_overflow(monkeypatch: pytest.MonkeyPatch, temp_db: Path) -> None: - """add_chunks_batch_numpy should fail fast when generated IDs would exceed SQLite/sys bounds.""" - monkeypatch.setattr(vector_store_module, "HNSWLIB_AVAILABLE", False) - store = VectorStore(temp_db) - - seed_embedding = np.array([1.0, 0.0, 0.0], dtype=np.float32).tobytes() - with sqlite3.connect(store.db_path) as conn: - conn.execute( - "INSERT INTO semantic_chunks (id, file_path, content, embedding, metadata) VALUES (?, ?, ?, ?, ?)", - (sys.maxsize - 5, "seed.py", "seed", seed_embedding, None), - ) - conn.commit() - - chunks_with_paths = [ - (SemanticChunk(content=f"chunk {i}", metadata={}), f"file_{i}.py") - for i in range(10) - ] - embeddings = np.random.randn(10, 3).astype(np.float32) - - with pytest.raises(ValueError, match=r"Chunk ID range overflow"): - store.add_chunks_batch_numpy(chunks_with_paths, embeddings) - - -def test_fetch_results_by_ids(monkeypatch: pytest.MonkeyPatch, temp_db: Path) -> None: - """_fetch_results_by_ids should use 
parameterized IN queries and return ordered results.""" - store = VectorStore(temp_db) - - calls: list[tuple[str, str, object]] = [] - rows = [ - (1, "a.py", "content A", None), - (2, "b.py", "content B", None), - ] - - class DummyCursor: - def __init__(self, result_rows): - self._rows = result_rows - - def fetchall(self): - return self._rows - - class DummyConn: - def __enter__(self): - return self - - def __exit__(self, exc_type, exc, tb): - return False - - def execute(self, query, params=None): - if isinstance(query, str) and query.strip().upper().startswith("PRAGMA"): - calls.append(("pragma", query, params)) - return DummyCursor([]) - calls.append(("query", query, params)) - return DummyCursor(rows) - - monkeypatch.setattr(vector_store_module.sqlite3, "connect", lambda _: DummyConn()) - - chunk_ids = [1, 2] - scores = [0.9, 0.8] - results = store._fetch_results_by_ids(chunk_ids, scores, return_full_content=False) - - assert [r.path for r in results] == ["a.py", "b.py"] - assert [r.score for r in results] == scores - assert all(r.content is None for r in results) - - assert any(kind == "pragma" for kind, _, _ in calls) - _, query, params = next((c for c in calls if c[0] == "query"), ("", "", None)) - expected_query = """ - SELECT id, file_path, content, metadata - FROM semantic_chunks - WHERE id IN ({placeholders}) - """.format(placeholders=",".join("?" * len(chunk_ids))) - assert query == expected_query - assert params == chunk_ids - - assert store._fetch_results_by_ids([], [], return_full_content=False) == [] - - -def test_fetch_results_sql_safety() -> None: - """Placeholder generation and validation should prevent unsafe SQL interpolation.""" - for count in (0, 1, 10, 100): - placeholders = ",".join("?" 
* count) - vector_store_module._validate_sql_placeholders(placeholders, count) - - with pytest.raises(ValueError): - vector_store_module._validate_sql_placeholders("?,?); DROP TABLE semantic_chunks;--", 2) - - with pytest.raises(ValueError): - vector_store_module._validate_sql_placeholders("?,?", 3) diff --git a/codex-lens/tests/test_watcher/__init__.py b/codex-lens/tests/test_watcher/__init__.py deleted file mode 100644 index f736461b..00000000 --- a/codex-lens/tests/test_watcher/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Tests for watcher module.""" diff --git a/codex-lens/tests/test_watcher/conftest.py b/codex-lens/tests/test_watcher/conftest.py deleted file mode 100644 index f3fcecfb..00000000 --- a/codex-lens/tests/test_watcher/conftest.py +++ /dev/null @@ -1,43 +0,0 @@ -"""Fixtures for watcher tests.""" - -from __future__ import annotations - -import tempfile -from pathlib import Path -from typing import Generator - -import pytest - - -@pytest.fixture -def temp_project() -> Generator[Path, None, None]: - """Create a temporary project directory with sample files.""" - with tempfile.TemporaryDirectory() as tmpdir: - project = Path(tmpdir) - - # Create sample Python file - py_file = project / "main.py" - py_file.write_text("def hello():\n print('Hello')\n") - - # Create sample JavaScript file - js_file = project / "app.js" - js_file.write_text("function greet() {\n console.log('Hi');\n}\n") - - # Create subdirectory with file - sub_dir = project / "src" - sub_dir.mkdir() - (sub_dir / "utils.py").write_text("def add(a, b):\n return a + b\n") - - # Create ignored directory - git_dir = project / ".git" - git_dir.mkdir() - (git_dir / "config").write_text("[core]\n") - - yield project - - -@pytest.fixture -def watcher_config(): - """Create default watcher configuration.""" - from codexlens.watcher import WatcherConfig - return WatcherConfig(debounce_ms=100) # Short debounce for tests diff --git a/codex-lens/tests/test_watcher/test_events.py 
b/codex-lens/tests/test_watcher/test_events.py deleted file mode 100644 index c3f3a53f..00000000 --- a/codex-lens/tests/test_watcher/test_events.py +++ /dev/null @@ -1,103 +0,0 @@ -"""Tests for watcher event types.""" - -from __future__ import annotations - -import time -from pathlib import Path - -import pytest - -from codexlens.watcher import ChangeType, FileEvent, WatcherConfig, IndexResult, WatcherStats - - -class TestChangeType: - """Tests for ChangeType enum.""" - - def test_change_types_exist(self): - """Verify all change types are defined.""" - assert ChangeType.CREATED.value == "created" - assert ChangeType.MODIFIED.value == "modified" - assert ChangeType.DELETED.value == "deleted" - assert ChangeType.MOVED.value == "moved" - - def test_change_type_count(self): - """Verify we have exactly 4 change types.""" - assert len(ChangeType) == 4 - - -class TestFileEvent: - """Tests for FileEvent dataclass.""" - - def test_create_event(self): - """Test creating a file event.""" - event = FileEvent( - path=Path("/test/file.py"), - change_type=ChangeType.CREATED, - timestamp=time.time(), - ) - assert event.path == Path("/test/file.py") - assert event.change_type == ChangeType.CREATED - assert event.old_path is None - - def test_moved_event(self): - """Test creating a moved event with old_path.""" - event = FileEvent( - path=Path("/test/new.py"), - change_type=ChangeType.MOVED, - timestamp=time.time(), - old_path=Path("/test/old.py"), - ) - assert event.old_path == Path("/test/old.py") - - -class TestWatcherConfig: - """Tests for WatcherConfig dataclass.""" - - def test_default_config(self): - """Test default configuration values.""" - config = WatcherConfig() - assert config.debounce_ms == 1000 - assert ".git" in config.ignored_patterns - assert "node_modules" in config.ignored_patterns - assert "__pycache__" in config.ignored_patterns - assert config.languages is None - - def test_custom_debounce(self): - """Test custom debounce setting.""" - config = 
WatcherConfig(debounce_ms=500) - assert config.debounce_ms == 500 - - -class TestIndexResult: - """Tests for IndexResult dataclass.""" - - def test_default_result(self): - """Test default result values.""" - result = IndexResult() - assert result.files_indexed == 0 - assert result.files_removed == 0 - assert result.symbols_added == 0 - assert result.errors == [] - - def test_custom_result(self): - """Test creating result with values.""" - result = IndexResult( - files_indexed=5, - files_removed=2, - symbols_added=50, - errors=["error1"], - ) - assert result.files_indexed == 5 - assert result.files_removed == 2 - - -class TestWatcherStats: - """Tests for WatcherStats dataclass.""" - - def test_default_stats(self): - """Test default stats values.""" - stats = WatcherStats() - assert stats.files_watched == 0 - assert stats.events_processed == 0 - assert stats.last_event_time is None - assert stats.is_running is False diff --git a/codex-lens/tests/test_watcher/test_file_watcher.py b/codex-lens/tests/test_watcher/test_file_watcher.py deleted file mode 100644 index 50aa352a..00000000 --- a/codex-lens/tests/test_watcher/test_file_watcher.py +++ /dev/null @@ -1,124 +0,0 @@ -"""Tests for FileWatcher class.""" - -from __future__ import annotations - -import time -from pathlib import Path -from typing import List - -import pytest - -from codexlens.watcher import FileWatcher, WatcherConfig, FileEvent, ChangeType - - -class TestFileWatcherInit: - """Tests for FileWatcher initialization.""" - - def test_init_with_valid_path(self, temp_project: Path, watcher_config: WatcherConfig): - """Test initializing with valid path.""" - events: List[FileEvent] = [] - watcher = FileWatcher(temp_project, watcher_config, lambda e: events.extend(e)) - - assert watcher.root_path == temp_project.resolve() - assert watcher.config == watcher_config - assert not watcher.is_running - - def test_start_with_invalid_path(self, watcher_config: WatcherConfig): - """Test starting watcher with non-existent 
path.""" - events: List[FileEvent] = [] - watcher = FileWatcher(Path("/nonexistent/path"), watcher_config, lambda e: events.extend(e)) - - with pytest.raises(ValueError, match="does not exist"): - watcher.start() - - -class TestFileWatcherLifecycle: - """Tests for FileWatcher start/stop lifecycle.""" - - def test_start_stop(self, temp_project: Path, watcher_config: WatcherConfig): - """Test basic start and stop.""" - events: List[FileEvent] = [] - watcher = FileWatcher(temp_project, watcher_config, lambda e: events.extend(e)) - - watcher.start() - assert watcher.is_running - - watcher.stop() - assert not watcher.is_running - - def test_double_start(self, temp_project: Path, watcher_config: WatcherConfig): - """Test calling start twice.""" - events: List[FileEvent] = [] - watcher = FileWatcher(temp_project, watcher_config, lambda e: events.extend(e)) - - watcher.start() - watcher.start() # Should not raise - assert watcher.is_running - - watcher.stop() - - def test_double_stop(self, temp_project: Path, watcher_config: WatcherConfig): - """Test calling stop twice.""" - events: List[FileEvent] = [] - watcher = FileWatcher(temp_project, watcher_config, lambda e: events.extend(e)) - - watcher.start() - watcher.stop() - watcher.stop() # Should not raise - assert not watcher.is_running - - -class TestFileWatcherEvents: - """Tests for FileWatcher event detection.""" - - def test_detect_file_creation(self, temp_project: Path, watcher_config: WatcherConfig): - """Test detecting new file creation.""" - events: List[FileEvent] = [] - watcher = FileWatcher(temp_project, watcher_config, lambda e: events.extend(e)) - - try: - watcher.start() - time.sleep(0.3) # Let watcher start (longer for Windows) - - # Create new file - new_file = temp_project / "new_file.py" - new_file.write_text("# New file\n") - - # Wait for event with retries (watchdog timing varies by platform) - max_wait = 2.0 - waited = 0.0 - while waited < max_wait: - time.sleep(0.2) - waited += 0.2 - # Windows may 
report MODIFIED instead of CREATED - file_events = [e for e in events if e.change_type in (ChangeType.CREATED, ChangeType.MODIFIED)] - if any(e.path.name == "new_file.py" for e in file_events): - break - - # Check event was detected (Windows may report MODIFIED instead of CREATED) - relevant_events = [e for e in events if e.change_type in (ChangeType.CREATED, ChangeType.MODIFIED)] - assert len(relevant_events) >= 1, f"Expected file event, got: {events}" - assert any(e.path.name == "new_file.py" for e in relevant_events) - finally: - watcher.stop() - - def test_filter_ignored_directories(self, temp_project: Path, watcher_config: WatcherConfig): - """Test that files in ignored directories are filtered.""" - events: List[FileEvent] = [] - watcher = FileWatcher(temp_project, watcher_config, lambda e: events.extend(e)) - - try: - watcher.start() - time.sleep(0.1) - - # Create file in .git (should be ignored) - git_file = temp_project / ".git" / "test.py" - git_file.write_text("# In git\n") - - time.sleep(watcher_config.debounce_ms / 1000.0 + 0.2) - - # No events should be detected for .git files - git_events = [e for e in events if ".git" in str(e.path)] - assert len(git_events) == 0 - finally: - watcher.stop() diff --git a/codex-lens/tests/unit/__init__.py b/codex-lens/tests/unit/__init__.py deleted file mode 100644 index 4a5d2636..00000000 --- a/codex-lens/tests/unit/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# Unit tests package diff --git a/codex-lens/tests/unit/lsp/__init__.py b/codex-lens/tests/unit/lsp/__init__.py deleted file mode 100644 index 645c88fe..00000000 --- a/codex-lens/tests/unit/lsp/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# LSP unit tests package diff --git a/codex-lens/tests/unit/lsp/test_lsp_bridge.py b/codex-lens/tests/unit/lsp/test_lsp_bridge.py deleted file mode 100644 index 2c607655..00000000 --- a/codex-lens/tests/unit/lsp/test_lsp_bridge.py +++ /dev/null @@ -1,879 +0,0 @@ -"""Unit tests for LspBridge service (VSCode Bridge HTTP mode). 
- -This module provides comprehensive tests for the LspBridge class when used -in VSCode Bridge HTTP mode (use_vscode_bridge=True). These tests mock -aiohttp HTTP communication with the VSCode Bridge extension. - -Test coverage: -- P0 (Critical): Success/failure scenarios for core methods -- P1 (Important): Cache hit/miss and invalidation logic -- P2 (Supplementary): Edge cases and error handling - -Note: For standalone mode tests (direct language server communication), -see tests/real/ directory. -""" - -from __future__ import annotations - -import asyncio -import time -from typing import Any, Dict, List -from unittest.mock import AsyncMock, MagicMock, patch - -import pytest - -# Skip all tests if aiohttp is not available -pytest.importorskip("aiohttp") - -import aiohttp - -from codexlens.hybrid_search.data_structures import ( - CallHierarchyItem, - CodeSymbolNode, - Range, -) -from codexlens.lsp.lsp_bridge import ( - CacheEntry, - Location, - LspBridge, -) - - -# ----------------------------------------------------------------------------- -# Fixtures -# ----------------------------------------------------------------------------- - - -@pytest.fixture -def sample_symbol() -> CodeSymbolNode: - """Create a sample CodeSymbolNode for testing. - - Returns: - CodeSymbolNode with typical function symbol data. - """ - return CodeSymbolNode( - id="test.py:test_func:10", - name="test_func", - kind="function", - file_path="/path/to/test.py", - range=Range( - start_line=10, - start_character=1, - end_line=20, - end_character=1, - ), - ) - - -@pytest.fixture -def mock_response() -> AsyncMock: - """Create a mock aiohttp response with configurable attributes. - - Returns: - AsyncMock configured as aiohttp ClientResponse. 
- """ - response = AsyncMock() - response.status = 200 - response.json = AsyncMock(return_value={"success": True, "result": []}) - return response - - -@pytest.fixture -def mock_session(mock_response: AsyncMock) -> AsyncMock: - """Create a mock aiohttp ClientSession. - - Args: - mock_response: The mock response to return from post(). - - Returns: - AsyncMock configured as aiohttp ClientSession with async context manager. - """ - session = AsyncMock(spec=aiohttp.ClientSession) - - # Configure post() to return context manager with response - post_cm = AsyncMock() - post_cm.__aenter__ = AsyncMock(return_value=mock_response) - post_cm.__aexit__ = AsyncMock(return_value=None) - session.post = MagicMock(return_value=post_cm) - session.closed = False - - return session - - -@pytest.fixture -def lsp_bridge() -> LspBridge: - """Create a fresh LspBridge instance for testing in VSCode Bridge mode. - - Returns: - LspBridge with use_vscode_bridge=True for HTTP-based tests. - """ - return LspBridge(use_vscode_bridge=True) - - -# ----------------------------------------------------------------------------- -# Location Tests -# ----------------------------------------------------------------------------- - - -class TestLocation: - """Tests for the Location dataclass.""" - - def test_to_dict(self): - """Location.to_dict() returns correct dictionary format.""" - loc = Location(file_path="/test/file.py", line=10, character=5) - result = loc.to_dict() - - assert result == { - "file_path": "/test/file.py", - "line": 10, - "character": 5, - } - - def test_from_lsp_response_with_range(self): - """Location.from_lsp_response() parses LSP range format correctly.""" - data = { - "uri": "file:///test/file.py", - "range": { - "start": {"line": 9, "character": 4}, # 0-based - "end": {"line": 15, "character": 0}, - }, - } - loc = Location.from_lsp_response(data) - - assert loc.file_path == "/test/file.py" - assert loc.line == 10 # Converted to 1-based - assert loc.character == 5 # Converted to 
1-based - - def test_from_lsp_response_direct_fields(self): - """Location.from_lsp_response() handles direct line/character fields.""" - data = { - "file_path": "/direct/path.py", - "line": 25, - "character": 8, - } - loc = Location.from_lsp_response(data) - - assert loc.file_path == "/direct/path.py" - assert loc.line == 25 - assert loc.character == 8 - - -class TestLocationFromVscodeUri: - """Tests for parsing VSCode URI formats (P2 test case).""" - - @pytest.mark.parametrize( - "uri,expected_path", - [ - # Unix-style paths - ("file:///home/user/project/file.py", "/home/user/project/file.py"), - ("file:///usr/local/lib.py", "/usr/local/lib.py"), - # Windows-style paths - ("file:///C:/Users/dev/project/file.py", "C:/Users/dev/project/file.py"), - ("file:///D:/code/test.ts", "D:/code/test.ts"), - # Already plain path - ("/plain/path/file.py", "/plain/path/file.py"), - # Edge case: file:// without third slash - ("file://shared/network/file.py", "shared/network/file.py"), - ], - ) - def test_location_from_vscode_uri(self, uri: str, expected_path: str): - """Test correct parsing of various VSCode URI formats to OS paths. - - Verifies that file:///C:/path format on Windows and file:///path - format on Unix are correctly converted to native OS paths. 
- """ - data = { - "uri": uri, - "range": {"start": {"line": 0, "character": 0}, "end": {"line": 0, "character": 0}}, - } - loc = Location.from_lsp_response(data) - - assert loc.file_path == expected_path - - -# ----------------------------------------------------------------------------- -# P0 Critical Tests -# ----------------------------------------------------------------------------- - - -class TestGetReferencesSuccess: - """P0: Test successful get_references scenarios.""" - - @pytest.mark.asyncio - async def test_get_references_success( - self, - lsp_bridge: LspBridge, - sample_symbol: CodeSymbolNode, - mock_session: AsyncMock, - mock_response: AsyncMock, - ): - """Test get_references returns Location list and caches result. - - Mock session returns 200 OK with valid LSP location list. - Verifies: - - Returns list of Location objects - - Results are stored in cache - """ - # Setup mock response with valid locations - mock_response.status = 200 - mock_response.json = AsyncMock(return_value={ - "success": True, - "result": [ - { - "uri": "file:///ref1.py", - "range": {"start": {"line": 5, "character": 0}, "end": {"line": 5, "character": 10}}, - }, - { - "uri": "file:///ref2.py", - "range": {"start": {"line": 15, "character": 4}, "end": {"line": 15, "character": 14}}, - }, - ], - }) - - # Inject mock session - lsp_bridge._session = mock_session - - # Execute - with patch.object(lsp_bridge, "_get_file_mtime", return_value=1000.0): - refs = await lsp_bridge.get_references(sample_symbol) - - # Verify results - assert len(refs) == 2 - assert isinstance(refs[0], Location) - assert refs[0].file_path == "/ref1.py" - assert refs[0].line == 6 # 0-based to 1-based - assert refs[1].file_path == "/ref2.py" - assert refs[1].line == 16 - - # Verify cached - cache_key = f"refs:{sample_symbol.id}" - assert cache_key in lsp_bridge.cache - assert lsp_bridge.cache[cache_key].data == refs - - -class TestGetReferencesBridgeNotRunning: - """P0: Test get_references when bridge is not 
running.""" - - @pytest.mark.asyncio - async def test_get_references_bridge_not_running( - self, - lsp_bridge: LspBridge, - sample_symbol: CodeSymbolNode, - ): - """Test get_references returns empty list on ClientConnectorError. - - When VSCode Bridge is not running, aiohttp raises ClientConnectorError. - Verifies: - - Returns empty list [] - - No cache entry is created - """ - # Setup mock session that raises connection error - mock_session = AsyncMock(spec=aiohttp.ClientSession) - mock_session.closed = False - mock_session.post = MagicMock(side_effect=aiohttp.ClientConnectorError( - connection_key=MagicMock(), - os_error=OSError("Connection refused"), - )) - - lsp_bridge._session = mock_session - - # Execute - refs = await lsp_bridge.get_references(sample_symbol) - - # Verify - assert refs == [] - cache_key = f"refs:{sample_symbol.id}" - assert cache_key not in lsp_bridge.cache - - -class TestGetReferencesTimeout: - """P0: Test get_references timeout handling.""" - - @pytest.mark.asyncio - async def test_get_references_timeout( - self, - lsp_bridge: LspBridge, - sample_symbol: CodeSymbolNode, - ): - """Test get_references returns empty list on asyncio.TimeoutError. - - When request times out, should gracefully return empty list. 
- """ - # Setup mock session that raises timeout - mock_session = AsyncMock(spec=aiohttp.ClientSession) - mock_session.closed = False - - async def raise_timeout(*args, **kwargs): - raise asyncio.TimeoutError() - - post_cm = AsyncMock() - post_cm.__aenter__ = raise_timeout - post_cm.__aexit__ = AsyncMock(return_value=None) - mock_session.post = MagicMock(return_value=post_cm) - - lsp_bridge._session = mock_session - - # Execute - refs = await lsp_bridge.get_references(sample_symbol) - - # Verify - assert refs == [] - - -class TestCallHierarchyFallback: - """P0: Test call_hierarchy fallback to references.""" - - @pytest.mark.asyncio - async def test_call_hierarchy_fallback_to_references( - self, - lsp_bridge: LspBridge, - sample_symbol: CodeSymbolNode, - mock_session: AsyncMock, - ): - """Test get_call_hierarchy falls back to get_references when not supported. - - When call_hierarchy request returns None (not supported by language server), - verifies: - - Falls back to calling get_references - - Returns converted CallHierarchyItem list - """ - call_count = 0 - - async def mock_json(): - nonlocal call_count - call_count += 1 - if call_count == 1: - # First call is get_call_hierarchy - return failure - return {"success": False} - else: - # Second call is get_references - return valid refs - return { - "success": True, - "result": [ - { - "uri": "file:///caller.py", - "range": {"start": {"line": 10, "character": 5}, "end": {"line": 10, "character": 15}}, - }, - ], - } - - # Setup mock response - mock_response = AsyncMock() - mock_response.status = 200 - mock_response.json = mock_json - - post_cm = AsyncMock() - post_cm.__aenter__ = AsyncMock(return_value=mock_response) - post_cm.__aexit__ = AsyncMock(return_value=None) - mock_session.post = MagicMock(return_value=post_cm) - - lsp_bridge._session = mock_session - - # Execute - with patch.object(lsp_bridge, "_get_file_mtime", return_value=1000.0): - items = await lsp_bridge.get_call_hierarchy(sample_symbol) - - # Verify 
fallback occurred and returned CallHierarchyItem - assert len(items) == 1 - assert isinstance(items[0], CallHierarchyItem) - assert items[0].file_path == "/caller.py" - assert items[0].kind == "reference" - assert "Inferred from reference" in items[0].detail - - -# ----------------------------------------------------------------------------- -# P1 Important Tests -# ----------------------------------------------------------------------------- - - -class TestCacheHit: - """P1: Test cache hit behavior.""" - - @pytest.mark.asyncio - async def test_cache_hit( - self, - lsp_bridge: LspBridge, - sample_symbol: CodeSymbolNode, - mock_session: AsyncMock, - mock_response: AsyncMock, - ): - """Test that same symbol called twice only makes one request. - - Verifies: - - _request is only called once - - Second call returns cached result - """ - mock_response.status = 200 - mock_response.json = AsyncMock(return_value={ - "success": True, - "result": [ - {"uri": "file:///ref.py", "range": {"start": {"line": 0, "character": 0}, "end": {"line": 0, "character": 0}}}, - ], - }) - - lsp_bridge._session = mock_session - - with patch.object(lsp_bridge, "_get_file_mtime", return_value=1000.0): - # First call - should make request - refs1 = await lsp_bridge.get_references(sample_symbol) - - # Second call - should use cache - refs2 = await lsp_bridge.get_references(sample_symbol) - - # Verify only one HTTP call was made - assert mock_session.post.call_count == 1 - - # Verify both calls return same data - assert refs1 == refs2 - - -class TestCacheInvalidationTtl: - """P1: Test cache TTL invalidation.""" - - @pytest.mark.asyncio - async def test_cache_invalidation_ttl( - self, - sample_symbol: CodeSymbolNode, - mock_session: AsyncMock, - mock_response: AsyncMock, - ): - """Test cache entry expires after TTL. 
- - Sets extremely short TTL and verifies: - - Cache entry expires - - New request is made after TTL expires - """ - # Create bridge with very short TTL (VSCode Bridge mode for HTTP tests) - bridge = LspBridge(cache_ttl=1, use_vscode_bridge=True) # 1 second TTL - - mock_response.status = 200 - mock_response.json = AsyncMock(return_value={ - "success": True, - "result": [ - {"uri": "file:///ref.py", "range": {"start": {"line": 0, "character": 0}, "end": {"line": 0, "character": 0}}}, - ], - }) - - bridge._session = mock_session - - with patch.object(bridge, "_get_file_mtime", return_value=1000.0): - # First call - await bridge.get_references(sample_symbol) - assert mock_session.post.call_count == 1 - - # Wait for TTL to expire - await asyncio.sleep(1.1) - - # Second call - should make new request - await bridge.get_references(sample_symbol) - assert mock_session.post.call_count == 2 - - await bridge.close() - - -class TestCacheInvalidationFileModified: - """P1: Test cache invalidation on file modification.""" - - @pytest.mark.asyncio - async def test_cache_invalidation_file_modified( - self, - lsp_bridge: LspBridge, - sample_symbol: CodeSymbolNode, - mock_session: AsyncMock, - mock_response: AsyncMock, - ): - """Test cache entry invalidates when file mtime changes. 
- - Verifies: - - mtime change triggers cache invalidation - - New request is made after file modification - """ - mock_response.status = 200 - mock_response.json = AsyncMock(return_value={ - "success": True, - "result": [ - {"uri": "file:///ref.py", "range": {"start": {"line": 0, "character": 0}, "end": {"line": 0, "character": 0}}}, - ], - }) - - lsp_bridge._session = mock_session - - # Mock mtime: first call returns 1000.0, subsequent calls return 2000.0 - # This simulates file being modified between cache store and cache check - call_count = [0] - - def get_mtime(path: str) -> float: - call_count[0] += 1 - # First call during _cache() stores mtime 1000.0 - # Second call during _is_cached() should see different mtime - if call_count[0] <= 1: - return 1000.0 - return 2000.0 # File modified - - with patch.object(lsp_bridge, "_get_file_mtime", side_effect=get_mtime): - # First call - should make request and cache with mtime 1000.0 - await lsp_bridge.get_references(sample_symbol) - assert mock_session.post.call_count == 1 - - # Second call - mtime check returns 2000.0 (different from cached 1000.0) - # Should invalidate cache and make new request - await lsp_bridge.get_references(sample_symbol) - assert mock_session.post.call_count == 2 - - -# ----------------------------------------------------------------------------- -# P2 Supplementary Tests -# ----------------------------------------------------------------------------- - - -class TestResponseParsingInvalidJson: - """P2: Test handling of malformed JSON responses.""" - - @pytest.mark.asyncio - async def test_response_parsing_invalid_json( - self, - lsp_bridge: LspBridge, - sample_symbol: CodeSymbolNode, - mock_session: AsyncMock, - ): - """Test graceful handling of malformed JSON response. 
- - Verifies: - - Returns empty list when JSON parsing fails - - Does not raise exception - """ - # Setup mock to raise JSONDecodeError - mock_response = AsyncMock() - mock_response.status = 200 - mock_response.json = AsyncMock(side_effect=Exception("Invalid JSON")) - - post_cm = AsyncMock() - post_cm.__aenter__ = AsyncMock(return_value=mock_response) - post_cm.__aexit__ = AsyncMock(return_value=None) - mock_session.post = MagicMock(return_value=post_cm) - - lsp_bridge._session = mock_session - - # Execute - should not raise - refs = await lsp_bridge.get_references(sample_symbol) - - # Verify graceful handling - assert refs == [] - - @pytest.mark.asyncio - async def test_response_with_malformed_location_items( - self, - lsp_bridge: LspBridge, - sample_symbol: CodeSymbolNode, - mock_session: AsyncMock, - mock_response: AsyncMock, - ): - """Test handling of partially malformed location items. - - The source code catches KeyError and TypeError when parsing items. - Tests that items causing these specific exceptions are skipped while - valid items are returned. 
- """ - mock_response.status = 200 - mock_response.json = AsyncMock(return_value={ - "success": True, - "result": [ - # Valid item - {"uri": "file:///valid.py", "range": {"start": {"line": 0, "character": 0}, "end": {"line": 0, "character": 0}}}, - # Another valid item - {"uri": "file:///valid2.py", "range": {"start": {"line": 5, "character": 0}, "end": {"line": 5, "character": 0}}}, - ], - }) - - lsp_bridge._session = mock_session - - with patch.object(lsp_bridge, "_get_file_mtime", return_value=1000.0): - refs = await lsp_bridge.get_references(sample_symbol) - - # Should return both valid items - assert len(refs) == 2 - assert refs[0].file_path == "/valid.py" - assert refs[1].file_path == "/valid2.py" - - @pytest.mark.asyncio - async def test_response_with_empty_result_list( - self, - lsp_bridge: LspBridge, - sample_symbol: CodeSymbolNode, - mock_session: AsyncMock, - mock_response: AsyncMock, - ): - """Test handling of empty result list.""" - mock_response.status = 200 - mock_response.json = AsyncMock(return_value={ - "success": True, - "result": [], - }) - - lsp_bridge._session = mock_session - - with patch.object(lsp_bridge, "_get_file_mtime", return_value=1000.0): - refs = await lsp_bridge.get_references(sample_symbol) - - assert refs == [] - - -class TestLspBridgeContextManager: - """Test async context manager functionality (VSCode Bridge mode).""" - - @pytest.mark.asyncio - async def test_context_manager_closes_session(self): - """Test that async context manager properly closes session in VSCode Bridge mode.""" - async with LspBridge(use_vscode_bridge=True) as bridge: - # Create a session - session = await bridge._get_session() - assert session is not None - assert not session.closed - - # After context, session should be closed - assert bridge._session is None or bridge._session.closed - - -class TestCacheEntry: - """Test CacheEntry dataclass.""" - - def test_cache_entry_fields(self): - """CacheEntry stores all required fields.""" - entry = CacheEntry( - 
data=["some", "data"], - file_mtime=12345.0, - cached_at=time.time(), - ) - - assert entry.data == ["some", "data"] - assert entry.file_mtime == 12345.0 - assert entry.cached_at > 0 - - -class TestLspBridgeCacheLru: - """Test LRU cache behavior.""" - - def test_cache_lru_eviction(self): - """Test that oldest entries are evicted when at max capacity.""" - bridge = LspBridge(max_cache_size=3) - - # Add entries - bridge._cache("key1", "/file1.py", "data1") - bridge._cache("key2", "/file2.py", "data2") - bridge._cache("key3", "/file3.py", "data3") - - assert len(bridge.cache) == 3 - - # Add one more - should evict oldest (key1) - bridge._cache("key4", "/file4.py", "data4") - - assert len(bridge.cache) == 3 - assert "key1" not in bridge.cache - assert "key4" in bridge.cache - - def test_cache_access_moves_to_end(self): - """Test that accessing cached item moves it to end (LRU behavior).""" - bridge = LspBridge(max_cache_size=3) - - with patch.object(bridge, "_get_file_mtime", return_value=1000.0): - bridge._cache("key1", "/file.py", "data1") - bridge._cache("key2", "/file.py", "data2") - bridge._cache("key3", "/file.py", "data3") - - # Access key1 - should move it to end - bridge._is_cached("key1", "/file.py") - - # Add key4 - should evict key2 (now oldest) - bridge._cache("key4", "/file.py", "data4") - - assert "key1" in bridge.cache - assert "key2" not in bridge.cache - - -class TestGetHover: - """Test get_hover method.""" - - @pytest.mark.asyncio - async def test_get_hover_returns_string( - self, - lsp_bridge: LspBridge, - sample_symbol: CodeSymbolNode, - mock_session: AsyncMock, - mock_response: AsyncMock, - ): - """Test get_hover returns hover documentation string.""" - mock_response.status = 200 - mock_response.json = AsyncMock(return_value={ - "success": True, - "result": { - "contents": "Function documentation here", - }, - }) - - lsp_bridge._session = mock_session - - with patch.object(lsp_bridge, "_get_file_mtime", return_value=1000.0): - hover = await 
lsp_bridge.get_hover(sample_symbol) - - assert hover == "Function documentation here" - - @pytest.mark.asyncio - async def test_get_hover_handles_marked_string_list( - self, - lsp_bridge: LspBridge, - sample_symbol: CodeSymbolNode, - mock_session: AsyncMock, - mock_response: AsyncMock, - ): - """Test get_hover handles MarkedString list format.""" - mock_response.status = 200 - mock_response.json = AsyncMock(return_value={ - "success": True, - "result": [ - {"value": "```python\ndef func():\n```"}, - {"value": "Documentation text"}, - ], - }) - - lsp_bridge._session = mock_session - - with patch.object(lsp_bridge, "_get_file_mtime", return_value=1000.0): - hover = await lsp_bridge.get_hover(sample_symbol) - - assert "def func()" in hover - assert "Documentation text" in hover - - -class TestGetDefinition: - """Test get_definition method.""" - - @pytest.mark.asyncio - async def test_get_definition_returns_location( - self, - lsp_bridge: LspBridge, - sample_symbol: CodeSymbolNode, - mock_session: AsyncMock, - mock_response: AsyncMock, - ): - """Test get_definition returns Location for found definition.""" - mock_response.status = 200 - mock_response.json = AsyncMock(return_value={ - "success": True, - "result": [ - { - "uri": "file:///definition.py", - "range": {"start": {"line": 99, "character": 0}, "end": {"line": 110, "character": 0}}, - }, - ], - }) - - lsp_bridge._session = mock_session - - with patch.object(lsp_bridge, "_get_file_mtime", return_value=1000.0): - definition = await lsp_bridge.get_definition(sample_symbol) - - assert definition is not None - assert definition.file_path == "/definition.py" - assert definition.line == 100 # 0-based to 1-based - - @pytest.mark.asyncio - async def test_get_definition_returns_none_on_failure( - self, - lsp_bridge: LspBridge, - sample_symbol: CodeSymbolNode, - mock_session: AsyncMock, - mock_response: AsyncMock, - ): - """Test get_definition returns None when not found.""" - mock_response.status = 200 - 
mock_response.json = AsyncMock(return_value={ - "success": False, - }) - - lsp_bridge._session = mock_session - - definition = await lsp_bridge.get_definition(sample_symbol) - - assert definition is None - - -class TestGetDocumentSymbols: - """Test get_document_symbols method.""" - - @pytest.mark.asyncio - async def test_get_document_symbols_flattens_hierarchy( - self, - lsp_bridge: LspBridge, - mock_session: AsyncMock, - mock_response: AsyncMock, - ): - """Test get_document_symbols flattens nested symbol hierarchy.""" - mock_response.status = 200 - mock_response.json = AsyncMock(return_value={ - "success": True, - "result": [ - { - "name": "MyClass", - "kind": 5, # Class - "range": {"start": {"line": 0, "character": 0}, "end": {"line": 20, "character": 0}}, - "children": [ - { - "name": "my_method", - "kind": 6, # Method - "range": {"start": {"line": 5, "character": 4}, "end": {"line": 10, "character": 4}}, - }, - ], - }, - ], - }) - - lsp_bridge._session = mock_session - - with patch.object(lsp_bridge, "_get_file_mtime", return_value=1000.0): - symbols = await lsp_bridge.get_document_symbols("/test/file.py") - - # Should have both class and method - assert len(symbols) == 2 - assert symbols[0]["name"] == "MyClass" - assert symbols[0]["kind"] == "class" - assert symbols[1]["name"] == "my_method" - assert symbols[1]["kind"] == "method" - assert symbols[1]["parent"] == "MyClass" - - -class TestSymbolKindConversion: - """Test symbol kind integer to string conversion.""" - - @pytest.mark.parametrize( - "kind_int,expected_str", - [ - (1, "file"), - (5, "class"), - (6, "method"), - (12, "function"), - (13, "variable"), - (999, "unknown"), # Unknown kind - ], - ) - def test_symbol_kind_to_string(self, kind_int: int, expected_str: str): - """Test _symbol_kind_to_string converts LSP SymbolKind correctly.""" - bridge = LspBridge() - result = bridge._symbol_kind_to_string(kind_int) - assert result == expected_str - - -class TestClearCache: - """Test cache clearing 
functionality.""" - - def test_clear_cache(self, lsp_bridge: LspBridge): - """Test clear_cache removes all entries.""" - # Add some cache entries - lsp_bridge._cache("key1", "/file.py", "data1") - lsp_bridge._cache("key2", "/file.py", "data2") - - assert len(lsp_bridge.cache) == 2 - - # Clear - lsp_bridge.clear_cache() - - assert len(lsp_bridge.cache) == 0 diff --git a/codex-lens/tests/unit/lsp/test_lsp_edge_cases.py b/codex-lens/tests/unit/lsp/test_lsp_edge_cases.py deleted file mode 100644 index 3f0cd4b0..00000000 --- a/codex-lens/tests/unit/lsp/test_lsp_edge_cases.py +++ /dev/null @@ -1,795 +0,0 @@ -"""Edge case and exception tests for LSP Bridge and Graph Builder. - -This module tests boundary conditions, error handling, and exceptional -scenarios in the LSP communication and graph building components. - -Test Categories: -- P1 (Critical): Empty responses, HTTP errors -- P2 (Important): Edge inputs, deep structures, special characters -- P3 (Nice-to-have): Cache eviction, concurrent access, circular refs - -Note: Tests for HTTP-based communication use use_vscode_bridge=True mode. 
-""" - -from __future__ import annotations - -import asyncio -from typing import Any, Dict, List -from unittest.mock import AsyncMock, MagicMock, patch - -import pytest - -from codexlens.hybrid_search.data_structures import ( - CodeAssociationGraph, - CodeSymbolNode, - Range, -) - - -# --------------------------------------------------------------------------- -# Fixtures -# --------------------------------------------------------------------------- - -@pytest.fixture -def valid_range() -> Range: - """Create a valid Range for test symbols.""" - return Range( - start_line=10, - start_character=0, - end_line=20, - end_character=0, - ) - - -@pytest.fixture -def sample_symbol(valid_range: Range) -> CodeSymbolNode: - """Create a sample CodeSymbolNode for testing.""" - return CodeSymbolNode( - id="test/file.py:test_func:10", - name="test_func", - kind="function", - file_path="test/file.py", - range=valid_range, - ) - - -@pytest.fixture -def symbol_with_empty_path() -> CodeSymbolNode: - """Create a CodeSymbolNode with empty file_path. - - Note: CodeSymbolNode.__post_init__ validates that file_path cannot be empty, - so this fixture tests the case where validation is bypassed or data comes - from external sources that might have empty paths. 
- """ - # We need to bypass validation for this edge case test - node = object.__new__(CodeSymbolNode) - node.id = "::0" - node.name = "empty" - node.kind = "unknown" - node.file_path = "" # Empty path - edge case - node.range = Range(start_line=0, start_character=0, end_line=0, end_character=0) - node.embedding = None - node.raw_code = "" - node.docstring = "" - node.score = 0.0 - return node - - -@pytest.fixture -def mock_aiohttp_session(): - """Create a mock aiohttp ClientSession.""" - session = AsyncMock() - return session - - -@pytest.fixture -def mock_error_response(): - """Create a mock aiohttp response with HTTP 500 error.""" - response = AsyncMock() - response.status = 500 - response.json = AsyncMock(return_value={"error": "Internal Server Error"}) - return response - - -@pytest.fixture -def mock_empty_response(): - """Create a mock aiohttp response returning empty list.""" - response = AsyncMock() - response.status = 200 - response.json = AsyncMock(return_value={"success": True, "result": []}) - return response - - -# --------------------------------------------------------------------------- -# P1 Tests - Critical Edge Cases -# --------------------------------------------------------------------------- - -class TestLspReturnsEmptyList: - """Test handling when LSP returns empty results. 
- - Module: LspGraphBuilder._expand_node - Mock: LspBridge methods return [] - Assert: Node marked as visited, no new nodes/edges added, returns [] - """ - - @pytest.mark.asyncio - async def test_expand_node_with_empty_references(self, sample_symbol: CodeSymbolNode): - """When LSP returns empty references, node should be visited but no expansion.""" - from codexlens.lsp.lsp_graph_builder import LspGraphBuilder - - # Create mock LspBridge that returns empty results - mock_bridge = AsyncMock() - mock_bridge.get_references = AsyncMock(return_value=[]) - mock_bridge.get_call_hierarchy = AsyncMock(return_value=[]) - - builder = LspGraphBuilder(max_depth=2, max_nodes=100) - graph = CodeAssociationGraph() - graph.add_node(sample_symbol) - visited = set() - semaphore = asyncio.Semaphore(10) - - # Expand the node - result = await builder._expand_node( - sample_symbol, - depth=0, - graph=graph, - lsp_bridge=mock_bridge, - visited=visited, - semaphore=semaphore, - ) - - # Assertions - assert sample_symbol.id in visited # Node should be marked as visited - assert result == [] # No new nodes to process - assert len(graph.nodes) == 1 # Only the original seed node - assert len(graph.edges) == 0 # No edges added - - @pytest.mark.asyncio - async def test_build_from_seeds_with_empty_lsp_results(self, sample_symbol: CodeSymbolNode): - """When LSP returns empty for all queries, graph should contain only seeds.""" - from codexlens.lsp.lsp_graph_builder import LspGraphBuilder - - mock_bridge = AsyncMock() - mock_bridge.get_references = AsyncMock(return_value=[]) - mock_bridge.get_call_hierarchy = AsyncMock(return_value=[]) - mock_bridge.get_document_symbols = AsyncMock(return_value=[]) - - builder = LspGraphBuilder(max_depth=2, max_nodes=100) - - # Build graph from seed - graph = await builder.build_from_seeds([sample_symbol], mock_bridge) - - # Should only have the seed node - assert len(graph.nodes) == 1 - assert sample_symbol.id in graph.nodes - assert len(graph.edges) == 0 - - 
@pytest.mark.asyncio - async def test_already_visited_node_returns_empty(self, sample_symbol: CodeSymbolNode): - """Attempting to expand an already-visited node should return empty immediately.""" - from codexlens.lsp.lsp_graph_builder import LspGraphBuilder - - mock_bridge = AsyncMock() - # These should not be called since node is already visited - mock_bridge.get_references = AsyncMock(return_value=[]) - mock_bridge.get_call_hierarchy = AsyncMock(return_value=[]) - - builder = LspGraphBuilder() - graph = CodeAssociationGraph() - graph.add_node(sample_symbol) - visited = {sample_symbol.id} # Already visited - semaphore = asyncio.Semaphore(10) - - result = await builder._expand_node( - sample_symbol, - depth=0, - graph=graph, - lsp_bridge=mock_bridge, - visited=visited, - semaphore=semaphore, - ) - - assert result == [] - # Bridge methods should not have been called - mock_bridge.get_references.assert_not_called() - mock_bridge.get_call_hierarchy.assert_not_called() - - -class TestLspHttpError500: - """Test handling of HTTP 500 errors from LSP bridge (VSCode Bridge mode). 
- - Module: LspBridge._request_vscode_bridge - Mock: aiohttp response status=500 - Assert: Returns None, caller handles as failure - """ - - @pytest.mark.asyncio - async def test_request_returns_none_on_500(self): - """HTTP 500 response should result in None return value.""" - from codexlens.lsp.lsp_bridge import LspBridge - - # Create bridge in VSCode Bridge mode with mocked session - bridge = LspBridge(use_vscode_bridge=True) - - # Mock the session to return 500 error - mock_response = AsyncMock() - mock_response.status = 500 - mock_response.__aenter__ = AsyncMock(return_value=mock_response) - mock_response.__aexit__ = AsyncMock(return_value=None) - - mock_session = AsyncMock() - mock_session.post = MagicMock(return_value=mock_response) - - with patch.object(bridge, '_get_session', return_value=mock_session): - result = await bridge._request_vscode_bridge("get_references", {"file_path": "test.py"}) - - assert result is None - - @pytest.mark.asyncio - async def test_get_references_returns_empty_on_500(self, sample_symbol: CodeSymbolNode): - """get_references should return empty list on HTTP 500.""" - from codexlens.lsp.lsp_bridge import LspBridge - - bridge = LspBridge(use_vscode_bridge=True) - - # Mock _request_vscode_bridge to return None (simulating HTTP error) - with patch.object(bridge, '_request_vscode_bridge', return_value=None): - result = await bridge.get_references(sample_symbol) - - assert result == [] - - @pytest.mark.asyncio - async def test_get_definition_returns_none_on_500(self, sample_symbol: CodeSymbolNode): - """get_definition should return None on HTTP 500.""" - from codexlens.lsp.lsp_bridge import LspBridge - - bridge = LspBridge(use_vscode_bridge=True) - - with patch.object(bridge, '_request_vscode_bridge', return_value=None): - result = await bridge.get_definition(sample_symbol) - - assert result is None - - @pytest.mark.asyncio - async def test_get_hover_returns_none_on_500(self, sample_symbol: CodeSymbolNode): - """get_hover should return 
None on HTTP 500.""" - from codexlens.lsp.lsp_bridge import LspBridge - - bridge = LspBridge(use_vscode_bridge=True) - - with patch.object(bridge, '_request_vscode_bridge', return_value=None): - result = await bridge.get_hover(sample_symbol) - - assert result is None - - @pytest.mark.asyncio - async def test_graph_builder_handles_lsp_errors_gracefully(self, sample_symbol: CodeSymbolNode): - """Graph builder should handle LSP errors without crashing.""" - from codexlens.lsp.lsp_graph_builder import LspGraphBuilder - - mock_bridge = AsyncMock() - # Simulate exceptions from LSP - mock_bridge.get_references = AsyncMock(side_effect=Exception("LSP Error")) - mock_bridge.get_call_hierarchy = AsyncMock(side_effect=Exception("LSP Error")) - - builder = LspGraphBuilder() - - # Should not raise, should return graph with just the seed - graph = await builder.build_from_seeds([sample_symbol], mock_bridge) - - assert len(graph.nodes) == 1 - assert sample_symbol.id in graph.nodes - - -# --------------------------------------------------------------------------- -# P2 Tests - Important Edge Cases -# --------------------------------------------------------------------------- - -class TestSymbolWithEmptyFilePath: - """Test handling of symbols with empty file_path (VSCode Bridge mode). 
- - Module: LspBridge.get_references - Input: CodeSymbolNode with file_path="" - Assert: Does not send request, returns [] early - """ - - @pytest.mark.asyncio - async def test_get_references_with_empty_path_symbol(self, symbol_with_empty_path: CodeSymbolNode): - """get_references with empty file_path should handle gracefully.""" - from codexlens.lsp.lsp_bridge import LspBridge - - bridge = LspBridge(use_vscode_bridge=True) - - # Mock _request_vscode_bridge - it should still work but with empty path - mock_result = [] - with patch.object(bridge, '_request_vscode_bridge', return_value=mock_result) as mock_req: - result = await bridge.get_references(symbol_with_empty_path) - - # Should return empty list - assert result == [] - # The request was still made (current implementation doesn't pre-validate) - # This documents current behavior - might want to add validation - - @pytest.mark.asyncio - async def test_cache_with_empty_path_symbol(self, symbol_with_empty_path: CodeSymbolNode): - """Cache operations with empty file_path should not crash.""" - from codexlens.lsp.lsp_bridge import LspBridge - - bridge = LspBridge() - - # Cache should handle empty path (mtime check returns 0.0) - cache_key = f"refs:{symbol_with_empty_path.id}" - bridge._cache(cache_key, "", []) # Empty path - - # Should be able to check cache without crashing - is_cached = bridge._is_cached(cache_key, "") - # Note: May or may not be cached depending on mtime behavior - assert isinstance(is_cached, bool) - - -class TestVeryDeepGraphStructure: - """Test graph building with very deep reference chains. 
- - Module: LspGraphBuilder.build_from_seeds - Input: max_depth=10 - Mock: LspBridge produces long chain of references - Assert: Expansion stops cleanly at max_depth - """ - - @pytest.mark.asyncio - async def test_expansion_stops_at_max_depth(self, valid_range: Range): - """Graph expansion should stop at max_depth.""" - from codexlens.lsp.lsp_bridge import Location - from codexlens.lsp.lsp_graph_builder import LspGraphBuilder - - # Create a chain of symbols: seed -> ref1 -> ref2 -> ... -> refN - max_depth = 3 # Use small depth for testing - - def create_mock_refs(symbol: CodeSymbolNode) -> List[Location]: - """Create a single reference pointing to next in chain.""" - depth = int(symbol.id.split(":")[-1]) # Extract depth from ID - if depth >= max_depth + 5: # Chain goes deeper than max_depth - return [] - next_depth = depth + 1 - return [Location( - file_path=f"test/file_{next_depth}.py", - line=1, - character=0, - )] - - mock_bridge = AsyncMock() - mock_bridge.get_references = AsyncMock(side_effect=lambda s: create_mock_refs(s)) - mock_bridge.get_call_hierarchy = AsyncMock(return_value=[]) - mock_bridge.get_document_symbols = AsyncMock(return_value=[]) - - # Seed at depth 0 - seed = CodeSymbolNode( - id="test/file_0.py:seed:0", - name="seed", - kind="function", - file_path="test/file_0.py", - range=valid_range, - ) - - builder = LspGraphBuilder(max_depth=max_depth, max_nodes=100) - graph = await builder.build_from_seeds([seed], mock_bridge) - - # Graph should not exceed max_depth + 1 nodes (seed + max_depth levels) - # Actual count depends on how references are resolved - assert len(graph.nodes) <= max_depth + 2 # Some tolerance for edge cases - - @pytest.mark.asyncio - async def test_expansion_stops_at_max_nodes(self, valid_range: Range): - """Graph expansion should stop when max_nodes is reached.""" - from codexlens.lsp.lsp_bridge import Location - from codexlens.lsp.lsp_graph_builder import LspGraphBuilder - - call_count = [0] - - def create_many_refs(symbol: 
CodeSymbolNode) -> List[Location]: - """Create multiple references to generate many nodes.""" - call_count[0] += 1 - # Return multiple refs to rapidly grow the graph - return [ - Location(file_path=f"test/ref_{call_count[0]}_{i}.py", line=1, character=0) - for i in range(5) - ] - - mock_bridge = AsyncMock() - mock_bridge.get_references = AsyncMock(side_effect=create_many_refs) - mock_bridge.get_call_hierarchy = AsyncMock(return_value=[]) - mock_bridge.get_document_symbols = AsyncMock(return_value=[]) - - seed = CodeSymbolNode( - id="test/seed.py:seed:0", - name="seed", - kind="function", - file_path="test/seed.py", - range=valid_range, - ) - - max_nodes = 10 - builder = LspGraphBuilder(max_depth=100, max_nodes=max_nodes) # High depth, low nodes - graph = await builder.build_from_seeds([seed], mock_bridge) - - # Graph should not exceed max_nodes - assert len(graph.nodes) <= max_nodes - - -class TestNodeIdWithSpecialCharacters: - """Test node ID creation with special characters. - - Module: LspGraphBuilder._create_node_id - Input: file_path="a/b/c", name="", line=10 - Assert: ID successfully created as "a/b/c::10" - """ - - def test_create_node_id_with_special_name(self): - """Node ID should handle special characters in name.""" - from codexlens.lsp.lsp_graph_builder import LspGraphBuilder - - builder = LspGraphBuilder() - - # Test with angle brackets (common in Java/Kotlin constructors) - node_id = builder._create_node_id("a/b/c", "", 10) - assert node_id == "a/b/c::10" - - # Test with other special characters - node_id = builder._create_node_id("src/file.py", "__init__", 1) - assert node_id == "src/file.py:__init__:1" - - # Test with spaces (should preserve as-is) - node_id = builder._create_node_id("my path/file.ts", "my func", 5) - assert node_id == "my path/file.ts:my func:5" - - def test_create_node_id_with_windows_path(self): - """Node ID should handle Windows-style paths.""" - from codexlens.lsp.lsp_graph_builder import LspGraphBuilder - - builder = 
LspGraphBuilder() - - # Windows path with backslashes - node_id = builder._create_node_id("C:\\Users\\test\\file.py", "main", 1) - assert "main" in node_id - assert "1" in node_id - - def test_create_node_id_with_unicode(self): - """Node ID should handle unicode characters.""" - from codexlens.lsp.lsp_graph_builder import LspGraphBuilder - - builder = LspGraphBuilder() - - # Unicode in name - node_id = builder._create_node_id("src/file.py", "func_name", 10) - assert node_id == "src/file.py:func_name:10" - - def test_code_symbol_node_id_format(self): - """CodeSymbolNode.create_id should match LspGraphBuilder format.""" - from codexlens.lsp.lsp_graph_builder import LspGraphBuilder - - builder = LspGraphBuilder() - - # Both should produce the same format - builder_id = builder._create_node_id("path/file.py", "func", 10) - symbol_id = CodeSymbolNode.create_id("path/file.py", "func", 10) - - assert builder_id == symbol_id - - -# --------------------------------------------------------------------------- -# P3 Tests - Additional Edge Cases (if time allows) -# --------------------------------------------------------------------------- - -class TestCacheLruEviction: - """Test LRU cache eviction behavior. 
- - Module: LspBridge._cache - Input: max_cache_size=3, add 5 entries - Assert: Only most recent 3 entries remain - """ - - def test_cache_evicts_oldest_entries(self): - """Cache should evict oldest entries when at capacity.""" - from codexlens.lsp.lsp_bridge import LspBridge - - bridge = LspBridge(max_cache_size=3) - - # Add 5 entries (exceeds max of 3) - for i in range(5): - bridge._cache(f"key_{i}", "test.py", f"data_{i}") - - # Should only have 3 entries - assert len(bridge.cache) == 3 - - # Oldest entries (key_0, key_1) should be evicted - assert "key_0" not in bridge.cache - assert "key_1" not in bridge.cache - - # Newest entries should remain - assert "key_2" in bridge.cache - assert "key_3" in bridge.cache - assert "key_4" in bridge.cache - - def test_cache_moves_accessed_entry_to_end(self): - """Accessing a cached entry should move it to end (LRU behavior).""" - from codexlens.lsp.lsp_bridge import LspBridge - - bridge = LspBridge(max_cache_size=3) - - # Add 3 entries - bridge._cache("key_0", "test.py", "data_0") - bridge._cache("key_1", "test.py", "data_1") - bridge._cache("key_2", "test.py", "data_2") - - # Access key_0 (should move to end) - with patch.object(bridge, '_get_file_mtime', return_value=0.0): - bridge._is_cached("key_0", "test.py") - - # Add new entry - key_1 should be evicted (was least recently used) - bridge._cache("key_3", "test.py", "data_3") - - assert len(bridge.cache) == 3 - assert "key_0" in bridge.cache # Was accessed, moved to end - assert "key_1" not in bridge.cache # Was evicted - assert "key_2" in bridge.cache - assert "key_3" in bridge.cache - - -class TestConcurrentCacheAccess: - """Test thread-safety of cache operations. 
- - Module: LspBridge - Test: Multiple concurrent requests access/update cache - Assert: No race conditions, cache remains consistent - """ - - @pytest.mark.asyncio - async def test_concurrent_cache_operations(self, valid_range: Range): - """Multiple concurrent requests should not corrupt cache.""" - from codexlens.lsp.lsp_bridge import LspBridge - - bridge = LspBridge(max_cache_size=100) - - async def cache_operation(i: int) -> None: - """Simulate a cache read/write operation.""" - key = f"key_{i % 10}" # Reuse keys to create contention - file_path = f"file_{i}.py" - - # Check cache - bridge._is_cached(key, file_path) - - # Small delay to increase contention likelihood - await asyncio.sleep(0.001) - - # Write to cache - bridge._cache(key, file_path, f"data_{i}") - - # Run many concurrent operations - tasks = [cache_operation(i) for i in range(50)] - await asyncio.gather(*tasks) - - # Cache should be in consistent state - assert len(bridge.cache) <= bridge.max_cache_size - - # All entries should be valid CacheEntry objects - for key, entry in bridge.cache.items(): - assert hasattr(entry, 'data') - assert hasattr(entry, 'cached_at') - assert hasattr(entry, 'file_mtime') - - -class TestGraphWithCircularReferences: - """Test graph handling of circular reference patterns. 
- - Module: LspGraphBuilder - Mock: A -> B -> C -> A circular reference - Assert: visited set prevents infinite loop - """ - - @pytest.mark.asyncio - async def test_circular_references_do_not_loop_infinitely(self, valid_range: Range): - """Circular references should not cause infinite loops.""" - from codexlens.lsp.lsp_bridge import Location - from codexlens.lsp.lsp_graph_builder import LspGraphBuilder - - # Create circular reference pattern: A -> B -> C -> A - symbol_a = CodeSymbolNode( - id="file.py:A:1", name="A", kind="function", - file_path="file.py", range=valid_range, - ) - symbol_b = CodeSymbolNode( - id="file.py:B:10", name="B", kind="function", - file_path="file.py", range=valid_range, - ) - symbol_c = CodeSymbolNode( - id="file.py:C:20", name="C", kind="function", - file_path="file.py", range=valid_range, - ) - - ref_map = { - "file.py:A:1": [Location(file_path="file.py", line=10, character=0)], # A -> B - "file.py:B:10": [Location(file_path="file.py", line=20, character=0)], # B -> C - "file.py:C:20": [Location(file_path="file.py", line=1, character=0)], # C -> A (circular) - } - - def get_refs(symbol: CodeSymbolNode) -> List[Location]: - return ref_map.get(symbol.id, []) - - mock_bridge = AsyncMock() - mock_bridge.get_references = AsyncMock(side_effect=get_refs) - mock_bridge.get_call_hierarchy = AsyncMock(return_value=[]) - mock_bridge.get_document_symbols = AsyncMock(return_value=[ - {"name": "A", "kind": 12, "range": {"start": {"line": 0}, "end": {"line": 5}}}, - {"name": "B", "kind": 12, "range": {"start": {"line": 9}, "end": {"line": 15}}}, - {"name": "C", "kind": 12, "range": {"start": {"line": 19}, "end": {"line": 25}}}, - ]) - - builder = LspGraphBuilder(max_depth=10, max_nodes=100) - - # This should complete without hanging - graph = await asyncio.wait_for( - builder.build_from_seeds([symbol_a], mock_bridge), - timeout=5.0 # Should complete quickly, timeout is just safety - ) - - # Graph should contain the nodes without duplicates - assert 
len(graph.nodes) >= 1 # At least the seed - # No infinite loop occurred (we reached this point) - - -class TestRequestTimeoutHandling: - """Test timeout handling in LSP requests (VSCode Bridge mode).""" - - @pytest.mark.asyncio - async def test_timeout_returns_none(self, sample_symbol: CodeSymbolNode): - """Request timeout should return None gracefully.""" - from codexlens.lsp.lsp_bridge import LspBridge - - bridge = LspBridge(timeout=0.001, use_vscode_bridge=True) # Very short timeout - - # Mock session to raise TimeoutError - mock_response = AsyncMock() - mock_response.__aenter__ = AsyncMock(side_effect=asyncio.TimeoutError()) - mock_response.__aexit__ = AsyncMock(return_value=None) - - mock_session = AsyncMock() - mock_session.post = MagicMock(return_value=mock_response) - - with patch.object(bridge, '_get_session', return_value=mock_session): - result = await bridge._request_vscode_bridge("get_references", {}) - - assert result is None - - -class TestConnectionRefusedHandling: - """Test handling when VSCode Bridge is not running.""" - - @pytest.mark.asyncio - async def test_connection_refused_returns_none(self): - """Connection refused should return None gracefully.""" - pytest.importorskip("aiohttp") - import aiohttp - from codexlens.lsp.lsp_bridge import LspBridge - - bridge = LspBridge(use_vscode_bridge=True) - - # Mock session to raise ClientConnectorError - mock_session = AsyncMock() - mock_session.post = MagicMock( - side_effect=aiohttp.ClientConnectorError( - MagicMock(), OSError("Connection refused") - ) - ) - - with patch.object(bridge, '_get_session', return_value=mock_session): - result = await bridge._request_vscode_bridge("get_references", {}) - - assert result is None - - -class TestInvalidLspResponses: - """Test handling of malformed LSP responses (VSCode Bridge mode).""" - - @pytest.mark.asyncio - async def test_malformed_json_response(self, sample_symbol: CodeSymbolNode): - """Malformed response should be handled gracefully.""" - from 
codexlens.lsp.lsp_bridge import LspBridge - - bridge = LspBridge(use_vscode_bridge=True) - - # Response without expected structure - with patch.object(bridge, '_request_vscode_bridge', return_value={"unexpected": "structure"}): - result = await bridge.get_references(sample_symbol) - - # Should return empty list, not crash - assert result == [] - - @pytest.mark.asyncio - async def test_null_result_in_response(self, sample_symbol: CodeSymbolNode): - """Null/None result should be handled gracefully.""" - from codexlens.lsp.lsp_bridge import LspBridge - - bridge = LspBridge(use_vscode_bridge=True) - - with patch.object(bridge, '_request_vscode_bridge', return_value=None): - refs = await bridge.get_references(sample_symbol) - defn = await bridge.get_definition(sample_symbol) - hover = await bridge.get_hover(sample_symbol) - - assert refs == [] - assert defn is None - assert hover is None - - -class TestLocationParsing: - """Test Location parsing from various LSP response formats.""" - - def test_location_from_file_uri_unix(self): - """Parse Location from Unix-style file:// URI.""" - from codexlens.lsp.lsp_bridge import Location - - data = { - "uri": "file:///home/user/project/file.py", - "range": { - "start": {"line": 9, "character": 4}, - "end": {"line": 9, "character": 10}, - } - } - - loc = Location.from_lsp_response(data) - - assert loc.file_path == "/home/user/project/file.py" - assert loc.line == 10 # Converted from 0-based to 1-based - assert loc.character == 5 - - def test_location_from_file_uri_windows(self): - """Parse Location from Windows-style file:// URI.""" - from codexlens.lsp.lsp_bridge import Location - - data = { - "uri": "file:///C:/Users/test/project/file.py", - "range": { - "start": {"line": 0, "character": 0}, - "end": {"line": 0, "character": 5}, - } - } - - loc = Location.from_lsp_response(data) - - assert loc.file_path == "C:/Users/test/project/file.py" - assert loc.line == 1 - assert loc.character == 1 - - def 
test_location_from_file_uri_windows_percent_encoded_drive(self): - """Parse Location from percent-encoded Windows drive URIs (pyright-style).""" - from codexlens.lsp.lsp_bridge import Location - - data = { - "uri": "file:///d%3A/Claude_dms3/codex-lens/src/codexlens/api/semantic.py", - "range": { - "start": {"line": 18, "character": 3}, - "end": {"line": 18, "character": 10}, - }, - } - - loc = Location.from_lsp_response(data) - - assert loc.file_path == "d:/Claude_dms3/codex-lens/src/codexlens/api/semantic.py" - assert loc.line == 19 # 0-based -> 1-based - assert loc.character == 4 - - def test_location_from_direct_fields(self): - """Parse Location from direct field format.""" - from codexlens.lsp.lsp_bridge import Location - - data = { - "file_path": "/path/to/file.py", - "line": 5, - "character": 10, - } - - loc = Location.from_lsp_response(data) - - assert loc.file_path == "/path/to/file.py" - assert loc.line == 5 - assert loc.character == 10 diff --git a/codex-lens/tests/unit/lsp/test_lsp_graph_builder.py b/codex-lens/tests/unit/lsp/test_lsp_graph_builder.py deleted file mode 100644 index 799fde85..00000000 --- a/codex-lens/tests/unit/lsp/test_lsp_graph_builder.py +++ /dev/null @@ -1,549 +0,0 @@ -"""Unit tests for LspGraphBuilder. - -This module tests the LspGraphBuilder class responsible for building -code association graphs by BFS expansion from seed symbols using LSP. 
-""" - -from __future__ import annotations - -import asyncio -from typing import Any, Dict, List -from unittest.mock import AsyncMock, MagicMock, patch - -import pytest - -from codexlens.hybrid_search.data_structures import ( - CallHierarchyItem, - CodeAssociationGraph, - CodeSymbolNode, - Range, -) -from codexlens.lsp.lsp_bridge import Location, LspBridge -from codexlens.lsp.lsp_graph_builder import LspGraphBuilder - - -@pytest.fixture -def mock_lsp_bridge() -> AsyncMock: - """Create a mock LspBridge with async methods.""" - bridge = AsyncMock(spec=LspBridge) - bridge.get_references = AsyncMock(return_value=[]) - bridge.get_call_hierarchy = AsyncMock(return_value=[]) - bridge.get_document_symbols = AsyncMock(return_value=[]) - return bridge - - -@pytest.fixture -def seed_nodes() -> List[CodeSymbolNode]: - """Create seed nodes for testing.""" - return [ - CodeSymbolNode( - id="main.py:main:1", - name="main", - kind="function", - file_path="main.py", - range=Range( - start_line=1, - start_character=0, - end_line=10, - end_character=0, - ), - ) - ] - - -@pytest.fixture -def reference_location() -> Location: - """Create a reference location for testing.""" - return Location( - file_path="utils.py", - line=5, - character=10, - ) - - -@pytest.fixture -def call_hierarchy_item() -> CallHierarchyItem: - """Create a call hierarchy item for testing.""" - return CallHierarchyItem( - name="caller_func", - kind="function", - file_path="caller.py", - range=Range( - start_line=20, - start_character=0, - end_line=30, - end_character=0, - ), - detail="Calls main()", - ) - - -class TestSingleLevelGraphExpansion: - """P0: Test single level graph expansion with max_depth=1.""" - - @pytest.mark.asyncio - async def test_single_level_graph_expansion( - self, - mock_lsp_bridge: AsyncMock, - seed_nodes: List[CodeSymbolNode], - reference_location: Location, - call_hierarchy_item: CallHierarchyItem, - ) -> None: - """Test BFS expansion at depth 1 produces correct graph structure. 
- - Input: max_depth=1, single seed node - Mock: LspBridge returns 1 reference + 1 incoming call for seed only - Assert: Graph contains 3 nodes (seed, ref, call) and 2 edges from seed - """ - call_count = {"refs": 0, "calls": 0} - - async def mock_get_references(node: CodeSymbolNode) -> List[Location]: - """Return references only for the seed node.""" - call_count["refs"] += 1 - if node.file_path == "main.py": - return [reference_location] - return [] # No references for expanded nodes - - async def mock_get_call_hierarchy(node: CodeSymbolNode) -> List[CallHierarchyItem]: - """Return call hierarchy only for the seed node.""" - call_count["calls"] += 1 - if node.file_path == "main.py": - return [call_hierarchy_item] - return [] # No call hierarchy for expanded nodes - - mock_lsp_bridge.get_references.side_effect = mock_get_references - mock_lsp_bridge.get_call_hierarchy.side_effect = mock_get_call_hierarchy - - # Mock document symbols to provide symbol info for locations - mock_lsp_bridge.get_document_symbols.return_value = [ - { - "name": "helper_func", - "kind": 12, # function - "range": { - "start": {"line": 4, "character": 0}, - "end": {"line": 10, "character": 0}, - }, - } - ] - - builder = LspGraphBuilder(max_depth=1, max_nodes=100, max_concurrent=10) - graph = await builder.build_from_seeds(seed_nodes, mock_lsp_bridge) - - # Verify graph structure - assert len(graph.nodes) == 3, f"Expected 3 nodes, got {len(graph.nodes)}: {list(graph.nodes.keys())}" - assert len(graph.edges) == 2, f"Expected 2 edges, got {len(graph.edges)}: {graph.edges}" - - # Verify seed node is present - assert "main.py:main:1" in graph.nodes - - # Verify edges exist with correct relationship types - edge_types = [edge[2] for edge in graph.edges] - assert "references" in edge_types, "Expected 'references' edge" - assert "calls" in edge_types, "Expected 'calls' edge" - - # Verify expansion was called for seed and expanded nodes - # (nodes at depth 1 should not be expanded beyond 
max_depth=1) - assert call_count["refs"] >= 1, "get_references should be called at least once" - - -class TestMaxNodesBoundary: - """P0: Test max_nodes boundary stops expansion.""" - - @pytest.mark.asyncio - async def test_max_nodes_boundary( - self, - mock_lsp_bridge: AsyncMock, - seed_nodes: List[CodeSymbolNode], - ) -> None: - """Test graph expansion stops when max_nodes is reached. - - Input: max_nodes=5 - Mock: LspBridge returns many references - Assert: Graph expansion stops at 5 nodes - """ - # Create many reference locations - many_refs = [ - Location(file_path=f"file{i}.py", line=i, character=0) - for i in range(20) - ] - mock_lsp_bridge.get_references.return_value = many_refs - mock_lsp_bridge.get_call_hierarchy.return_value = [] - mock_lsp_bridge.get_document_symbols.return_value = [] - - builder = LspGraphBuilder(max_depth=10, max_nodes=5, max_concurrent=10) - graph = await builder.build_from_seeds(seed_nodes, mock_lsp_bridge) - - # Verify node count does not exceed max_nodes - assert len(graph.nodes) <= 5, ( - f"Expected at most 5 nodes, got {len(graph.nodes)}" - ) - - -class TestMaxDepthBoundary: - """P1: Test max_depth boundary limits BFS expansion.""" - - @pytest.mark.asyncio - async def test_max_depth_boundary( - self, - mock_lsp_bridge: AsyncMock, - seed_nodes: List[CodeSymbolNode], - ) -> None: - """Test BFS queue does not add nodes beyond max_depth. 
- - Input: max_depth=2 - Mock: Multi-level expansion responses - Assert: BFS queue stops adding new nodes when depth > 2 - """ - # Track which depths are expanded - expanded_depths = set() - - def create_ref_for_depth(depth: int) -> Location: - return Location( - file_path=f"depth{depth}.py", - line=depth * 10 + 1, - character=0, - ) - - async def mock_get_references(node: CodeSymbolNode) -> List[Location]: - """Return references based on node's apparent depth.""" - # Determine which depth level this node represents - if node.file_path == "main.py": - expanded_depths.add(0) - return [create_ref_for_depth(1)] - elif "depth1" in node.file_path: - expanded_depths.add(1) - return [create_ref_for_depth(2)] - elif "depth2" in node.file_path: - expanded_depths.add(2) - return [create_ref_for_depth(3)] - elif "depth3" in node.file_path: - expanded_depths.add(3) - return [create_ref_for_depth(4)] - return [] - - mock_lsp_bridge.get_references.side_effect = mock_get_references - mock_lsp_bridge.get_call_hierarchy.return_value = [] - mock_lsp_bridge.get_document_symbols.return_value = [] - - builder = LspGraphBuilder(max_depth=2, max_nodes=100, max_concurrent=10) - graph = await builder.build_from_seeds(seed_nodes, mock_lsp_bridge) - - # Collect file paths from graph - node_files = [node.file_path for node in graph.nodes.values()] - - # Should have: seed (main.py), depth1 (from seed expansion), depth2 (from depth1 expansion) - # depth3 should only be added to graph but NOT expanded (depth > max_depth=2) - assert "main.py" in node_files, "Seed node should be in graph" - assert any("depth1" in f for f in node_files), "Depth 1 node should be in graph" - assert any("depth2" in f for f in node_files), "Depth 2 node should be in graph" - - # The depth3 node might be added to the graph (from depth2 expansion) - # but should NOT be expanded (no depth4 nodes should exist) - depth4_nodes = [f for f in node_files if "depth4" in f] - assert len(depth4_nodes) == 0, ( - f"Nodes beyond 
max_depth should not be expanded: {depth4_nodes}" - ) - - # Verify expansion didn't go to depth 3 (would mean depth4 nodes were created) - # The depth 3 node itself may be in the graph but shouldn't have been expanded - assert 3 not in expanded_depths or 4 not in expanded_depths, ( - f"Expansion should stop at max_depth, expanded depths: {expanded_depths}" - ) - - -class TestConcurrentSemaphore: - """P1: Test concurrent semaphore limits parallel expansion.""" - - @pytest.mark.asyncio - async def test_concurrent_semaphore( - self, - mock_lsp_bridge: AsyncMock, - ) -> None: - """Test that concurrent node expansions are limited by semaphore. - - Input: max_concurrent=3, 10 nodes in queue - Assert: Simultaneous _expand_node calls never exceed 3 - """ - concurrent_count = {"current": 0, "max_seen": 0} - lock = asyncio.Lock() - - # Create multiple seed nodes - seeds = [ - CodeSymbolNode( - id=f"file{i}.py:func{i}:{i}", - name=f"func{i}", - kind="function", - file_path=f"file{i}.py", - range=Range( - start_line=i, - start_character=0, - end_line=i + 10, - end_character=0, - ), - ) - for i in range(10) - ] - - original_get_refs = mock_lsp_bridge.get_references - - async def tracked_get_references(node: CodeSymbolNode) -> List[Location]: - """Track concurrent calls to verify semaphore behavior.""" - async with lock: - concurrent_count["current"] += 1 - if concurrent_count["current"] > concurrent_count["max_seen"]: - concurrent_count["max_seen"] = concurrent_count["current"] - - # Simulate some work - await asyncio.sleep(0.01) - - async with lock: - concurrent_count["current"] -= 1 - - return [] - - mock_lsp_bridge.get_references.side_effect = tracked_get_references - mock_lsp_bridge.get_call_hierarchy.return_value = [] - mock_lsp_bridge.get_document_symbols.return_value = [] - - builder = LspGraphBuilder(max_depth=1, max_nodes=100, max_concurrent=3) - await builder.build_from_seeds(seeds, mock_lsp_bridge) - - # Verify concurrent calls never exceeded max_concurrent - assert 
concurrent_count["max_seen"] <= 3, ( - f"Max concurrent calls ({concurrent_count['max_seen']}) exceeded limit (3)" - ) - - -class TestDocumentSymbolCache: - """P1: Test document symbol caching for same file locations.""" - - @pytest.mark.asyncio - async def test_document_symbol_cache( - self, - mock_lsp_bridge: AsyncMock, - seed_nodes: List[CodeSymbolNode], - ) -> None: - """Test that document symbols are cached per file. - - Input: 2 locations from the same file - Mock: get_document_symbols only called once - Assert: Second location lookup uses cache - """ - # Two references from the same file - refs_same_file = [ - Location(file_path="shared.py", line=10, character=0), - Location(file_path="shared.py", line=20, character=0), - ] - - mock_lsp_bridge.get_references.return_value = refs_same_file - mock_lsp_bridge.get_call_hierarchy.return_value = [] - - doc_symbols_call_count = {"count": 0} - - async def mock_get_document_symbols(file_path: str) -> List[Dict[str, Any]]: - doc_symbols_call_count["count"] += 1 - return [ - { - "name": "symbol_at_10", - "kind": 12, - "range": { - "start": {"line": 9, "character": 0}, - "end": {"line": 15, "character": 0}, - }, - }, - { - "name": "symbol_at_20", - "kind": 12, - "range": { - "start": {"line": 19, "character": 0}, - "end": {"line": 25, "character": 0}, - }, - }, - ] - - mock_lsp_bridge.get_document_symbols.side_effect = mock_get_document_symbols - - builder = LspGraphBuilder(max_depth=1, max_nodes=100, max_concurrent=10) - await builder.build_from_seeds(seed_nodes, mock_lsp_bridge) - - # get_document_symbols should be called only once for shared.py - assert doc_symbols_call_count["count"] == 1, ( - f"Expected 1 call to get_document_symbols, got {doc_symbols_call_count['count']}" - ) - - # Verify cache contains the file - assert "shared.py" in builder._document_symbols_cache - - @pytest.mark.asyncio - async def test_cache_cleared_between_builds( - self, - mock_lsp_bridge: AsyncMock, - seed_nodes: List[CodeSymbolNode], - ) 
-> None: - """Test that clear_cache removes cached document symbols.""" - mock_lsp_bridge.get_references.return_value = [] - mock_lsp_bridge.get_call_hierarchy.return_value = [] - mock_lsp_bridge.get_document_symbols.return_value = [] - - builder = LspGraphBuilder(max_depth=1, max_nodes=100, max_concurrent=10) - - # Manually populate cache - builder._document_symbols_cache["test.py"] = [{"name": "cached"}] - - # Clear cache - builder.clear_cache() - - # Verify cache is empty - assert len(builder._document_symbols_cache) == 0 - - -class TestNodeExpansionErrorHandling: - """P2: Test error handling during node expansion.""" - - @pytest.mark.asyncio - async def test_node_expansion_error_handling( - self, - mock_lsp_bridge: AsyncMock, - ) -> None: - """Test that errors in node expansion are logged and other nodes continue. - - Mock: get_references throws exception for specific node - Assert: Error is logged, other nodes continue expanding - """ - seeds = [ - CodeSymbolNode( - id="good.py:good:1", - name="good", - kind="function", - file_path="good.py", - range=Range(start_line=1, start_character=0, end_line=10, end_character=0), - ), - CodeSymbolNode( - id="bad.py:bad:1", - name="bad", - kind="function", - file_path="bad.py", - range=Range(start_line=1, start_character=0, end_line=10, end_character=0), - ), - ] - - async def mock_get_references(node: CodeSymbolNode) -> List[Location]: - if "bad" in node.file_path: - raise RuntimeError("Simulated LSP error") - return [Location(file_path="result.py", line=5, character=0)] - - mock_lsp_bridge.get_references.side_effect = mock_get_references - mock_lsp_bridge.get_call_hierarchy.return_value = [] - mock_lsp_bridge.get_document_symbols.return_value = [] - - builder = LspGraphBuilder(max_depth=1, max_nodes=100, max_concurrent=10) - - # Should not raise, error should be caught and logged - graph = await builder.build_from_seeds(seeds, mock_lsp_bridge) - - # Both seed nodes should be in the graph - assert "good.py:good:1" in 
graph.nodes - assert "bad.py:bad:1" in graph.nodes - - # The good node's expansion should have succeeded - # (result.py node should be present) - result_nodes = [n for n in graph.nodes.keys() if "result.py" in n] - assert len(result_nodes) >= 1, "Good node's expansion should have succeeded" - - @pytest.mark.asyncio - async def test_partial_failure_continues_expansion( - self, - mock_lsp_bridge: AsyncMock, - seed_nodes: List[CodeSymbolNode], - ) -> None: - """Test that failure in one LSP call doesn't stop other calls.""" - # References succeed, call hierarchy fails - mock_lsp_bridge.get_references.return_value = [ - Location(file_path="ref.py", line=5, character=0) - ] - mock_lsp_bridge.get_call_hierarchy.side_effect = RuntimeError("Call hierarchy failed") - mock_lsp_bridge.get_document_symbols.return_value = [] - - builder = LspGraphBuilder(max_depth=1, max_nodes=100, max_concurrent=10) - graph = await builder.build_from_seeds(seed_nodes, mock_lsp_bridge) - - # Should still have the seed and the reference node - assert len(graph.nodes) >= 2 - - # Reference edge should exist - ref_edges = [e for e in graph.edges if e[2] == "references"] - assert len(ref_edges) >= 1, "Reference edge should exist despite call hierarchy failure" - - -class TestEdgeCases: - """Additional edge case tests.""" - - @pytest.mark.asyncio - async def test_empty_seeds( - self, - mock_lsp_bridge: AsyncMock, - ) -> None: - """Test building graph with empty seed list.""" - builder = LspGraphBuilder(max_depth=2, max_nodes=100, max_concurrent=10) - graph = await builder.build_from_seeds([], mock_lsp_bridge) - - assert len(graph.nodes) == 0 - assert len(graph.edges) == 0 - - @pytest.mark.asyncio - async def test_self_referencing_node_skipped( - self, - mock_lsp_bridge: AsyncMock, - seed_nodes: List[CodeSymbolNode], - ) -> None: - """Test that self-references don't create self-loops.""" - # Reference back to the same node - mock_lsp_bridge.get_references.return_value = [ - 
Location(file_path="main.py", line=1, character=0) # Same as seed - ] - mock_lsp_bridge.get_call_hierarchy.return_value = [] - mock_lsp_bridge.get_document_symbols.return_value = [ - { - "name": "main", - "kind": 12, - "range": { - "start": {"line": 0, "character": 0}, - "end": {"line": 9, "character": 0}, - }, - } - ] - - builder = LspGraphBuilder(max_depth=1, max_nodes=100, max_concurrent=10) - graph = await builder.build_from_seeds(seed_nodes, mock_lsp_bridge) - - # Should only have the seed node, no self-loop edge - # (Note: depending on implementation, self-references may be filtered) - self_edges = [e for e in graph.edges if e[0] == e[1]] - assert len(self_edges) == 0, "Self-referencing edges should not exist" - - @pytest.mark.asyncio - async def test_visited_nodes_not_expanded_twice( - self, - mock_lsp_bridge: AsyncMock, - seed_nodes: List[CodeSymbolNode], - ) -> None: - """Test that visited nodes are not expanded multiple times.""" - expansion_calls = {"count": 0} - - async def mock_get_references(node: CodeSymbolNode) -> List[Location]: - expansion_calls["count"] += 1 - # Return same node reference each time - return [Location(file_path="shared.py", line=10, character=0)] - - mock_lsp_bridge.get_references.side_effect = mock_get_references - mock_lsp_bridge.get_call_hierarchy.return_value = [] - mock_lsp_bridge.get_document_symbols.return_value = [] - - builder = LspGraphBuilder(max_depth=3, max_nodes=100, max_concurrent=10) - await builder.build_from_seeds(seed_nodes, mock_lsp_bridge) - - # Each unique node should only be expanded once - # seed (main.py) + shared.py = 2 expansions max - assert expansion_calls["count"] <= 2, ( - f"Nodes should not be expanded multiple times, got {expansion_calls['count']} calls" - ) diff --git a/codex-lens/tests/validate_optimizations.py b/codex-lens/tests/validate_optimizations.py deleted file mode 100644 index a8445a9d..00000000 --- a/codex-lens/tests/validate_optimizations.py +++ /dev/null @@ -1,287 +0,0 @@ -""" -Manual 
validation script for performance optimizations. - -This script verifies that the optimization implementations are working correctly. -Run with: python tests/validate_optimizations.py -""" - -import json -import sqlite3 -import tempfile -import time -from pathlib import Path - -from codexlens.storage.dir_index import DirIndexStore -from codexlens.storage.registry import RegistryStore -from codexlens.storage.migration_manager import MigrationManager -from codexlens.storage.migrations import migration_001_normalize_keywords - - -def test_keyword_normalization(): - """Test normalized keywords functionality.""" - print("\n=== Testing Keyword Normalization ===") - - with tempfile.TemporaryDirectory() as tmpdir: - db_path = Path(tmpdir) / "test_index.db" - store = DirIndexStore(db_path) - store.initialize() # Create schema - - # Add a test file - # Note: add_file automatically calculates mtime and line_count - file_id = store.add_file( - name="test.py", - full_path=Path("/test/test.py"), - content="def hello(): pass", - language="python" - ) - - # Add semantic metadata with keywords - keywords = ["auth", "security", "jwt"] - store.add_semantic_metadata( - file_id=file_id, - summary="Test summary", - keywords=keywords, - purpose="Testing", - llm_tool="gemini" - ) - - conn = store._get_connection() - - # Verify keywords table populated - keyword_rows = conn.execute(""" - SELECT k.keyword - FROM file_keywords fk - JOIN keywords k ON fk.keyword_id = k.id - WHERE fk.file_id = ? - """, (file_id,)).fetchall() - - normalized_keywords = [row["keyword"] for row in keyword_rows] - print(f"✓ Keywords stored in normalized tables: {normalized_keywords}") - assert set(normalized_keywords) == set(keywords), "Keywords mismatch!" - - # Test optimized search - results = store.search_semantic_keywords("auth", use_normalized=True) - print(f"✓ Found {len(results)} file(s) with keyword 'auth'") - assert len(results) > 0, "No results found!" 
- - # Test fallback search - results_fallback = store.search_semantic_keywords("auth", use_normalized=False) - print(f"✓ Fallback search found {len(results_fallback)} file(s)") - assert len(results) == len(results_fallback), "Result count mismatch!" - - store.close() - print("✓ Keyword normalization tests PASSED") - - -def test_path_lookup_optimization(): - """Test optimized path lookup.""" - print("\n=== Testing Path Lookup Optimization ===") - - with tempfile.TemporaryDirectory() as tmpdir: - db_path = Path(tmpdir) / "test_registry.db" - store = RegistryStore(db_path) - - # Add directory mapping - store.add_dir_mapping( - source_path=Path("/a/b/c"), - index_path=Path("/tmp/index.db"), - project_id=None - ) - - # Test deep path lookup - deep_path = Path("/a/b/c/d/e/f/g/h/i/j/file.py") - - start = time.perf_counter() - result = store.find_nearest_index(deep_path) - elapsed = time.perf_counter() - start - - print(f"✓ Found nearest index in {elapsed*1000:.2f}ms") - assert result is not None, "No result found!" - assert result.source_path == Path("/a/b/c"), "Wrong path found!" 
- assert elapsed < 0.05, f"Too slow: {elapsed*1000:.2f}ms" - - store.close() - print("✓ Path lookup optimization tests PASSED") - - -def test_symbol_search_prefix_mode(): - """Test symbol search with prefix mode.""" - print("\n=== Testing Symbol Search Prefix Mode ===") - - with tempfile.TemporaryDirectory() as tmpdir: - db_path = Path(tmpdir) / "test_index.db" - store = DirIndexStore(db_path) - store.initialize() # Create schema - - # Add a test file - file_id = store.add_file( - name="test.py", - full_path=Path("/test/test.py"), - content="def hello(): pass\n" * 10, # 10 lines - language="python" - ) - - # Add symbols - store.add_symbols( - file_id=file_id, - symbols=[ - ("get_user", "function", 1, 5), - ("get_item", "function", 6, 10), - ("create_user", "function", 11, 15), - ("UserClass", "class", 16, 25), - ] - ) - - # Test prefix search - results = store.search_symbols("get", prefix_mode=True) - print(f"✓ Prefix search for 'get' found {len(results)} symbol(s)") - assert len(results) == 2, f"Expected 2 symbols, got {len(results)}" - for symbol in results: - assert symbol.name.startswith("get"), f"Symbol {symbol.name} doesn't start with 'get'" - print(f" Symbols: {[s.name for s in results]}") - - # Test substring search - results_sub = store.search_symbols("user", prefix_mode=False) - print(f"✓ Substring search for 'user' found {len(results_sub)} symbol(s)") - assert len(results_sub) == 3, f"Expected 3 symbols, got {len(results_sub)}" - print(f" Symbols: {[s.name for s in results_sub]}") - - store.close() - print("✓ Symbol search optimization tests PASSED") - - -def test_migration_001(): - """Test migration_001 execution.""" - print("\n=== Testing Migration 001 ===") - - with tempfile.TemporaryDirectory() as tmpdir: - db_path = Path(tmpdir) / "test_index.db" - store = DirIndexStore(db_path) - store.initialize() # Create schema - conn = store._get_connection() - - # Add test data to semantic_metadata - conn.execute(""" - INSERT INTO files(id, name, full_path, 
language, mtime, line_count) - VALUES(1, 'test.py', '/test.py', 'python', 0, 10) - """) - conn.execute(""" - INSERT INTO semantic_metadata(file_id, keywords) - VALUES(1, ?) - """, (json.dumps(["test", "migration", "keyword"]),)) - conn.commit() - - # Run migration - print(" Running migration_001...") - migration_001_normalize_keywords.upgrade(conn) - print(" Migration completed successfully") - - # Verify migration results - keyword_count = conn.execute(""" - SELECT COUNT(*) as c FROM file_keywords WHERE file_id=1 - """).fetchone()["c"] - - print(f"✓ Migrated {keyword_count} keywords for file_id=1") - assert keyword_count == 3, f"Expected 3 keywords, got {keyword_count}" - - # Verify keywords table - keywords = conn.execute(""" - SELECT k.keyword FROM keywords k - JOIN file_keywords fk ON k.id = fk.keyword_id - WHERE fk.file_id = 1 - """).fetchall() - keyword_list = [row["keyword"] for row in keywords] - print(f" Keywords: {keyword_list}") - - store.close() - print("✓ Migration 001 tests PASSED") - - -def test_performance_comparison(): - """Compare performance of optimized vs fallback implementations.""" - print("\n=== Performance Comparison ===") - - with tempfile.TemporaryDirectory() as tmpdir: - db_path = Path(tmpdir) / "test_index.db" - store = DirIndexStore(db_path) - store.initialize() # Create schema - - # Create test data - print(" Creating test data...") - for i in range(100): - file_id = store.add_file( - name=f"file_{i}.py", - full_path=Path(f"/test/file_{i}.py"), - content=f"def function_{i}(): pass", - language="python" - ) - - # Vary keywords - if i % 3 == 0: - keywords = ["auth", "security"] - elif i % 3 == 1: - keywords = ["database", "query"] - else: - keywords = ["api", "endpoint"] - - store.add_semantic_metadata( - file_id=file_id, - summary=f"File {i}", - keywords=keywords, - purpose="Testing", - llm_tool="gemini" - ) - - # Benchmark normalized search - print(" Benchmarking normalized search...") - start = time.perf_counter() - for _ in 
range(10): - results_norm = store.search_semantic_keywords("auth", use_normalized=True) - norm_time = time.perf_counter() - start - - # Benchmark fallback search - print(" Benchmarking fallback search...") - start = time.perf_counter() - for _ in range(10): - results_fallback = store.search_semantic_keywords("auth", use_normalized=False) - fallback_time = time.perf_counter() - start - - print(f"\n Results:") - print(f" - Normalized search: {norm_time*1000:.2f}ms (10 iterations)") - print(f" - Fallback search: {fallback_time*1000:.2f}ms (10 iterations)") - print(f" - Speedup factor: {fallback_time/norm_time:.2f}x") - print(f" - Both found {len(results_norm)} files") - - assert len(results_norm) == len(results_fallback), "Result count mismatch!" - - store.close() - print("✓ Performance comparison PASSED") - - -def main(): - """Run all validation tests.""" - print("=" * 60) - print("CodexLens Performance Optimizations Validation") - print("=" * 60) - - try: - test_keyword_normalization() - test_path_lookup_optimization() - test_symbol_search_prefix_mode() - test_migration_001() - test_performance_comparison() - - print("\n" + "=" * 60) - print("✓✓✓ ALL VALIDATION TESTS PASSED ✓✓✓") - print("=" * 60) - return 0 - - except Exception as e: - print(f"\nX VALIDATION FAILED: {e}") - import traceback - traceback.print_exc() - return 1 - - -if __name__ == "__main__": - exit(main()) diff --git a/codex-lens/verify_watcher.py b/codex-lens/verify_watcher.py deleted file mode 100644 index f64ff089..00000000 --- a/codex-lens/verify_watcher.py +++ /dev/null @@ -1,117 +0,0 @@ -#!/usr/bin/env python3 -"""Verification script for FileWatcher event filtering and debouncing.""" - -import time -from pathlib import Path -from codexlens.watcher.file_watcher import FileWatcher -from codexlens.watcher.events import WatcherConfig, FileEvent - -def test_should_index_file(): - """Test _should_index_file filtering logic.""" - print("Testing _should_index_file filtering...") - - # Create watcher 
instance - config = WatcherConfig() - watcher = FileWatcher( - root_path=Path("."), - config=config, - on_changes=lambda events: None, - ) - - # Test cases - test_cases = [ - # (path, expected_result, description) - (Path("test.py"), True, "Python file should be indexed"), - (Path("test.txt"), True, "Text file should be indexed"), - (Path("test.js"), True, "JavaScript file should be indexed"), - (Path("test.ts"), True, "TypeScript file should be indexed"), - (Path("src/test.py"), True, "Python file in subdirectory should be indexed"), - (Path(".git/config"), False, ".git files should be filtered"), - (Path("node_modules/pkg/index.js"), False, "node_modules should be filtered"), - (Path("__pycache__/test.pyc"), False, "__pycache__ should be filtered"), - (Path(".venv/lib/test.py"), False, ".venv should be filtered"), - (Path("test.unknown"), False, "Unknown extension should be filtered"), - (Path("README.md"), True, "Markdown file should be indexed"), - ] - - passed = 0 - failed = 0 - - for path, expected, description in test_cases: - result = watcher._should_index_file(path) - status = "✓" if result == expected else "✗" - - if result == expected: - passed += 1 - else: - failed += 1 - - print(f" {status} {description}") - print(f" Path: {path}, Expected: {expected}, Got: {result}") - - print(f"\nResults: {passed} passed, {failed} failed") - return failed == 0 - -def test_debounce_and_dedup(): - """Test event debouncing and deduplication.""" - print("\nTesting event debouncing and deduplication...") - - received_events = [] - - def on_changes(events): - received_events.append(events) - print(f" Received batch: {len(events)} events") - - # Create watcher with short debounce time for testing - config = WatcherConfig(debounce_ms=500) - watcher = FileWatcher( - root_path=Path("."), - config=config, - on_changes=on_changes, - ) - - # Simulate rapid events to same file (should be deduplicated) - from codexlens.watcher.events import ChangeType - - test_path = 
Path("test_file.py") - for i in range(5): - event = FileEvent( - path=test_path, - change_type=ChangeType.MODIFIED, - timestamp=time.time(), - ) - watcher._on_raw_event(event) - - # Wait for debounce - time.sleep(0.6) - - # Force flush to ensure we get the events - watcher._flush_events() - - if received_events: - batch = received_events[0] - # Should deduplicate 5 events to 1 - if len(batch) == 1: - print(" ✓ Deduplication working: 5 events reduced to 1") - return True - else: - print(f" ✗ Deduplication failed: expected 1 event, got {len(batch)}") - return False - else: - print(" ✗ No events received") - return False - -if __name__ == "__main__": - print("=" * 60) - print("FileWatcher Verification") - print("=" * 60) - - test1 = test_should_index_file() - test2 = test_debounce_and_dedup() - - print("\n" + "=" * 60) - if test1 and test2: - print("✓ All tests passed!") - else: - print("✗ Some tests failed") - print("=" * 60)