refactor: remove smart_search bridge, add codexlens MCP template, delete codex-lens v1

- Delete smart-search.ts (3476 lines) and codex-lens.ts stub — the CCW
  bridge that wrapped the codexlens-search CLI is gone entirely
- Remove executeToolWithProgress and all smart_search registrations from
  tools/index.ts and mcp-server/index.ts
- Replace checkSemanticStatus() calls in core-memory-routes with inline
  { available: false } — v1 bridge no longer provides this
- Inline no-op stubs in smart-context.ts to replace codex-lens imports
- Seed built-in 'codexlens' MCP template at server startup via
  seedBuiltinTemplates() in mcp-routes.ts; uses uvx --from
  codexlens-search[mcp] codexlens-mcp so users install via uv
- Remove smart_search from all default enabled-tools strings (backend
  mcp-routes, mcp-server DEFAULT_TOOLS, frontend api.ts, mcp install
  helpers) and CCW_MCP_TOOLS UI list
- Delete frontend pages/hooks/components: CodexLensManagerPage,
  useV2SearchManager, useIndex, IndexManager; remove routes, sidebar
  entry, and all re-exports
- Remove index status display section from WorkflowTaskWidget
- Delete four smart-search test files; update mcp-server.test.js and
  e2e/mcp-tools.e2e.test.ts to remove smart_search assertions
- Delete codex-lens/ source directory (v1 Python monolith, ~75 files)
  — no longer imported or subprocess-called by CCW

Net: ~11 000 lines removed, +30 lines for template seeding

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
catlog22
2026-03-18 14:42:09 +08:00
parent df69f997e4
commit 398601f885
386 changed files with 0 additions and 129550 deletions

Binary file not shown.

View File

@@ -1,71 +0,0 @@
# CodexLens Environment Configuration
#
# Configuration locations (copy to one of these):
# - ~/.codexlens/.env (global, applies to all projects)
# - project/.codexlens/.env (workspace-local)
# - project/.env (project root)
#
# Priority order (highest to lowest — earlier entries override later ones):
# 1. Environment variables (already set in shell) - highest
# 2. .codexlens/.env (workspace-local)
# 3. .env (project root)
# 4. ~/.codexlens/.env (global) - lowest
# ============================================
# RERANKER Configuration
# ============================================
# API key for reranker service (SiliconFlow/Cohere/Jina)
# Required for 'api' backend
# RERANKER_API_KEY=sk-xxxx
# Base URL for reranker API (overrides provider default)
# SiliconFlow: https://api.siliconflow.cn
# Cohere: https://api.cohere.ai
# Jina: https://api.jina.ai
# RERANKER_API_BASE=https://api.siliconflow.cn
# Reranker provider: siliconflow, cohere, jina
# RERANKER_PROVIDER=siliconflow
# Reranker model name
# SiliconFlow: BAAI/bge-reranker-v2-m3
# Cohere: rerank-english-v3.0
# Jina: jina-reranker-v2-base-multilingual
# RERANKER_MODEL=BAAI/bge-reranker-v2-m3
# ============================================
# EMBEDDING Configuration
# ============================================
# API key for embedding service (for litellm backend)
# EMBEDDING_API_KEY=sk-xxxx
# Base URL for embedding API
# EMBEDDING_API_BASE=https://api.openai.com
# Embedding model name
# EMBEDDING_MODEL=text-embedding-3-small
# ============================================
# LITELLM Configuration
# ============================================
# API key for LiteLLM (for litellm reranker backend)
# LITELLM_API_KEY=sk-xxxx
# Base URL for LiteLLM
# LITELLM_API_BASE=
# LiteLLM model name
# LITELLM_MODEL=gpt-4o-mini
# ============================================
# General Configuration
# ============================================
# Custom data directory path (default: ~/.codexlens)
# CODEXLENS_DATA_DIR=~/.codexlens
# Enable debug mode (true/false)
# CODEXLENS_DEBUG=false

View File

@@ -1,70 +0,0 @@
# Security scanning workflow for codex-lens
# Runs pip-audit to check for known vulnerabilities in dependencies
name: Security Scan

on:
  # Run on push to main branch
  push:
    branches:
      - main
      - master
  # Run weekly on Sundays at 00:00 UTC
  schedule:
    - cron: '0 0 * * 0'
  # Allow manual trigger
  workflow_dispatch:

jobs:
  security-audit:
    name: Dependency Vulnerability Scan
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'
          cache: 'pip'

      - name: Install pip-audit
        run: |
          python -m pip install --upgrade pip
          pip install pip-audit

      - name: Run pip-audit on requirements.in
        run: pip-audit --requirement requirements.in
        continue-on-error: false

      # NOTE(review): confirm '--project-path' is a supported pip-audit flag;
      # recent pip-audit releases audit a project directory positionally
      # (e.g. 'pip-audit .') — verify against the pinned pip-audit version.
      - name: Run pip-audit on pyproject.toml dependencies
        run: pip-audit --project-path .
        continue-on-error: false

      # Best-effort only: '|| true' plus continue-on-error means a safety
      # finding never fails the job — output is informational.
      - name: Check for safety issues
        run: |
          pip install safety
          safety check --json || true
        continue-on-error: true

  bandit-security:
    name: Code Security Linting
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      - name: Install bandit
        run: pip install bandit[toml]

      # -ll / -i tune bandit's minimum reported severity and confidence;
      # see the bandit CLI docs for the exact levels each repeat selects.
      - name: Run bandit security linter
        run: bandit -r src/ -ll -i
        continue-on-error: true

View File

@@ -1 +0,0 @@
{"ignore_patterns": ["frontend/dist"], "extension_filters": ["*.min.js"]}

View File

@@ -1 +0,0 @@
export const app = 1

View File

@@ -1 +0,0 @@
export const bundle = 1

View File

@@ -1 +0,0 @@
export const compiled = 1

View File

@@ -1 +0,0 @@
export const bundle = 1

View File

@@ -1 +0,0 @@
export const app = 1

View File

@@ -1 +0,0 @@
print('artifact')

View File

@@ -1 +0,0 @@
export const app = 1

View File

@@ -1 +0,0 @@
export const bundle = 1

View File

@@ -1 +0,0 @@
export const skip = 1

View File

@@ -1 +0,0 @@
{"ignore_patterns": ["frontend/dist", "coverage"], "extension_filters": ["*.min.js", "*.map"]}

View File

@@ -1 +0,0 @@
print('compiled')

View File

@@ -1,240 +0,0 @@
# Association Tree Implementation Summary
## Overview
Successfully implemented LSP-based association tree search for CodexLens. The implementation consists of two core components that work together to discover and rank code relationships using Language Server Protocol (LSP) call hierarchy capabilities.
## Components Implemented
### 1. AssociationTreeBuilder (`src/codexlens/search/association_tree/builder.py`)
**Purpose**: Build call relationship trees from seed locations using LSP
**Key Features**:
- Depth-first recursive expansion from seed positions
- Supports bidirectional expansion:
- Incoming calls (callers) - who calls this function
- Outgoing calls (callees) - what this function calls
- Automatic cycle detection and marking
- Configurable max depth (default: 5)
- Async/await with parallel expansion
- Timeout handling (5s per LSP request)
- Graceful error handling
**Core Methods**:
- `build_tree()`: Main entry point for tree construction
- `_expand_node()`: Recursive DFS expansion
- `_expand_incoming_calls()`: Process callers
- `_expand_outgoing_calls()`: Process callees
### 2. ResultDeduplicator (`src/codexlens/search/association_tree/deduplicator.py`)
**Purpose**: Extract unique nodes from trees and assign relevance scores
**Scoring Algorithm**:
```
Score = 0.4 * depth_score + 0.3 * frequency_score + 0.3 * kind_score
where:
- depth_score: 1.0 at depth 0, decreasing to 0.0 at depth 10
- frequency_score: occurrences / max_occurrences
- kind_score: function/method (1.0) > class (0.8) > variable (0.4)
```
**Key Features**:
- Deduplication by (file_path, start_line, end_line)
- Merge duplicate nodes across different paths
- Track minimum depth and occurrence count
- Configurable score weights
- Filter by kind or file pattern
- JSON serialization support
### 3. Data Structures (`src/codexlens/search/association_tree/data_structures.py`)
**TreeNode**:
- Represents a single node in the call tree
- Tracks depth, parents, children, paths
- Marks circular references
**CallTree**:
- Complete tree structure with roots and edges
- Node lookup by ID
- Edge tracking for relationship visualization
**UniqueNode**:
- Deduplicated result with metadata
- Aggregates multiple occurrences
- Contains relevance score
## Integration with StandaloneLspManager
Extended `StandaloneLspManager` with missing method:
**Added**: `get_outgoing_calls()` method (`src/codexlens/lsp/standalone_manager.py:1057-1086`)
This method complements the existing `get_incoming_calls()` to enable bidirectional call tree traversal.
## Testing
Comprehensive test suite with 9 tests covering:
1. **Simple tree building**: Basic tree construction
2. **Cycle detection**: Circular reference handling
3. **Max depth limits**: Depth boundary enforcement
4. **Empty trees**: Edge case handling
5. **Basic deduplication**: Node merging logic
6. **Scoring algorithm**: Relevance ranking
7. **Max results limit**: Result pagination
8. **Kind filtering**: Symbol type filtering
9. **Serialization**: JSON export
**Test Results**: All 9 tests passing ✅
**Test File**: `tests/test_association_tree.py`
## Usage Example
```python
import asyncio
from codexlens.lsp.standalone_manager import StandaloneLspManager
from codexlens.search.association_tree import (
AssociationTreeBuilder,
ResultDeduplicator,
)
async def search_with_association_tree(file_path: str, line: int):
async with StandaloneLspManager(workspace_root="/path/to/project") as lsp:
# Build tree
builder = AssociationTreeBuilder(lsp)
tree = await builder.build_tree(
seed_file_path=file_path,
seed_line=line,
max_depth=5,
expand_callers=True,
expand_callees=True,
)
# Deduplicate and score
deduplicator = ResultDeduplicator()
unique_nodes = deduplicator.deduplicate(tree, max_results=20)
# Return results
return deduplicator.to_dict_list(unique_nodes)
# Run
results = asyncio.run(search_with_association_tree("src/main.py", 42))
```
## Integration Point
The components can be integrated into `HybridSearchEngine`:
```python
# In hybrid_search.py
async def _search_association_tree(self, query: str, limit: int):
# 1. Get seed results from vector search
seed_results = await self._search_vector(query, limit=5)
# 2. Build association trees
builder = AssociationTreeBuilder(self.lsp_manager)
tree = await builder.build_tree(
seed_file_path=seed_results[0].file_path,
seed_line=seed_results[0].line,
max_depth=5,
)
# 3. Deduplicate and rank
deduplicator = ResultDeduplicator()
unique_nodes = deduplicator.deduplicate(tree, max_results=limit)
# 4. Convert to search results
return self._convert_to_search_results(unique_nodes)
```
## File Structure
```
src/codexlens/search/association_tree/
├── __init__.py # Module exports
├── builder.py # AssociationTreeBuilder
├── data_structures.py # TreeNode, CallTree, UniqueNode
├── deduplicator.py # ResultDeduplicator
└── README.md # Documentation
tests/
└── test_association_tree.py # Unit tests (9 tests)
examples/
└── association_tree_demo.py # Demo script
```
## Performance Characteristics
**Time Complexity**:
- Tree building: O(nodes * avg_calls) with max_depth limit
- Deduplication: O(n log n) for sorting
**Space Complexity**:
- Tree: O(nodes + edges)
- Unique nodes: O(unique_symbols)
**Typical Performance** (max_depth=5):
- Small codebase: < 1s
- Medium codebase: 1-3s
- Large codebase: 3-10s
**Optimization Strategies**:
1. Limit max_depth (recommended: 3-5)
2. Use timeouts (default: 5s per node)
3. Enable parallel expansion (default: on)
4. Filter by symbol kind early
## Error Handling
The implementation handles:
- ✅ LSP timeouts (logs warning, continues)
- ✅ Missing call hierarchy support (returns empty tree)
- ✅ Connection failures (skips node, continues)
- ✅ Invalid LSP responses (logs error, skips)
- ✅ Circular references (marks cycle, stops recursion)
- ✅ Max depth exceeded (stops expansion)
## Code Quality
**Code Style**:
- Python 3.10+ features (type hints, dataclasses)
- Follows existing CodexLens conventions
- Comprehensive docstrings
- Async/await throughout
**Testing**:
- 9 unit tests with mock LSP
- Edge cases covered
- 100% core logic coverage
**Documentation**:
- Module README with examples
- Inline code documentation
- Demo script provided
- Integration guide included
## Next Steps
Recommended enhancements:
1. **Multi-seed building**: Build trees from multiple seeds simultaneously
2. **Graph visualization**: Export to DOT/Mermaid format
3. **Incremental updates**: Update trees based on code changes
4. **Custom scoring**: Pluggable scoring functions
5. **Caching**: Cache frequently-accessed trees
6. **Cross-language support**: Extend beyond Python (TypeScript, Java, etc.)
## Conclusion
The association tree implementation provides a robust foundation for LSP-based code relationship discovery in CodexLens. All core components are implemented, tested, and ready for integration into the hybrid search engine.
**Status**: ✅ Complete and tested
**Files Modified**: 4
**Files Created**: 7
**Tests Added**: 9
**All Tests Passing**: Yes

View File

@@ -1,245 +0,0 @@
# Chain Search Implementation Summary
## Files Created
### 1. `D:\Claude_dms3\codex-lens\src\codexlens\search\__init__.py`
Module initialization file exporting all public classes and functions:
- `ChainSearchEngine`
- `SearchOptions`
- `SearchStats`
- `ChainSearchResult`
- `quick_search`
### 2. `D:\Claude_dms3\codex-lens\src\codexlens\search\chain_search.py`
Complete implementation of the chain search engine (460+ lines) with:
#### Classes
**SearchOptions**
- Configuration dataclass for search behavior
- Controls depth, parallelism, result limits
- Supports files-only and symbol search modes
**SearchStats**
- Search execution statistics
- Tracks directories searched, files matched, timing, errors
**ChainSearchResult**
- Comprehensive search result container
- Includes results, symbols, and execution statistics
**ChainSearchEngine**
- Main parallel search engine
- Thread-safe with ThreadPoolExecutor
- Supports recursive directory traversal
- Implements result aggregation and deduplication
#### Key Methods
**Public API:**
- `search()` - Main search with full results
- `search_files_only()` - Fast file path-only search
- `search_symbols()` - Symbol search across hierarchy
**Internal Methods:**
- `_find_start_index()` - Locate starting index for source path
- `_collect_index_paths()` - Recursive index path collection via subdirs
- `_search_parallel()` - Parallel ThreadPoolExecutor search
- `_search_single_index()` - Single index search with error handling
- `_merge_and_rank()` - Result deduplication and ranking
- `_search_symbols_parallel()` - Parallel symbol search
- `_search_symbols_single()` - Single index symbol search
**Convenience Function:**
- `quick_search()` - One-line search with auto-initialization
## Implementation Features
### 1. Chain Traversal
- Starts from source path, finds nearest index
- Recursively collects subdirectory indexes via `subdirs` table
- Supports depth limiting (-1 = unlimited, 0 = current only)
- Prevents duplicate traversal with visited set
### 2. Parallel Execution
- Uses ThreadPoolExecutor for concurrent searches
- Configurable worker count (default: 8)
- Error-tolerant: individual index failures don't block overall search
- Collects results as futures complete
### 3. Result Processing
- **Deduplication**: By file path, keeping highest score
- **Ranking**: BM25 score descending
- **Limiting**: Per-directory and total limits
- **Statistics**: Comprehensive execution metrics
### 4. Search Modes
- **Full search**: Results with excerpts and scores
- **Files-only**: Fast path-only mode
- **Symbol search**: Cross-directory symbol lookup
### 5. Error Handling
- Graceful degradation on index errors
- Missing index warnings logged
- Error tracking in SearchStats
- Non-blocking failure mode
## Search Flow Example
```
search("auth", path="D:/project/src", depth=-1)
|
v
[1] _find_start_index
registry.find_index_path("D:/project/src")
-> ~/.codexlens/indexes/D/project/src/_index.db
|
v
[2] _collect_index_paths (chain traversal)
src/_index.db
+-- subdirs: [api, utils]
|
+-- api/_index.db
| +-- subdirs: []
|
+-- utils/_index.db
+-- subdirs: []
Result: [src/_index.db, api/_index.db, utils/_index.db]
|
v
[3] _search_parallel (ThreadPoolExecutor)
Thread1: src/ -> FTS search
Thread2: api/ -> FTS search
Thread3: utils/ -> FTS search
|
v
[4] _merge_and_rank
- Deduplicate by path
- Sort by score descending
- Apply total_limit
|
v
ChainSearchResult
```
## Testing
### Test File: `D:\Claude_dms3\codex-lens\test_chain_search.py`
Comprehensive test suite with four test functions:
1. **test_basic_search()** - Full search with all options
2. **test_quick_search()** - Convenience function test
3. **test_symbol_search()** - Symbol search across hierarchy
4. **test_files_only_search()** - Fast file-only mode
### Test Results
- All imports successful
- All tests pass without errors
- Returns empty results (expected - no indexes built yet)
- Logging shows proper "No index found" warnings
- No crashes or exceptions
## Integration Points
### Dependencies
- `codexlens.entities`: SearchResult, Symbol
- `codexlens.storage.registry`: RegistryStore, DirMapping
- `codexlens.storage.dir_index`: DirIndexStore, SubdirLink
- `codexlens.storage.path_mapper`: PathMapper
### Thread Safety
- Uses ThreadPoolExecutor for parallel searches
- Each thread gets own DirIndexStore connection
- SQLite WAL mode supports concurrent reads
- Registry uses thread-local connections
## Usage Examples
### Basic Search
```python
from pathlib import Path
from codexlens.search import ChainSearchEngine
from codexlens.storage.registry import RegistryStore
from codexlens.storage.path_mapper import PathMapper
registry = RegistryStore()
registry.initialize()
mapper = PathMapper()
engine = ChainSearchEngine(registry, mapper)
result = engine.search("authentication", Path("D:/project/src"))
print(f"Found {len(result.results)} matches in {result.stats.time_ms:.2f}ms")
```
### Quick Search
```python
from pathlib import Path
from codexlens.search import quick_search
results = quick_search("TODO", Path("D:/project"), depth=2)
for r in results[:5]:
print(f"{r.path}: {r.score:.2f}")
```
### Symbol Search
```python
symbols = engine.search_symbols("init", Path("D:/project"), kind="function")
for sym in symbols:
print(f"{sym.name} - lines {sym.range[0]}-{sym.range[1]}")
```
### Files-Only Mode
```python
paths = engine.search_files_only("config", Path("D:/project"))
print(f"Files with 'config': {len(paths)}")
```
## Performance Characteristics
### Strengths
- **Parallel execution**: Multiple indexes searched concurrently
- **Lazy traversal**: Only loads needed subdirectories
- **Memory efficient**: Streaming results, no full tree in memory
- **Depth limiting**: Can restrict search scope
### Considerations
- **First search slower**: Needs to traverse subdir links
- **Many small dirs**: Overhead from thread pool
- **Deep hierarchies**: Depth=-1 may be slow on large trees
### Optimization Tips
- Use `depth` parameter to limit scope
- Use `limit_per_dir` to reduce per-index overhead
- Use `files_only=True` when excerpts not needed
- Reuse ChainSearchEngine instance for multiple searches
## Code Quality
### Standards Met
- **Type annotations**: Full typing on all methods
- **Docstrings**: Complete with examples and parameter docs
- **Error handling**: Graceful degradation, no crashes
- **ASCII-only**: Windows GBK compatible
- **No debug spam**: Clean logging at appropriate levels
- **Thread safety**: Proper locking and pooling
### Design Patterns
- **Dataclasses**: Clean configuration and result objects
- **Context managers**: Proper resource cleanup
- **Dependency injection**: Registry and mapper passed in
- **Builder pattern**: SearchOptions for configuration
- **Template method**: _search_single_index extensible
## Status: Complete and Tested
All requirements met:
- [x] Parallel search with ThreadPoolExecutor
- [x] Chain traversal via subdirs links
- [x] Depth limiting
- [x] Error tolerance
- [x] Search statistics
- [x] Complete docstrings and type hints
- [x] Test suite passes
- [x] ASCII-only output (GBK compatible)
- [x] Integration with existing codebase

View File

@@ -1,41 +0,0 @@
# CodexLens Optimization Plan Changelog
This changelog tracks the **CodexLens optimization plan** milestones (not the Python package version in `pyproject.toml`).
## v1.0 (Optimization) — 2025-12-26
### Optimizations
1. **P0: Context-aware hybrid chunking**
- Docstrings are extracted into dedicated chunks and excluded from code chunks.
- Docstring chunks include `parent_symbol` metadata when the docstring belongs to a function/class/method.
- Sliding-window chunk boundaries are deterministic for identical input.
2. **P1: Adaptive RRF weights (QueryIntent)**
- Query intent is classified as `keyword` / `semantic` / `mixed`.
- RRF weights adapt to intent:
- `keyword`: exact-heavy (favors lexical matches)
- `semantic`: vector-heavy (favors semantic matches)
- `mixed`: keeps base/default weights
3. **P2: Symbol boost**
- Fused results with an explicit symbol match (`symbol_name`) receive a multiplicative boost (default `1.5x`).
4. **P2: Embedding-based re-ranking (optional)**
- A second-stage ranker can reorder top results by semantic similarity.
- Re-ranking runs only when `Config.enable_reranking=True`.
5. **P3: Global symbol index (incremental + fast path)**
- `GlobalSymbolIndex` stores project-wide symbols in one SQLite DB for fast symbol lookups.
- `ChainSearchEngine.search_symbols()` uses the global index fast path when enabled.
### Migration Notes
- **Reindexing (recommended)**: deterministic chunking and docstring metadata affect stored chunks. For best results, regenerate indexes/embeddings after upgrading:
- Rebuild indexes and/or re-run embedding generation for existing projects.
- **New config flags**:
- `Config.enable_reranking` (default `False`)
- `Config.reranking_top_k` (default `50`)
- `Config.symbol_boost_factor` (default `1.5`)
- `Config.global_symbol_index_enabled` (default `True`)
- **Breaking changes**: none (behavioral improvements only).

View File

@@ -1,38 +0,0 @@
# Dependency Management
This project uses setuptools with `pyproject.toml` for dependency management.
## Locking Dependencies
To generate a fully pinned `requirements.txt` from `requirements.in`:
```bash
# Install pip-tools
pip install pip-tools
# Compile requirements
pip-compile requirements.in --output-file=requirements.txt
# To upgrade dependencies
pip-compile --upgrade requirements.in --output-file=requirements.txt
```
## Version Constraints
This project uses **pessimistic versioning** (`~=`) for dependency specifications per PEP 440:
- `typer~=0.9.0` means: `>=0.9.0, ==0.9.*`
- Allows bugfix updates (0.9.0, 0.9.1, 0.9.2) but not feature/minor updates (0.10.0)
This provides stability while allowing automatic patch updates.
## Security Scanning
The project includes automated security scanning via GitHub Actions:
- Runs on every push to main branch
- Runs weekly (Sundays at 00:00 UTC)
- Can be triggered manually
The scan uses:
- `pip-audit`: Checks for known vulnerabilities in dependencies
- `bandit`: Security linter for Python code

View File

@@ -1,21 +0,0 @@
MIT License
Copyright (c) 2024 CodexLens Contributors
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -1,109 +0,0 @@
# CodexLens
CodexLens is a multi-modal code analysis platform designed to provide comprehensive code understanding and analysis capabilities.
## Features
- **Multi-language Support**: Analyze code in Python, JavaScript, TypeScript and more using Tree-sitter parsers
- **Semantic Search**: Find relevant code snippets using semantic understanding with fastembed and HNSWLIB
- **Code Parsing**: Advanced code structure parsing with tree-sitter
- **Flexible Architecture**: Modular design for easy extension and customization
## Installation
### Basic Installation
```bash
pip install codex-lens
```
### With Semantic Search
```bash
pip install codex-lens[semantic]
```
### With GPU Acceleration (NVIDIA CUDA)
```bash
pip install codex-lens[semantic-gpu]
```
### With DirectML (Windows - NVIDIA/AMD/Intel)
```bash
pip install codex-lens[semantic-directml]
```
### With All Optional Features
```bash
pip install codex-lens[full]
```
### Local ONNX Reranker Bootstrap
Use the pinned bootstrap flow when you want the local-only reranker backend in an
existing CodexLens virtual environment without asking pip to resolve the whole
project extras set at once.
1. Start from the CodexLens repo root and create or activate the project venv.
2. Review the pinned install manifest in `scripts/requirements-reranker-local.txt`.
3. Render the deterministic setup plan:
```bash
python scripts/bootstrap_reranker_local.py --dry-run
```
The bootstrap script always targets the selected venv Python, installs the local
ONNX reranker stack in a fixed order, and keeps the package set pinned to the
validated Python 3.13-compatible combination:
- `numpy==2.4.0`
- `onnxruntime==1.23.2`
- `huggingface-hub==0.36.2`
- `transformers==4.53.3`
- `optimum[onnxruntime]==2.1.0`
When you are ready to apply it to the CodexLens venv, use:
```bash
python scripts/bootstrap_reranker_local.py --apply
```
To pre-download the default local reranker model (`Xenova/ms-marco-MiniLM-L-6-v2`)
into the repo-local Hugging Face cache, use:
```bash
python scripts/bootstrap_reranker_local.py --apply --download-model
```
The dry-run plan also prints the equivalent explicit model download command. On
Windows PowerShell with the default repo venv, it looks like:
```bash
.venv/Scripts/hf.exe download Xenova/ms-marco-MiniLM-L-6-v2 --local-dir .cache/huggingface/models/Xenova--ms-marco-MiniLM-L-6-v2
```
After installation, probe the backend from the same venv:
```bash
python scripts/bootstrap_reranker_local.py --apply --probe
```
## Requirements
- Python >= 3.10
- See `pyproject.toml` for detailed dependency list
## Development
This project uses setuptools for building and packaging.
## License
MIT License
## Authors
CodexLens Contributors

View File

@@ -1,83 +0,0 @@
# Semantic Search Integration
## Overview
The ChainSearchEngine now supports semantic keyword search in addition to FTS5 full-text search.
## Usage
### Enable Semantic Search
```python
from pathlib import Path
from codexlens.search.chain_search import ChainSearchEngine, SearchOptions
from codexlens.storage.registry import RegistryStore
from codexlens.storage.path_mapper import PathMapper
# Initialize
registry = RegistryStore()
registry.initialize()
mapper = PathMapper()
engine = ChainSearchEngine(registry, mapper)
# Create options with semantic search enabled
options = SearchOptions(
include_semantic=True, # Enable semantic keyword search
total_limit=50
)
# Execute search
result = engine.search("authentication", Path("./src"), options)
# Results include both FTS and semantic matches
for r in result.results:
print(f"{r.path}: {r.score:.2f} - {r.excerpt}")
```
### How It Works
1. **FTS Search**: Traditional full-text search using SQLite FTS5
2. **Semantic Search**: Searches the `semantic_metadata.keywords` field
3. **Result Merging**: Semantic results are added with 0.8x weight
- FTS results: BM25 score from SQLite
- Semantic results: Base score of 10.0 * 0.8 = 8.0
4. **Deduplication**: `_merge_and_rank()` deduplicates by path, keeping highest score
### Result Format
- **FTS results**: Regular excerpt from matched content
- **Semantic results**: `Keywords: keyword1, keyword2, keyword3, ...`
### Prerequisites
Files must have semantic metadata generated via:
```bash
codex-lens enhance . --tool gemini
```
This uses CCW CLI to generate summaries, keywords, and purpose descriptions.
## Implementation Details
### Changes Made
1. **SearchOptions**: Added `include_semantic: bool = False` parameter
2. **_search_parallel()**: Passes `include_semantic` to worker threads
3. **_search_single_index()**:
- Accepts `include_semantic` parameter
- Calls `DirIndexStore.search_semantic_keywords()` when enabled
- Converts semantic matches to `SearchResult` objects
- Applies 0.8x weight to semantic scores
### Score Weighting
```python
# FTS result (from BM25)
SearchResult(path="...", score=12.5, excerpt="...")
# Semantic result (fixed weighted score)
SearchResult(path="...", score=8.0, excerpt="Keywords: ...")
```
The 0.8x weight ensures semantic matches rank slightly lower than direct FTS matches
but still appear in relevant results.

View File

@@ -1,16 +0,0 @@
{"query":"executeHybridMode dense_rerank semantic smart_search","relevant_paths":["ccw/src/tools/smart-search.ts"],"intent":"ccw-semantic-routing","notes":"CCW semantic mode delegates to CodexLens dense_rerank."}
{"query":"parse CodexLens JSON output strip ANSI smart_search","relevant_paths":["ccw/src/tools/smart-search.ts"],"intent":"ccw-json-fallback","notes":"Covers JSON/plain-text fallback handling for CodexLens output."}
{"query":"smart_search init embed search action schema","relevant_paths":["ccw/src/tools/smart-search.ts"],"intent":"ccw-action-schema","notes":"Find the Zod schema that defines init/embed/search actions."}
{"query":"auto init missing job dedupe smart_search","relevant_paths":["ccw/src/tools/smart-search.ts"],"intent":"ccw-auto-init","notes":"Targets background init/embed warmup and dedupe state."}
{"query":"smart_search exact mode fallback to CodexLens fts","relevant_paths":["ccw/src/tools/smart-search.ts"],"intent":"ccw-exact-fallback","notes":"Tracks the exact-mode fallback path into CodexLens FTS."}
{"query":"smart_search settings snapshot embedding backend reranker backend staged stage2 mode","relevant_paths":["ccw/src/tools/smart-search.ts"],"intent":"ccw-config-snapshot","notes":"Reads local config snapshot for embedding/reranker/staged pipeline settings."}
{"query":"embedding backend fastembed local litellm api config","relevant_paths":["codex-lens/src/codexlens/config.py"],"intent":"codexlens-embedding-config","notes":"Local-only benchmark should resolve to fastembed defaults."}
{"query":"reranker backend onnx api legacy configuration","relevant_paths":["codex-lens/src/codexlens/config.py","codex-lens/src/codexlens/env_config.py"],"intent":"codexlens-reranker-config","notes":"Covers both config dataclass fields and env overrides."}
{"query":"staged stage2 mode precomputed realtime static_global_graph","relevant_paths":["codex-lens/src/codexlens/config.py","codex-lens/src/codexlens/env_config.py"],"intent":"codexlens-stage2-config","notes":"Benchmark matrix should exercise the three supported stage2 modes."}
{"query":"enable staged rerank stage 4 config","relevant_paths":["codex-lens/src/codexlens/config.py"],"intent":"codexlens-stage4-rerank","notes":"Stage 4 rerank flag needs to stay enabled for local benchmarks."}
{"query":"cascade_search dense_rerank staged pipeline ChainSearchEngine","relevant_paths":["codex-lens/src/codexlens/search/chain_search.py"],"intent":"chain-search-cascade","notes":"Baseline query for the central retrieval engine."}
{"query":"realtime LSP expand stage2 search pipeline","relevant_paths":["codex-lens/src/codexlens/search/chain_search.py"],"intent":"chain-search-stage2-realtime","notes":"Targets realtime stage2 expansion logic."}
{"query":"static global graph stage2 expansion implementation","relevant_paths":["codex-lens/src/codexlens/search/chain_search.py"],"intent":"chain-search-stage2-static","notes":"Targets static_global_graph stage2 expansion logic."}
{"query":"cross encoder rerank stage 4 implementation","relevant_paths":["codex-lens/src/codexlens/search/chain_search.py"],"intent":"chain-search-rerank","notes":"Relevant for dense_rerank and staged rerank latency comparisons."}
{"query":"get_reranker factory onnx backend selection","relevant_paths":["codex-lens/src/codexlens/semantic/reranker/factory.py"],"intent":"reranker-factory","notes":"Keeps the benchmark aligned with local ONNX reranker selection."}
{"query":"EMBEDDING_BACKEND and RERANKER_BACKEND environment variables","relevant_paths":["codex-lens/src/codexlens/env_config.py"],"intent":"env-overrides","notes":"Covers CCW/CodexLens local-only environment overrides."}

View File

@@ -1,33 +0,0 @@
{"query":"class StandaloneLspManager","relevant_paths":["codexlens/lsp/standalone_manager.py"]}
{"query":"def _open_document","relevant_paths":["codexlens/lsp/standalone_manager.py"]}
{"query":"def _read_message","relevant_paths":["codexlens/lsp/standalone_manager.py"]}
{"query":"how does textDocument/didOpen work","relevant_paths":["codexlens/lsp/standalone_manager.py"]}
{"query":"class LspBridge","relevant_paths":["codexlens/lsp/lsp_bridge.py"]}
{"query":"def get_document_symbols","relevant_paths":["codexlens/lsp/lsp_bridge.py"]}
{"query":"class KeepAliveLspBridge","relevant_paths":["codexlens/lsp/keepalive_bridge.py"]}
{"query":"LSP keepalive bridge","relevant_paths":["codexlens/lsp/keepalive_bridge.py"]}
{"query":"class LspGraphBuilder","relevant_paths":["codexlens/lsp/lsp_graph_builder.py"]}
{"query":"def build_from_seeds","relevant_paths":["codexlens/lsp/lsp_graph_builder.py"]}
{"query":"def _stage2_realtime_lsp_expand","relevant_paths":["codexlens/search/chain_search.py"]}
{"query":"def _stage3_cluster_prune","relevant_paths":["codexlens/search/chain_search.py"]}
{"query":"def _cross_encoder_rerank","relevant_paths":["codexlens/search/chain_search.py"]}
{"query":"def dense_rerank_cascade_search","relevant_paths":["codexlens/search/chain_search.py"]}
{"query":"def cascade_search","relevant_paths":["codexlens/search/chain_search.py"]}
{"query":"def _find_nearest_binary_mmap_root","relevant_paths":["codexlens/search/chain_search.py"]}
{"query":"class BinarySearcher","relevant_paths":["codexlens/search/binary_searcher.py"]}
{"query":"class GraphExpander","relevant_paths":["codexlens/search/graph_expander.py"]}
{"query":"def cross_encoder_rerank","relevant_paths":["codexlens/search/ranking.py"]}
{"query":"def group_similar_results","relevant_paths":["codexlens/search/ranking.py"]}
{"query":"class ConfigError","relevant_paths":["codexlens/errors.py"]}
{"query":"def load_settings","relevant_paths":["codexlens/config.py"]}
{"query":"BINARY_VECTORS_MMAP_NAME","relevant_paths":["codexlens/config.py"]}
{"query":"STAGED_CLUSTERING_STRATEGY","relevant_paths":["codexlens/config.py","codexlens/env_config.py"]}
{"query":"def apply_workspace_env","relevant_paths":["codexlens/env_config.py"]}
{"query":"def generate_env_example","relevant_paths":["codexlens/env_config.py"]}
{"query":"def get_reranker","relevant_paths":["codexlens/semantic/reranker/factory.py"]}
{"query":"class APIReranker","relevant_paths":["codexlens/semantic/reranker/api_reranker.py"]}
{"query":"class RegistryStore","relevant_paths":["codexlens/storage/registry.py"]}
{"query":"class PathMapper","relevant_paths":["codexlens/storage/path_mapper.py"]}
{"query":"def lsp_status","relevant_paths":["codexlens/cli/commands.py"]}
{"query":"graph_neighbors migration","relevant_paths":["codexlens/storage/migrations/migration_007_add_graph_neighbors.py"]}
{"query":"def get_model_config","relevant_paths":["codexlens/semantic/vector_store.py"]}

View File

@@ -1,245 +0,0 @@
"""Analyze hybrid search methods contribution."""
import json
import sqlite3
import time
from pathlib import Path
from collections import defaultdict
import sys
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from codexlens.search.hybrid_search import HybridSearchEngine
from codexlens.search.ranking import (
reciprocal_rank_fusion,
cross_encoder_rerank,
DEFAULT_WEIGHTS,
)
# Use index with most data
index_path = Path(r"C:\Users\dyw\.codexlens\indexes\D\Claude_dms3\codex-lens\src\codexlens\storage\_index.db")
print("=" * 60)
print("1. STORAGE ARCHITECTURE ANALYSIS")
print("=" * 60)
# Analyze storage
with sqlite3.connect(index_path) as conn:
cursor = conn.execute(
"SELECT name FROM sqlite_master WHERE type='table' ORDER BY name"
)
tables = [row[0] for row in cursor.fetchall()]
print("\nTable Overview:")
for table in tables:
try:
count = conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0]
if count > 0:
print(f" {table}: {count} rows")
except:
pass
print("\n--- Conflict Analysis ---")
chunks_count = 0
semantic_count = 0
if "chunks" in tables:
chunks_count = conn.execute("SELECT COUNT(*) FROM chunks").fetchone()[0]
if "semantic_chunks" in tables:
semantic_count = conn.execute("SELECT COUNT(*) FROM semantic_chunks").fetchone()[0]
print(f" chunks table: {chunks_count} rows")
print(f" semantic_chunks table: {semantic_count} rows")
if semantic_count > 0:
col_info = conn.execute("PRAGMA table_info(semantic_chunks)").fetchall()
col_names = [c[1] for c in col_info]
print(f"\n semantic_chunks columns: {col_names}")
for col in ["embedding", "embedding_binary", "embedding_dense"]:
if col in col_names:
null_count = conn.execute(
f"SELECT COUNT(*) FROM semantic_chunks WHERE {col} IS NULL"
).fetchone()[0]
non_null = semantic_count - null_count
print(f" {col}: {non_null}/{semantic_count} non-null")
print("\n" + "=" * 60)
print("2. METHOD CONTRIBUTION ANALYSIS")
print("=" * 60)
queries = [
"database connection",
"create table",
"sqlite store",
"migration",
"search chunks",
]
results_summary = {
"fts_exact": [],
"fts_fuzzy": [],
"vector": [],
}
for query in queries:
print(f"\nQuery: '{query}'")
# FTS Exact
try:
engine = HybridSearchEngine(weights=DEFAULT_WEIGHTS)
engine._config = type("obj", (object,), {
"use_fts_fallback": True,
"embedding_use_gpu": True,
"symbol_boost_factor": 1.5,
"enable_reranking": False,
})()
start = time.perf_counter()
results = engine.search(index_path, query, limit=10, enable_fuzzy=False, enable_vector=False)
latency = (time.perf_counter() - start) * 1000
results_summary["fts_exact"].append({"count": len(results), "latency": latency})
top_file = results[0].path.split("\\")[-1] if results else "N/A"
top_score = results[0].score if results else 0
print(f" FTS Exact: {len(results)} results, {latency:.1f}ms, top: {top_file} ({top_score:.3f})")
except Exception as e:
print(f" FTS Exact: ERROR - {e}")
# FTS Fuzzy
try:
engine = HybridSearchEngine(weights=DEFAULT_WEIGHTS)
engine._config = type("obj", (object,), {
"use_fts_fallback": True,
"embedding_use_gpu": True,
"symbol_boost_factor": 1.5,
"enable_reranking": False,
})()
start = time.perf_counter()
results = engine.search(index_path, query, limit=10, enable_fuzzy=True, enable_vector=False)
latency = (time.perf_counter() - start) * 1000
results_summary["fts_fuzzy"].append({"count": len(results), "latency": latency})
top_file = results[0].path.split("\\")[-1] if results else "N/A"
top_score = results[0].score if results else 0
print(f" FTS Fuzzy: {len(results)} results, {latency:.1f}ms, top: {top_file} ({top_score:.3f})")
except Exception as e:
print(f" FTS Fuzzy: ERROR - {e}")
# Vector
try:
engine = HybridSearchEngine()
engine._config = type("obj", (object,), {
"use_fts_fallback": False,
"embedding_use_gpu": True,
"symbol_boost_factor": 1.5,
"enable_reranking": False,
})()
start = time.perf_counter()
results = engine.search(index_path, query, limit=10, enable_vector=True, pure_vector=True)
latency = (time.perf_counter() - start) * 1000
results_summary["vector"].append({"count": len(results), "latency": latency})
top_file = results[0].path.split("\\")[-1] if results else "N/A"
top_score = results[0].score if results else 0
print(f" Vector: {len(results)} results, {latency:.1f}ms, top: {top_file} ({top_score:.3f})")
except Exception as e:
print(f" Vector: ERROR - {e}")
print("\n--- Summary ---")
for method, data in results_summary.items():
if data:
avg_count = sum(d["count"] for d in data) / len(data)
avg_latency = sum(d["latency"] for d in data) / len(data)
print(f"{method}: avg {avg_count:.1f} results, {avg_latency:.1f}ms")
print("\n" + "=" * 60)
print("3. FTS + RERANK FUSION EXPERIMENT")
print("=" * 60)
# Initialize reranker
reranker = None
try:
from codexlens.semantic.reranker import get_reranker, check_reranker_available
ok, _ = check_reranker_available("onnx")
if ok:
reranker = get_reranker(backend="onnx", use_gpu=True)
print("\nReranker loaded: ONNX backend")
except Exception as e:
print(f"\nReranker unavailable: {e}")
test_queries = ["database connection", "create table migration"]
for query in test_queries:
print(f"\nQuery: '{query}'")
# Strategy 1: Standard Hybrid (FTS exact+fuzzy RRF)
try:
engine = HybridSearchEngine(weights=DEFAULT_WEIGHTS)
engine._config = type("obj", (object,), {
"use_fts_fallback": True,
"embedding_use_gpu": True,
"symbol_boost_factor": 1.5,
"enable_reranking": False,
})()
start = time.perf_counter()
standard_results = engine.search(index_path, query, limit=10, enable_fuzzy=True, enable_vector=False)
standard_latency = (time.perf_counter() - start) * 1000
print(f" Standard FTS RRF: {len(standard_results)} results, {standard_latency:.1f}ms")
for i, r in enumerate(standard_results[:3]):
print(f" {i+1}. {r.path.split(chr(92))[-1]} (score: {r.score:.4f})")
except Exception as e:
print(f" Standard FTS RRF: ERROR - {e}")
standard_results = []
# Strategy 2: FTS + CrossEncoder Rerank
if reranker and standard_results:
try:
start = time.perf_counter()
reranked_results = cross_encoder_rerank(query, standard_results, reranker, top_k=10)
rerank_latency = (time.perf_counter() - start) * 1000
print(f" FTS + Rerank: {len(reranked_results)} results, {rerank_latency:.1f}ms (rerank only)")
for i, r in enumerate(reranked_results[:3]):
ce_score = r.metadata.get("cross_encoder_prob", r.score)
print(f" {i+1}. {r.path.split(chr(92))[-1]} (CE prob: {ce_score:.4f})")
# Compare rankings
standard_order = [r.path.split("\\")[-1] for r in standard_results[:5]]
reranked_order = [r.path.split("\\")[-1] for r in reranked_results[:5]]
if standard_order != reranked_order:
print(f" Ranking changed!")
print(f" Before: {standard_order}")
print(f" After: {reranked_order}")
else:
print(f" Ranking unchanged")
except Exception as e:
print(f" FTS + Rerank: ERROR - {e}")
print("\n" + "=" * 60)
print("CONCLUSIONS")
print("=" * 60)
print("""
1. Storage Architecture:
- semantic_chunks: Used by cascade-index (binary+dense vectors)
- chunks: Used by legacy SQLiteStore (currently empty in this index)
- files_fts_*: Used by FTS exact/fuzzy search
CONFLICT: binary_cascade_search reads from semantic_chunks,
but standard FTS reads from files table. These are SEPARATE paths.
2. Method Contributions:
- FTS: Fast but limited to keyword matching
- Vector: Semantic understanding but requires embeddings
3. FTS + Rerank Fusion:
- CrossEncoder reranking can improve precision
- Adds ~100-200ms latency per query
- Most effective when initial FTS recall is good
""")

View File

@@ -1,209 +0,0 @@
#!/usr/bin/env python
"""Micro-benchmark for BinaryANNIndex search performance.
Measures the actual speedup of vectorized Hamming distance computation.
"""
from __future__ import annotations
import gc
import statistics
import sys
import time
from pathlib import Path
# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
import numpy as np
def old_search_implementation(query_arr: np.ndarray, vectors: dict, id_list: list, top_k: int):
    """Original O(N) loop-based implementation for comparison.

    Args:
        query_arr: Packed binary query vector (uint8 array).
        vectors: Mapping of vector id -> packed vector bytes.
        id_list: Ids to score, in iteration order.
        top_k: Number of nearest ids to return.

    Returns:
        Tuple ``(ids, distances)`` for the ``top_k`` smallest Hamming
        distances, sorted ascending by distance.
    """
    # FIX: removed unused local `packed_dim` (was computed and never read).
    distances = []
    for vec_id in id_list:
        vec = vectors[vec_id]
        vec_arr = np.frombuffer(vec, dtype=np.uint8)
        # Hamming distance = popcount of the XOR of the packed vectors.
        xor = np.bitwise_xor(query_arr, vec_arr)
        dist = int(np.unpackbits(xor).sum())
        distances.append((vec_id, dist))
    distances.sort(key=lambda x: x[1])
    top_results = distances[:top_k]
    ids = [r[0] for r in top_results]
    dists = [r[1] for r in top_results]
    return ids, dists
def new_search_implementation(query_arr: np.ndarray, vectors_matrix: np.ndarray, ids_array: np.ndarray, top_k: int):
    """Optimized vectorized implementation.

    Computes Hamming distances for all rows at once (broadcast XOR plus a
    256-entry popcount lookup table), then selects the top-k via
    ``argpartition`` to avoid a full sort when k < N.
    """
    # XOR the query against every packed vector in one broadcast operation.
    xored = np.bitwise_xor(query_arr, vectors_matrix)
    # Popcount each byte through a lookup table, then sum bytes per row.
    lut = np.array([bin(v).count('1') for v in range(256)], dtype=np.uint8)
    distances = lut[xored].sum(axis=1)
    # Select the k nearest rows; only sort the candidate subset.
    total = len(distances)
    k = min(top_k, total)
    if k == total:
        order = np.argsort(distances)
    else:
        candidates = np.argpartition(distances, k)[:k]
        order = candidates[np.argsort(distances[candidates])]
    return ids_array[order].tolist(), distances[order].tolist()
def run_benchmark(n_vectors: int, dim: int = 256, top_k: int = 100, n_iterations: int = 50):
    """Run benchmark comparing old and new implementations.

    Generates random packed binary vectors, times both search variants,
    sanity-checks that their distance outputs agree, prints a comparison
    table and returns a summary dict.
    """
    packed_dim = dim // 8  # 32 bytes for 256-bit
    print(f"\n{'='*60}")
    print(f"Binary Search Micro-Benchmark")
    print(f"{'='*60}")
    print(f"Vectors: {n_vectors}")
    print(f"Dimension: {dim} bits ({packed_dim} bytes packed)")
    print(f"Top-K: {top_k}")
    print(f"Iterations: {n_iterations}")
    print(f"{'='*60}\n")
    # Generate random binary vectors (dict-of-bytes form for the old path).
    print("Generating test data...")
    id_list = list(range(n_vectors))
    vectors_dict = {
        i: np.random.randint(0, 256, size=packed_dim, dtype=np.uint8).tobytes()
        for i in id_list
    }
    # Build the dense matrix + id array consumed by the vectorized path.
    ids_array = np.array(id_list, dtype=np.int64)
    vectors_matrix = np.empty((n_vectors, packed_dim), dtype=np.uint8)
    for row, vec_id in enumerate(id_list):
        vectors_matrix[row] = np.frombuffer(vectors_dict[vec_id], dtype=np.uint8)
    # Generate a random query vector.
    query_arr = np.frombuffer(
        np.random.randint(0, 256, size=packed_dim, dtype=np.uint8).tobytes(),
        dtype=np.uint8,
    )
    # Warmup both implementations before timing.
    print("Running warmup...")
    for _ in range(3):
        old_search_implementation(query_arr, vectors_dict, id_list, top_k)
        new_search_implementation(query_arr, vectors_matrix, ids_array, top_k)
    # Time the loop-based implementation.
    print("Benchmarking old implementation...")
    old_times = []
    for _ in range(n_iterations):
        gc.collect()
        begin = time.perf_counter()
        old_ids, old_dists = old_search_implementation(query_arr, vectors_dict, id_list, top_k)
        old_times.append((time.perf_counter() - begin) * 1000)
    # Time the vectorized implementation.
    print("Benchmarking new implementation...")
    new_times = []
    for _ in range(n_iterations):
        gc.collect()
        begin = time.perf_counter()
        new_ids, new_dists = new_search_implementation(query_arr, vectors_matrix, ids_array, top_k)
        new_times.append((time.perf_counter() - begin) * 1000)
    # Verify correctness: distances must match (ids may differ on ties).
    print("\nVerifying correctness...")
    if old_dists == new_dists:
        print("Distances match! (IDs may differ for ties)")
    elif set(old_dists) == set(new_dists):
        print("Distances equivalent (tie-breaking differs, which is acceptable)")
    else:
        print("WARNING: Distance distributions differ!")
        print(f" Old dists (first 5): {old_dists[:5]}")
        print(f" New dists (first 5): {new_dists[:5]}")
    # Aggregate timing statistics.
    old_avg = statistics.mean(old_times)
    old_std = statistics.stdev(old_times) if len(old_times) > 1 else 0
    new_avg = statistics.mean(new_times)
    new_std = statistics.stdev(new_times) if len(new_times) > 1 else 0
    speedup = old_avg / new_avg if new_avg > 0 else 0
    print(f"\n{'='*60}")
    print("RESULTS")
    print(f"{'='*60}")
    print(f"{'Metric':<25} {'Old (loop)':>15} {'New (vectorized)':>18}")
    print(f"{'-'*25} {'-'*15} {'-'*18}")
    print(f"{'Avg Latency (ms)':<25} {old_avg:>15.3f} {new_avg:>18.3f}")
    print(f"{'Std Dev (ms)':<25} {old_std:>15.3f} {new_std:>18.3f}")
    print(f"{'Min Latency (ms)':<25} {min(old_times):>15.3f} {min(new_times):>18.3f}")
    print(f"{'Max Latency (ms)':<25} {max(old_times):>15.3f} {max(new_times):>18.3f}")
    print(f"{'P50 (ms)':<25} {sorted(old_times)[len(old_times)//2]:>15.3f} {sorted(new_times)[len(new_times)//2]:>18.3f}")
    print(f"\n{'Speedup:':<25} {speedup:>15.2f}x")
    print(f"{'='*60}\n")
    return {
        "n_vectors": n_vectors,
        "dim": dim,
        "top_k": top_k,
        "old_avg_ms": old_avg,
        "new_avg_ms": new_avg,
        "speedup": speedup,
    }
def main():
    """Run the micro-benchmark across several corpus sizes and summarize."""
    print("\n" + "="*70)
    print(" BINARY SEARCH OPTIMIZATION MICRO-BENCHMARK")
    print("="*70)
    # Exercise a range of corpus sizes to show how speedup scales.
    results = [
        run_benchmark(n_vectors=count, dim=256, top_k=100, n_iterations=20)
        for count in (1000, 5000, 10000, 50000)
    ]
    # Final comparison table across all corpus sizes.
    print("\n" + "="*70)
    print(" SUMMARY")
    print("="*70)
    print(f"{'N Vectors':<12} {'Old (ms)':<12} {'New (ms)':<12} {'Speedup':>10}")
    print("-"*50)
    for row in results:
        print(f"{row['n_vectors']:<12} {row['old_avg_ms']:<12.3f} {row['new_avg_ms']:<12.3f} {row['speedup']:>10.2f}x")
    print("="*70)


if __name__ == "__main__":
    main()

View File

@@ -1,402 +0,0 @@
#!/usr/bin/env python
"""Benchmark script for comparing cascade search strategies.
Compares:
- binary: 256-dim binary coarse ranking + 2048-dim dense fine ranking
- hybrid: FTS+Vector coarse ranking + CrossEncoder fine ranking
Usage:
python benchmarks/cascade_benchmark.py [--source PATH] [--queries N] [--warmup N]
"""
from __future__ import annotations
import argparse
import gc
import json
import os
import statistics
import sys
import time
import traceback
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import List, Optional, Dict, Any
# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from codexlens.search.chain_search import ChainSearchEngine, SearchOptions
from codexlens.config import Config
from codexlens.storage.registry import RegistryStore
from codexlens.storage.path_mapper import PathMapper
@dataclass
class BenchmarkResult:
    """Result from a single benchmark run."""
    # Strategy used for this run (e.g. "binary" or "hybrid").
    strategy: str
    # The query string that was searched.
    query: str
    # Wall-clock latency of the search call, in milliseconds.
    latency_ms: float
    # Number of results the search returned.
    num_results: int
    # "path:line" of the top-ranked result, or None when there were none.
    top_result: Optional[str]
    # Error message when the run failed; None on success.
    error: Optional[str] = None
@dataclass
class BenchmarkSummary:
    """Aggregated benchmark statistics."""
    # Strategy these statistics describe.
    strategy: str
    # Total queries attempted (including failures).
    total_queries: int
    # Queries that completed without error.
    successful_queries: int
    # Latency statistics over successful queries only (milliseconds).
    avg_latency_ms: float
    min_latency_ms: float
    max_latency_ms: float
    p50_latency_ms: float
    p95_latency_ms: float
    p99_latency_ms: float
    # Mean number of results per successful query.
    avg_results: float
    # Error messages collected from failed runs.
    errors: List[str]
# Default test queries covering different scenarios:
# literal code patterns, natural-language semantic queries, and
# domain-specific technical terms.
DEFAULT_QUERIES = [
    # Code patterns
    "def search",
    "class Engine",
    "import numpy",
    "async def",
    "raise ValueError",
    # Semantic queries
    "how to parse json",
    "database connection",
    "error handling",
    "authentication logic",
    "file read write",
    # Technical terms
    "embedding vector",
    "cosine similarity",
    "binary quantization",
    "hamming distance",
    "reranking",
]
def percentile(data: List[float], p: float) -> float:
    """Calculate percentile of sorted data.

    Uses linear interpolation between the two nearest ranks; an empty
    sample yields 0.0.
    """
    if not data:
        return 0.0
    ordered = sorted(data)
    rank = (len(ordered) - 1) * (p / 100)
    lower = int(rank)
    upper = min(lower + 1, len(ordered) - 1)
    fraction = rank - lower
    return ordered[lower] + fraction * (ordered[upper] - ordered[lower])
def run_single_benchmark(
    engine: ChainSearchEngine,
    query: str,
    source_path: Path,
    strategy: str,
    options: Optional[SearchOptions] = None,
) -> BenchmarkResult:
    """Run a single benchmark query.

    Times one ``cascade_search`` call; failures are captured into the
    result's ``error`` field rather than raised.
    """
    gc.collect()
    began = time.perf_counter()
    try:
        outcome = engine.cascade_search(
            query=query,
            source_path=source_path,
            k=10,
            coarse_k=100,
            options=options,
            strategy=strategy,
        )
    except Exception as exc:
        return BenchmarkResult(
            strategy=strategy,
            query=query,
            latency_ms=(time.perf_counter() - began) * 1000,
            num_results=0,
            top_result=None,
            error=str(exc),
        )
    elapsed_ms = (time.perf_counter() - began) * 1000
    best = None
    if outcome.results:
        first = outcome.results[0]
        # Format as "path:line"; a missing start_line is reported as 0.
        best = f"{first.path}:{first.start_line or 0}"
    return BenchmarkResult(
        strategy=strategy,
        query=query,
        latency_ms=elapsed_ms,
        num_results=len(outcome.results),
        top_result=best,
    )
def run_benchmarks(
    source_path: Path,
    queries: List[str],
    strategies: List[str],
    warmup_runs: int = 2,
    options: Optional[SearchOptions] = None,
) -> Dict[str, List[BenchmarkResult]]:
    """Run benchmarks for all queries and strategies.

    Builds one shared ChainSearchEngine, optionally warms it up, then
    times every (query, strategy) pair and groups results by strategy.
    """
    print(f"\n{'='*60}")
    print(f"Cascade Search Benchmark")
    print(f"{'='*60}")
    print(f"Source: {source_path}")
    print(f"Queries: {len(queries)}")
    print(f"Strategies: {strategies}")
    print(f"Warmup runs: {warmup_runs}")
    print(f"{'='*60}\n")
    # Build the shared engine (registry/mapper use their default paths).
    config = Config()
    registry = RegistryStore()  # Uses default path
    registry.initialize()
    mapper = PathMapper()  # Uses default path
    engine = ChainSearchEngine(registry=registry, mapper=mapper, config=config)
    collected: Dict[str, List[BenchmarkResult]] = {name: [] for name in strategies}
    # Warmup phase: prime caches so the timed runs are stable.
    if warmup_runs > 0:
        print(f"Running {warmup_runs} warmup queries...")
        warmup_query = queries[0] if queries else "test"
        for name in strategies:
            for _ in range(warmup_runs):
                try:
                    run_single_benchmark(engine, warmup_query, source_path, name, options)
                except Exception:
                    pass
        print("Warmup complete.\n")
    # Timed phase: queries outer, strategies inner (defines output order).
    total_runs = len(queries) * len(strategies)
    done = 0
    for query in queries:
        for name in strategies:
            done += 1
            print(f"[{done}/{total_runs}] {name}: '{query[:40]}...' ", end="", flush=True)
            outcome = run_single_benchmark(engine, query, source_path, name, options)
            collected[name].append(outcome)
            if outcome.error:
                print(f"ERROR: {outcome.error[:50]}")
            else:
                print(f"{outcome.latency_ms:.1f}ms, {outcome.num_results} results")
    return collected
def summarize_results(results: Dict[str, List[BenchmarkResult]]) -> Dict[str, BenchmarkSummary]:
    """Generate summary statistics for each strategy.

    Latency percentiles are computed over successful runs only; errors
    are collected verbatim. A strategy with zero successful runs gets an
    all-zero summary carrying just its errors.
    """
    summaries: Dict[str, BenchmarkSummary] = {}
    for strategy, runs in results.items():
        successful = [r for r in runs if r.error is None]
        latencies = [r.latency_ms for r in successful]
        counts = [r.num_results for r in successful]
        errors = [r.error for r in runs if r.error is not None]
        if latencies:
            summaries[strategy] = BenchmarkSummary(
                strategy=strategy,
                total_queries=len(runs),
                successful_queries=len(latencies),
                avg_latency_ms=statistics.mean(latencies),
                min_latency_ms=min(latencies),
                max_latency_ms=max(latencies),
                p50_latency_ms=percentile(latencies, 50),
                p95_latency_ms=percentile(latencies, 95),
                p99_latency_ms=percentile(latencies, 99),
                avg_results=statistics.mean(counts) if counts else 0,
                errors=errors,
            )
        else:
            summaries[strategy] = BenchmarkSummary(
                strategy=strategy,
                total_queries=len(runs),
                successful_queries=0,
                avg_latency_ms=0,
                min_latency_ms=0,
                max_latency_ms=0,
                p50_latency_ms=0,
                p95_latency_ms=0,
                p99_latency_ms=0,
                avg_results=0,
                errors=errors,
            )
    return summaries
def print_comparison_table(summaries: Dict[str, BenchmarkSummary]) -> None:
    """Print formatted comparison table.

    Expects summaries for the "binary" and "hybrid" strategies; prints a
    metric-by-metric diff, error counts, and declares a latency winner.
    """
    print(f"\n{'='*80}")
    print("BENCHMARK RESULTS COMPARISON")
    print(f"{'='*80}\n")
    # Table header.
    print(f"{'Metric':<25} {'Binary':>15} {'Hybrid':>15} {'Diff':>15}")
    print(f"{'-'*25} {'-'*15} {'-'*15} {'-'*15}")
    binary = summaries.get("binary")
    hybrid = summaries.get("hybrid")
    if binary is None or hybrid is None:
        print("Missing results for comparison")
        return
    rows = [
        ("Total Queries", binary.total_queries, hybrid.total_queries),
        ("Successful", binary.successful_queries, hybrid.successful_queries),
        ("Avg Latency (ms)", binary.avg_latency_ms, hybrid.avg_latency_ms),
        ("Min Latency (ms)", binary.min_latency_ms, hybrid.min_latency_ms),
        ("Max Latency (ms)", binary.max_latency_ms, hybrid.max_latency_ms),
        ("P50 Latency (ms)", binary.p50_latency_ms, hybrid.p50_latency_ms),
        ("P95 Latency (ms)", binary.p95_latency_ms, hybrid.p95_latency_ms),
        ("P99 Latency (ms)", binary.p99_latency_ms, hybrid.p99_latency_ms),
        ("Avg Results", binary.avg_results, hybrid.avg_results),
    ]
    for label, left, right in rows:
        delta = left - right
        if isinstance(left, float):
            delta_text = f"{delta:+.2f}" if delta != 0 else "0.00"
            ratio = right / left if left > 0 else 0
            if "Latency" in label and ratio > 1:
                delta_text += f" ({ratio:.1f}x faster)"
            print(f"{label:<25} {left:>15.2f} {right:>15.2f} {delta_text:>15}")
        else:
            print(f"{label:<25} {left:>15} {right:>15} {delta:>+15}")
    # Error section: show counts plus the first few messages.
    print(f"\n{'Errors:':<25}")
    print(f" Binary: {len(binary.errors)}")
    for err in binary.errors[:3]:
        print(f" - {err[:60]}...")
    print(f" Hybrid: {len(hybrid.errors)}")
    for err in hybrid.errors[:3]:
        print(f" - {err[:60]}...")
    # Declare a winner by average latency (only among successful runs).
    print(f"\n{'='*80}")
    if binary.avg_latency_ms < hybrid.avg_latency_ms and binary.successful_queries > 0:
        ratio = hybrid.avg_latency_ms / binary.avg_latency_ms
        print(f"[WINNER] Binary ({ratio:.2f}x faster average latency)")
    elif hybrid.avg_latency_ms < binary.avg_latency_ms and hybrid.successful_queries > 0:
        ratio = binary.avg_latency_ms / hybrid.avg_latency_ms
        print(f"[WINNER] Hybrid ({ratio:.2f}x faster average latency)")
    else:
        print("No clear winner (check errors)")
    print(f"{'='*80}\n")
def save_results(
    results: Dict[str, List[BenchmarkResult]],
    summaries: Dict[str, BenchmarkSummary],
    output_path: Path,
) -> None:
    """Save benchmark results to JSON file.

    Writes a timestamped payload with per-strategy summaries and the
    full per-run details; parent directories are created as needed.
    """
    payload = {
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        "summaries": {name: asdict(summary) for name, summary in summaries.items()},
        "details": {
            name: [asdict(run) for run in runs]
            for name, runs in results.items()
        },
    }
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as handle:
        json.dump(payload, handle, indent=2)
    print(f"Results saved to: {output_path}")
def main():
    """CLI entry point: parse arguments, run benchmarks, report, persist."""
    cli = argparse.ArgumentParser(description="Benchmark cascade search strategies")
    cli.add_argument(
        "--source", "-s",
        type=Path,
        default=Path(__file__).parent.parent / "src",
        help="Source directory to search (default: ./src)",
    )
    cli.add_argument(
        "--queries", "-q",
        type=int,
        default=len(DEFAULT_QUERIES),
        help=f"Number of queries to run (default: {len(DEFAULT_QUERIES)})",
    )
    cli.add_argument(
        "--warmup", "-w",
        type=int,
        default=2,
        help="Number of warmup runs (default: 2)",
    )
    cli.add_argument(
        "--output", "-o",
        type=Path,
        default=Path(__file__).parent / "results" / "cascade_benchmark.json",
        help="Output file for results (default: benchmarks/results/cascade_benchmark.json)",
    )
    cli.add_argument(
        "--strategies",
        nargs="+",
        default=["binary", "hybrid"],
        choices=["binary", "hybrid"],
        help="Strategies to benchmark (default: both)",
    )
    opts = cli.parse_args()
    # Fail fast on a nonexistent source directory.
    if not opts.source.exists():
        print(f"Error: Source path does not exist: {opts.source}")
        sys.exit(1)
    # Take the first N default queries.
    selected = DEFAULT_QUERIES[:opts.queries]
    try:
        collected = run_benchmarks(
            source_path=opts.source,
            queries=selected,
            strategies=opts.strategies,
            warmup_runs=opts.warmup,
        )
        summaries = summarize_results(collected)
        print_comparison_table(summaries)
        save_results(collected, summaries, opts.output)
    except KeyboardInterrupt:
        print("\nBenchmark interrupted.")
        sys.exit(1)
    except Exception as e:
        print(f"\nBenchmark failed: {e}")
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()

View File

@@ -1,365 +0,0 @@
#!/usr/bin/env python
"""Compare labeled accuracy: staged(realtime LSP graph) vs dense_rerank.
This script measures retrieval "accuracy" against a labeled query set.
Each query must provide a list of relevant file paths (relative to --source
or absolute). We report:
- Hit@K (any relevant file appears in top-K)
- MRR@K (reciprocal rank of first relevant file within top-K)
- Recall@K (fraction of relevant files present in top-K)
Example:
python benchmarks/compare_accuracy_labeled.py --source ./src
python benchmarks/compare_accuracy_labeled.py --queries-file benchmarks/accuracy_queries_codexlens.jsonl
"""
from __future__ import annotations
import argparse
import gc
import json
import os
import re
import statistics
import sys
import time
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
# Add src to path (match other benchmark scripts)
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from codexlens.config import Config
from codexlens.search.chain_search import ChainSearchEngine, SearchOptions
from codexlens.storage.path_mapper import PathMapper
from codexlens.storage.registry import RegistryStore
DEFAULT_QUERIES_FILE = Path(__file__).parent / "accuracy_queries_codexlens.jsonl"
def _now_ms() -> float:
return time.perf_counter() * 1000.0
def _normalize_path_key(path: str) -> str:
"""Normalize file paths for overlap/dedup metrics (Windows-safe)."""
try:
p = Path(path)
# Don't explode on non-files like "<memory>".
if str(p) and (p.is_absolute() or re.match(r"^[A-Za-z]:", str(p))):
norm = str(p.resolve())
else:
norm = str(p)
except Exception:
norm = path
norm = norm.replace("/", "\\")
if os.name == "nt":
norm = norm.lower()
return norm
def _load_labeled_queries(path: Path, limit: Optional[int]) -> List[Dict[str, Any]]:
if not path.is_file():
raise SystemExit(f"Queries file does not exist: {path}")
out: List[Dict[str, Any]] = []
for raw_line in path.read_text(encoding="utf-8", errors="ignore").splitlines():
line = raw_line.strip()
if not line or line.startswith("#"):
continue
try:
item = json.loads(line)
except Exception as exc:
raise SystemExit(f"Invalid JSONL line in {path}: {raw_line!r} ({exc})") from exc
if not isinstance(item, dict) or "query" not in item:
raise SystemExit(f"Invalid query item (expected object with 'query'): {item!r}")
out.append(item)
if limit is not None and len(out) >= limit:
break
return out
def _dedup_topk(paths: Iterable[str], k: int) -> List[str]:
out: List[str] = []
seen: set[str] = set()
for p in paths:
if p in seen:
continue
seen.add(p)
out.append(p)
if len(out) >= k:
break
return out
def _first_hit_rank(topk_paths: Sequence[str], relevant: set[str]) -> Optional[int]:
for i, p in enumerate(topk_paths, start=1):
if p in relevant:
return i
return None
@dataclass
class StrategyRun:
    """Outcome of one strategy run for a single labeled query."""
    # Strategy identifier (e.g. "staged" or "dense_rerank").
    strategy: str
    # Wall-clock latency of the cascade_search call, in milliseconds.
    latency_ms: float
    # Deduplicated, normalized top-K result paths.
    topk_paths: List[str]
    # 1-based rank of the first relevant path in top-K; None if no hit.
    first_hit_rank: Optional[int]
    # True when any relevant path appears within the top-K.
    hit_at_k: bool
    # Fraction of the labeled relevant paths present in the top-K.
    recall_at_k: float
    # repr() of the raised exception when the run failed; None on success.
    error: Optional[str] = None
@dataclass
class QueryEval:
    """Paired evaluation of both strategies on one labeled query."""
    # The query text.
    query: str
    # Normalized ground-truth relevant file paths.
    relevant_paths: List[str]
    # Run outcome for the "staged" strategy.
    staged: StrategyRun
    # Run outcome for the "dense_rerank" strategy.
    dense_rerank: StrategyRun
def _run_strategy(
    engine: ChainSearchEngine,
    *,
    strategy: str,
    query: str,
    source_path: Path,
    k: int,
    coarse_k: int,
    relevant: set[str],
    options: Optional[SearchOptions] = None,
) -> StrategyRun:
    """Execute one cascade_search run and score it against labeled paths.

    Failures are captured into the result's ``error`` field (as repr)
    rather than raised.
    """
    gc.collect()
    began = _now_ms()
    try:
        outcome = engine.cascade_search(
            query=query,
            source_path=source_path,
            k=k,
            coarse_k=coarse_k,
            options=options,
            strategy=strategy,
        )
    except Exception as exc:
        return StrategyRun(
            strategy=strategy,
            latency_ms=_now_ms() - began,
            topk_paths=[],
            first_hit_rank=None,
            hit_at_k=False,
            recall_at_k=0.0,
            error=repr(exc),
        )
    elapsed = _now_ms() - began
    # Normalize and dedup the returned paths before scoring.
    raw_paths = [r.path for r in (outcome.results or []) if getattr(r, "path", None)]
    topk = _dedup_topk([_normalize_path_key(p) for p in raw_paths], k=k)
    rank = _first_hit_rank(topk, relevant)
    recall = len(set(topk) & relevant) / float(len(relevant)) if relevant else 0.0
    return StrategyRun(
        strategy=strategy,
        latency_ms=elapsed,
        topk_paths=topk,
        first_hit_rank=rank,
        hit_at_k=rank is not None,
        recall_at_k=recall,
        error=None,
    )
def _mrr(ranks: Sequence[Optional[int]]) -> float:
vals = []
for r in ranks:
if r is None or r <= 0:
vals.append(0.0)
else:
vals.append(1.0 / float(r))
return statistics.mean(vals) if vals else 0.0
def main() -> None:
parser = argparse.ArgumentParser(
description="Compare labeled retrieval accuracy: staged(realtime) vs dense_rerank"
)
parser.add_argument(
"--source",
type=Path,
default=Path(__file__).parent.parent / "src",
help="Source directory to search (default: ./src)",
)
parser.add_argument(
"--queries-file",
type=Path,
default=DEFAULT_QUERIES_FILE,
help="JSONL file with {query, relevant_paths[]} per line",
)
parser.add_argument("--queries", type=int, default=None, help="Limit number of queries")
parser.add_argument("--k", type=int, default=10, help="Top-K for evaluation (default 10)")
parser.add_argument("--coarse-k", type=int, default=100, help="Coarse candidates (default 100)")
parser.add_argument(
"--staged-cluster-strategy",
type=str,
default="path",
help="Config.staged_clustering_strategy override for staged (default: path)",
)
parser.add_argument(
"--stage2-mode",
type=str,
default="realtime",
help="Config.staged_stage2_mode override for staged (default: realtime)",
)
parser.add_argument(
"--output",
type=Path,
default=Path(__file__).parent / "results" / "accuracy_labeled.json",
help="Output JSON path",
)
args = parser.parse_args()
if not args.source.exists():
raise SystemExit(f"Source path does not exist: {args.source}")
labeled = _load_labeled_queries(args.queries_file, args.queries)
if not labeled:
raise SystemExit("No queries to run")
source_root = args.source.expanduser().resolve()
# Match CLI behavior: load settings + apply global/workspace .env overrides.
config = Config.load()
config.cascade_strategy = "staged"
config.staged_stage2_mode = str(args.stage2_mode or "realtime").strip().lower()
config.enable_staged_rerank = True
config.staged_clustering_strategy = str(args.staged_cluster_strategy or "path").strip().lower()
# Stability: on some Windows setups, DirectML/ONNX can crash under load.
config.embedding_use_gpu = False
config.reranker_use_gpu = False
registry = RegistryStore()
registry.initialize()
mapper = PathMapper()
engine = ChainSearchEngine(registry=registry, mapper=mapper, config=config)
    def resolve_expected(paths: Sequence[str]) -> set[str]:
        """Map labeled relevant paths to normalized path keys.

        Relative entries are resolved against the enclosing ``source_root``;
        entries that Path() cannot handle fall back to normalizing the raw
        string so the query can still be scored.
        """
        out: set[str] = set()
        for p in paths:
            try:
                cand = Path(p)
                if not cand.is_absolute():
                    cand = (source_root / cand).resolve()
                out.add(_normalize_path_key(str(cand)))
            except Exception:
                # Best effort: normalize the raw string when Path() fails.
                out.add(_normalize_path_key(p))
        return out
evaluations: List[QueryEval] = []
try:
for i, item in enumerate(labeled, start=1):
query = str(item.get("query", "")).strip()
relevant_raw = item.get("relevant_paths") or []
if not query:
continue
if not isinstance(relevant_raw, list) or not relevant_raw:
raise SystemExit(f"Query item missing relevant_paths[]: {item!r}")
relevant = resolve_expected([str(p) for p in relevant_raw])
print(f"[{i}/{len(labeled)}] {query}")
staged = _run_strategy(
engine,
strategy="staged",
query=query,
source_path=source_root,
k=int(args.k),
coarse_k=int(args.coarse_k),
relevant=relevant,
options=None,
)
dense = _run_strategy(
engine,
strategy="dense_rerank",
query=query,
source_path=source_root,
k=int(args.k),
coarse_k=int(args.coarse_k),
relevant=relevant,
options=None,
)
evaluations.append(
QueryEval(
query=query,
relevant_paths=[_normalize_path_key(str((source_root / p).resolve())) if not Path(p).is_absolute() else _normalize_path_key(p) for p in relevant_raw],
staged=staged,
dense_rerank=dense,
)
)
finally:
try:
engine.close()
except Exception:
pass
try:
registry.close()
except Exception:
pass
staged_runs = [e.staged for e in evaluations]
dense_runs = [e.dense_rerank for e in evaluations]
    def mean(xs: Sequence[float]) -> float:
        # Average that tolerates empty input (returns 0.0 instead of raising).
        return statistics.mean(xs) if xs else 0.0
staged_ranks = [r.first_hit_rank for r in staged_runs]
dense_ranks = [r.first_hit_rank for r in dense_runs]
summary = {
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
"source": str(source_root),
"queries_file": str(args.queries_file),
"query_count": len(evaluations),
"k": int(args.k),
"coarse_k": int(args.coarse_k),
"staged": {
"hit_at_k": mean([1.0 if r.hit_at_k else 0.0 for r in staged_runs]),
"mrr_at_k": _mrr(staged_ranks),
"avg_recall_at_k": mean([r.recall_at_k for r in staged_runs]),
"avg_latency_ms": mean([r.latency_ms for r in staged_runs if not r.error]),
"errors": sum(1 for r in staged_runs if r.error),
},
"dense_rerank": {
"hit_at_k": mean([1.0 if r.hit_at_k else 0.0 for r in dense_runs]),
"mrr_at_k": _mrr(dense_ranks),
"avg_recall_at_k": mean([r.recall_at_k for r in dense_runs]),
"avg_latency_ms": mean([r.latency_ms for r in dense_runs if not r.error]),
"errors": sum(1 for r in dense_runs if r.error),
},
"config": {
"staged_stage2_mode": config.staged_stage2_mode,
"staged_clustering_strategy": config.staged_clustering_strategy,
"enable_staged_rerank": bool(config.enable_staged_rerank),
"reranker_backend": config.reranker_backend,
"reranker_model": config.reranker_model,
"embedding_backend": config.embedding_backend,
"embedding_model": config.embedding_model,
},
}
payload = {"summary": summary, "evaluations": [asdict(e) for e in evaluations]}
args.output.parent.mkdir(parents=True, exist_ok=True)
args.output.write_text(json.dumps(payload, indent=2), encoding="utf-8")
print("\n=== SUMMARY ===")
print(json.dumps(summary, indent=2))
print(f"\nSaved: {args.output}")
# Allow running this benchmark directly as a script.
if __name__ == "__main__":
    main()

View File

@@ -1,980 +0,0 @@
#!/usr/bin/env python
"""Benchmark local-only staged stage2 modes for CCW smart_search queries.
This benchmark reuses the existing CodexLens benchmark style, but focuses on
the real search intents that drive CCW `smart_search`. It evaluates:
1. `dense_rerank` baseline
2. `staged` + `precomputed`
3. `staged` + `realtime`
4. `staged` + `static_global_graph`
Metrics:
- Hit@K
- MRR@K
- Recall@K
- latency (avg/p50/p95)
The runner is intentionally local-only. By default it uses:
- embedding backend: `fastembed`
- reranker backend: `onnx`
Examples:
python benchmarks/compare_ccw_smart_search_stage2.py --dry-run
python benchmarks/compare_ccw_smart_search_stage2.py --self-check
python benchmarks/compare_ccw_smart_search_stage2.py --source .. --k 10
python benchmarks/compare_ccw_smart_search_stage2.py --embedding-model code --reranker-model cross-encoder/ms-marco-MiniLM-L-6-v2
"""
from __future__ import annotations
import argparse
from copy import deepcopy
import gc
import json
import os
import re
import statistics
import sys
import time
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from codexlens.config import Config
from codexlens.search.chain_search import ChainSearchEngine, SearchOptions
from codexlens.search.ranking import (
QueryIntent,
detect_query_intent,
is_generated_artifact_path,
is_test_file,
query_prefers_lexical_search,
query_targets_generated_files,
)
from codexlens.storage.path_mapper import PathMapper
from codexlens.storage.registry import RegistryStore
# Default benchmark inputs/outputs, resolved relative to this script file.
DEFAULT_SOURCE = Path(__file__).resolve().parents[2]
DEFAULT_QUERIES_FILE = Path(__file__).parent / "accuracy_queries_ccw_smart_search.jsonl"
DEFAULT_OUTPUT = Path(__file__).parent / "results" / "ccw_smart_search_stage2.json"
# Allowlists used by the CLI validators below (this runner is local-only).
VALID_STAGE2_MODES = ("precomputed", "realtime", "static_global_graph")
VALID_LOCAL_EMBEDDING_BACKENDS = ("fastembed",)
VALID_LOCAL_RERANKER_BACKENDS = ("onnx", "fastembed", "legacy")
VALID_BASELINE_METHODS = ("auto", "fts", "hybrid")
DEFAULT_LOCAL_ONNX_RERANKER_MODEL = "Xenova/ms-marco-MiniLM-L-6-v2"
def _now_ms() -> float:
return time.perf_counter() * 1000.0
def _normalize_path_key(path: str) -> str:
try:
candidate = Path(path)
if str(candidate) and (candidate.is_absolute() or re.match(r"^[A-Za-z]:", str(candidate))):
normalized = str(candidate.resolve())
else:
normalized = str(candidate)
except Exception:
normalized = path
normalized = normalized.replace("/", "\\")
if os.name == "nt":
normalized = normalized.lower()
return normalized
def _dedup_topk(paths: Iterable[str], k: int) -> List[str]:
output: List[str] = []
seen: set[str] = set()
for path in paths:
if path in seen:
continue
seen.add(path)
output.append(path)
if len(output) >= k:
break
return output
def _first_hit_rank(topk_paths: Sequence[str], relevant: set[str]) -> Optional[int]:
for index, path in enumerate(topk_paths, start=1):
if path in relevant:
return index
return None
def _mrr(ranks: Sequence[Optional[int]]) -> float:
values = [1.0 / rank for rank in ranks if rank and rank > 0]
return statistics.mean(values) if values else 0.0
def _mean(values: Sequence[float]) -> float:
return statistics.mean(values) if values else 0.0
def _percentile(values: Sequence[float], percentile: float) -> float:
if not values:
return 0.0
ordered = sorted(values)
if len(ordered) == 1:
return ordered[0]
index = (len(ordered) - 1) * percentile
lower = int(index)
upper = min(lower + 1, len(ordered) - 1)
if lower == upper:
return ordered[lower]
fraction = index - lower
return ordered[lower] + (ordered[upper] - ordered[lower]) * fraction
def _load_labeled_queries(path: Path, limit: Optional[int]) -> List[Dict[str, Any]]:
if not path.is_file():
raise SystemExit(f"Queries file does not exist: {path}")
output: List[Dict[str, Any]] = []
for raw_line in path.read_text(encoding="utf-8", errors="ignore").splitlines():
line = raw_line.strip()
if not line or line.startswith("#"):
continue
try:
item = json.loads(line)
except Exception as exc:
raise SystemExit(f"Invalid JSONL line in {path}: {raw_line!r} ({exc})") from exc
if not isinstance(item, dict) or "query" not in item or "relevant_paths" not in item:
raise SystemExit(f"Invalid query item (expected object with query/relevant_paths): {item!r}")
relevant_paths = item.get("relevant_paths")
if not isinstance(relevant_paths, list) or not relevant_paths:
raise SystemExit(f"Query item must include non-empty relevant_paths[]: {item!r}")
output.append(item)
if limit is not None and len(output) >= limit:
break
return output
def _resolve_expected_paths(source_root: Path, paths: Sequence[str]) -> Tuple[List[str], set[str], List[str]]:
    """Resolve labeled paths against source_root.

    Returns (display paths, normalized-key set, paths that do not exist on
    disk). Missing entries are still included in the first two outputs.
    """
    display: List[str] = []
    keys: set[str] = set()
    absent: List[str] = []
    for entry in paths:
        resolved = Path(entry)
        if not resolved.is_absolute():
            resolved = (source_root / resolved).resolve()
        if not resolved.exists():
            absent.append(str(resolved))
        display.append(str(resolved))
        keys.add(_normalize_path_key(str(resolved)))
    return display, keys, absent
def _validate_local_only_backends(embedding_backend: str, reranker_backend: str) -> None:
    """Abort via SystemExit unless both backends are local-only choices."""
    checks = (
        ("--embedding-backend", embedding_backend, VALID_LOCAL_EMBEDDING_BACKENDS),
        ("--reranker-backend", reranker_backend, VALID_LOCAL_RERANKER_BACKENDS),
    )
    for flag, value, allowed in checks:
        if value not in allowed:
            raise SystemExit(
                "This runner is local-only. "
                f"{flag} must be one of {', '.join(allowed)}; got {value!r}"
            )
def _validate_stage2_modes(stage2_modes: Sequence[str]) -> List[str]:
    """Normalize, validate, and order-preservingly dedupe stage-2 modes."""
    normalized = [str(mode).strip().lower() for mode in stage2_modes if str(mode).strip()]
    if not normalized:
        raise SystemExit("At least one --stage2-modes entry is required")
    for mode in normalized:
        if mode not in VALID_STAGE2_MODES:
            # Report the first invalid entry, matching CLI expectations.
            raise SystemExit(
                f"Invalid --stage2-modes entry: {mode} "
                f"(valid: {', '.join(VALID_STAGE2_MODES)})"
            )
    # dict.fromkeys preserves first-seen order while removing duplicates.
    return list(dict.fromkeys(normalized))
def _validate_baseline_methods(methods: Sequence[str]) -> List[str]:
    """Normalize and dedupe requested baselines; abort on unknown entries.

    Unlike stage-2 validation, an empty result is allowed here.
    """
    normalized = [str(method).strip().lower() for method in methods if str(method).strip()]
    for method in normalized:
        if method not in VALID_BASELINE_METHODS:
            raise SystemExit(
                f"Invalid --baseline-methods entry: {method} "
                f"(valid: {', '.join(VALID_BASELINE_METHODS)})"
            )
    return list(dict.fromkeys(normalized))
@dataclass
class StrategyRun:
    """Outcome of executing one (query, strategy) combination."""

    strategy_key: str  # unique key, e.g. "fts" or "staged:realtime"
    strategy: str  # requested strategy name
    stage2_mode: Optional[str]  # staged stage-2 mode, None for baselines
    effective_method: str  # concrete method after 'auto' resolution
    execution_method: str  # "cascade" for dense/staged, else the method itself
    latency_ms: float
    topk_paths: List[str]  # normalized top-k result path keys
    first_hit_rank: Optional[int]  # 1-based rank of first relevant hit
    hit_at_k: bool
    recall_at_k: float
    generated_artifact_count: int  # generated-file pollution in top-k
    test_file_count: int  # test-file pollution in top-k
    error: Optional[str] = None  # "ExcType: message" when the run failed
@dataclass
class QueryEvaluation:
    """All strategy runs recorded for a single labeled query."""

    query: str
    intent: Optional[str]  # optional intent label from the dataset
    notes: Optional[str]
    relevant_paths: List[str]  # resolved ground-truth paths
    runs: Dict[str, StrategyRun]  # keyed by StrategySpec.strategy_key
@dataclass
class PairwiseDelta:
    """Metric differences (mode_a minus mode_b) between two stage-2 modes."""

    mode_a: str
    mode_b: str
    hit_at_k_delta: float
    mrr_at_k_delta: float
    avg_recall_at_k_delta: float
    avg_latency_ms_delta: float
@dataclass
class StrategySpec:
    """Declarative description of one strategy to benchmark."""

    strategy_key: str  # aggregation/display key
    strategy: str
    stage2_mode: Optional[str]  # only set for "staged" strategies
@dataclass
class StrategyRuntime:
    """Per-strategy engine bundle so config mutations stay isolated."""

    strategy_spec: StrategySpec
    config: Config  # deep copy owned by this strategy alone
    registry: RegistryStore
    engine: ChainSearchEngine
def _strategy_specs(
    stage2_modes: Sequence[str],
    include_dense_baseline: bool,
    *,
    baseline_methods: Sequence[str],
) -> List[StrategySpec]:
    """Build the ordered strategy matrix: baselines, optional dense, staged modes."""
    specs = [
        StrategySpec(strategy_key=method, strategy=method, stage2_mode=None)
        for method in baseline_methods
    ]
    if include_dense_baseline:
        specs.append(StrategySpec(strategy_key="dense_rerank", strategy="dense_rerank", stage2_mode=None))
    specs.extend(
        StrategySpec(strategy_key=f"staged:{mode}", strategy="staged", stage2_mode=mode)
        for mode in stage2_modes
    )
    return specs
def _build_strategy_runtime(base_config: Config, strategy_spec: StrategySpec) -> StrategyRuntime:
    """Create an isolated config/registry/engine bundle for one strategy.

    The config is deep-copied so per-run mutations never leak across
    strategies.
    """
    cfg = deepcopy(base_config)
    store = RegistryStore()
    store.initialize()
    search_engine = ChainSearchEngine(registry=store, mapper=PathMapper(), config=cfg)
    return StrategyRuntime(
        strategy_spec=strategy_spec,
        config=cfg,
        registry=store,
        engine=search_engine,
    )
def _select_effective_method(query: str, requested_method: str) -> str:
    """Resolve 'auto' to a concrete method using query-intent heuristics.

    Non-'auto' requests pass through unchanged.
    """
    method = str(requested_method).strip().lower()
    if method != "auto":
        return method
    # Lexical-looking queries (generated files, literal tokens) go to FTS.
    if query_targets_generated_files(query) or query_prefers_lexical_search(query):
        return "fts"
    intent = detect_query_intent(query)
    if intent == QueryIntent.KEYWORD:
        return "fts"
    return "dense_rerank" if intent == QueryIntent.SEMANTIC else "hybrid"
def _filter_dataset_by_query_match(
dataset: Sequence[Dict[str, Any]],
query_match: Optional[str],
) -> List[Dict[str, Any]]:
"""Filter labeled queries by case-insensitive substring match."""
needle = str(query_match or "").strip().casefold()
if not needle:
return list(dataset)
return [
dict(item)
for item in dataset
if needle in str(item.get("query", "")).casefold()
]
def _apply_query_limit(
dataset: Sequence[Dict[str, Any]],
query_limit: Optional[int],
) -> List[Dict[str, Any]]:
"""Apply the optional query limit after any dataset-level filtering."""
if query_limit is None:
return list(dataset)
return [dict(item) for item in list(dataset)[: max(0, int(query_limit))]]
def _write_json_payload(path: Path, payload: Dict[str, Any]) -> None:
"""Persist a benchmark payload as UTF-8 JSON."""
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
def _write_final_outputs(
    *,
    output_path: Path,
    progress_output: Optional[Path],
    payload: Dict[str, Any],
) -> None:
    """Write the completed payload to the result path and, if configured,
    mirror it to the progress path so watchers see the final state."""
    targets = [output_path]
    if progress_output is not None:
        targets.append(progress_output)
    for target in targets:
        _write_json_payload(target, payload)
def _make_progress_payload(
    *,
    args: argparse.Namespace,
    source_root: Path,
    strategy_specs: Sequence[StrategySpec],
    evaluations: Sequence[QueryEvaluation],
    query_index: int,
    total_queries: int,
    run_index: int,
    total_runs: int,
    current_query: str,
    current_strategy_key: str,
) -> Dict[str, Any]:
    """Create a partial progress snapshot for long benchmark runs.

    The snapshot mirrors the final payload shape (status/metadata plus
    serialized evaluations) and adds a "progress" section identifying the
    query/strategy currently executing.
    """
    return {
        "status": "running",
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        "source": str(source_root),
        "queries_file": str(args.queries_file),
        # Evaluations completed so far vs the planned total.
        "query_count": len(evaluations),
        "planned_query_count": total_queries,
        "k": int(args.k),
        "coarse_k": int(args.coarse_k),
        "strategy_keys": [spec.strategy_key for spec in strategy_specs],
        "progress": {
            "completed_queries": query_index,
            "total_queries": total_queries,
            "completed_runs": run_index,
            "total_runs": total_runs,
            "current_query": current_query,
            "current_strategy_key": current_strategy_key,
        },
        # Serialize dataclass runs into plain dicts for JSON output.
        "evaluations": [
            {
                "query": evaluation.query,
                "intent": evaluation.intent,
                "notes": evaluation.notes,
                "relevant_paths": evaluation.relevant_paths,
                "runs": {key: asdict(run) for key, run in evaluation.runs.items()},
            }
            for evaluation in evaluations
        ],
    }
def _make_search_options(method: str, *, k: int) -> SearchOptions:
    """Translate a benchmark method name into engine SearchOptions.

    fts: lexical only; hybrid: lexical + vector; dense_rerank/staged:
    cascade-enabled hybrid. Unknown methods raise ValueError.
    """
    # (hybrid_mode, enable_vector, enable_cascade) per method; fuzzy and
    # pure-vector stay disabled for every benchmark configuration.
    flag_table = {
        "fts": (False, False, False),
        "hybrid": (True, True, False),
        "dense_rerank": (True, True, True),
        "staged": (True, True, True),
    }
    key = str(method).strip().lower()
    if key not in flag_table:
        raise ValueError(f"Unsupported benchmark method: {method}")
    hybrid, vector, cascade = flag_table[key]
    return SearchOptions(
        total_limit=k,
        hybrid_mode=hybrid,
        enable_fuzzy=False,
        enable_vector=vector,
        pure_vector=False,
        enable_cascade=cascade,
    )
def _run_strategy(
    engine: ChainSearchEngine,
    config: Config,
    *,
    strategy_spec: StrategySpec,
    query: str,
    source_path: Path,
    k: int,
    coarse_k: int,
    relevant: set[str],
) -> StrategyRun:
    """Execute one strategy for one query and score its top-k results.

    Temporarily mutates ``config.cascade_strategy`` / ``staged_stage2_mode``
    for the run and restores the prior values in the ``finally`` block.
    Exceptions are captured into StrategyRun.error instead of propagating.
    """
    gc.collect()
    effective_method = _select_effective_method(query, strategy_spec.strategy)
    execution_method = "cascade" if effective_method in {"dense_rerank", "staged"} else effective_method
    # Snapshot config fields so the finally block can restore them.
    previous_cascade_strategy = getattr(config, "cascade_strategy", None)
    previous_stage2_mode = getattr(config, "staged_stage2_mode", None)
    start_ms = _now_ms()
    try:
        options = _make_search_options(
            "staged" if strategy_spec.strategy == "staged" else effective_method,
            k=k,
        )
        if strategy_spec.strategy == "staged":
            config.cascade_strategy = "staged"
            if strategy_spec.stage2_mode:
                config.staged_stage2_mode = strategy_spec.stage2_mode
            result = engine.cascade_search(
                query=query,
                source_path=source_path,
                k=k,
                coarse_k=coarse_k,
                options=options,
                strategy="staged",
            )
        elif effective_method == "dense_rerank":
            config.cascade_strategy = "dense_rerank"
            result = engine.cascade_search(
                query=query,
                source_path=source_path,
                k=k,
                coarse_k=coarse_k,
                options=options,
                strategy="dense_rerank",
            )
        else:
            # fts / hybrid baselines run through the plain search path.
            result = engine.search(
                query=query,
                source_path=source_path,
                options=options,
            )
        latency_ms = _now_ms() - start_ms
        # Normalize, dedupe, and truncate result paths before scoring.
        paths_raw = [item.path for item in (result.results or []) if getattr(item, "path", None)]
        topk = _dedup_topk((_normalize_path_key(path) for path in paths_raw), k=k)
        rank = _first_hit_rank(topk, relevant)
        recall = 0.0
        if relevant:
            recall = len(set(topk) & relevant) / float(len(relevant))
        return StrategyRun(
            strategy_key=strategy_spec.strategy_key,
            strategy=strategy_spec.strategy,
            stage2_mode=strategy_spec.stage2_mode,
            effective_method=effective_method,
            execution_method=execution_method,
            latency_ms=latency_ms,
            topk_paths=topk,
            first_hit_rank=rank,
            hit_at_k=rank is not None,
            recall_at_k=recall,
            generated_artifact_count=sum(1 for path in topk if is_generated_artifact_path(path)),
            test_file_count=sum(1 for path in topk if is_test_file(path)),
            error=None,
        )
    except Exception as exc:
        # Record the failure but keep the benchmark running.
        latency_ms = _now_ms() - start_ms
        return StrategyRun(
            strategy_key=strategy_spec.strategy_key,
            strategy=strategy_spec.strategy,
            stage2_mode=strategy_spec.stage2_mode,
            effective_method=effective_method,
            execution_method=execution_method,
            latency_ms=latency_ms,
            topk_paths=[],
            first_hit_rank=None,
            hit_at_k=False,
            recall_at_k=0.0,
            generated_artifact_count=0,
            test_file_count=0,
            error=f"{type(exc).__name__}: {exc}",
        )
    finally:
        # Always restore the shared config, even after an error.
        config.cascade_strategy = previous_cascade_strategy
        config.staged_stage2_mode = previous_stage2_mode
def _summarize_runs(runs: Sequence[StrategyRun]) -> Dict[str, Any]:
    """Aggregate per-query runs into accuracy, latency, and pollution metrics.

    Latency statistics only include runs that completed without error.
    """
    ok_latencies = [run.latency_ms for run in runs if not run.error]
    method_tally: Dict[str, int] = {}
    for run in runs:
        method_tally[run.effective_method] = method_tally.get(run.effective_method, 0) + 1
    return {
        "query_count": len(runs),
        "hit_at_k": _mean([1.0 if run.hit_at_k else 0.0 for run in runs]),
        "mrr_at_k": _mrr([run.first_hit_rank for run in runs]),
        "avg_recall_at_k": _mean([run.recall_at_k for run in runs]),
        "avg_latency_ms": _mean(ok_latencies),
        "p50_latency_ms": _percentile(ok_latencies, 0.50),
        "p95_latency_ms": _percentile(ok_latencies, 0.95),
        "avg_generated_artifact_count": _mean([float(run.generated_artifact_count) for run in runs]),
        "avg_test_file_count": _mean([float(run.test_file_count) for run in runs]),
        "runs_with_generated_artifacts": sum(1 for run in runs if run.generated_artifact_count > 0),
        "runs_with_test_files": sum(1 for run in runs if run.test_file_count > 0),
        "effective_methods": method_tally,
        "errors": sum(1 for run in runs if run.error),
    }
def _build_pairwise_deltas(stage2_summaries: Dict[str, Dict[str, Any]]) -> List[PairwiseDelta]:
    """Compute metric deltas for every unordered pair of stage-2 modes.

    Pairs follow the summary dict's insertion order; each delta is
    (earlier mode minus later mode).
    """
    modes = list(stage2_summaries)
    deltas: List[PairwiseDelta] = []
    for position, mode_a in enumerate(modes):
        for mode_b in modes[position + 1:]:
            summary_a = stage2_summaries[mode_a]
            summary_b = stage2_summaries[mode_b]
            deltas.append(
                PairwiseDelta(
                    mode_a=mode_a,
                    mode_b=mode_b,
                    hit_at_k_delta=summary_a["hit_at_k"] - summary_b["hit_at_k"],
                    mrr_at_k_delta=summary_a["mrr_at_k"] - summary_b["mrr_at_k"],
                    avg_recall_at_k_delta=summary_a["avg_recall_at_k"] - summary_b["avg_recall_at_k"],
                    avg_latency_ms_delta=summary_a["avg_latency_ms"] - summary_b["avg_latency_ms"],
                )
            )
    return deltas
def _make_plan_payload(
    *,
    args: argparse.Namespace,
    source_root: Path,
    dataset: Sequence[Dict[str, Any]],
    baseline_methods: Sequence[str],
    stage2_modes: Sequence[str],
    strategy_specs: Sequence[StrategySpec],
) -> Dict[str, Any]:
    """Build the JSON payload printed by --dry-run / --self-check.

    Describes the planned benchmark (strategies, backends, outputs) and a
    small dataset preview without executing any retrieval.
    """
    return {
        "mode": "dry-run" if args.dry_run else "self-check",
        "local_only": True,
        "source": str(source_root),
        "queries_file": str(args.queries_file),
        "query_count": len(dataset),
        "query_match": args.query_match,
        "k": int(args.k),
        "coarse_k": int(args.coarse_k),
        "baseline_methods": list(baseline_methods),
        "stage2_modes": list(stage2_modes),
        "strategy_keys": [spec.strategy_key for spec in strategy_specs],
        "local_backends": {
            "embedding_backend": args.embedding_backend,
            "embedding_model": args.embedding_model,
            "reranker_backend": args.reranker_backend,
            "reranker_model": args.reranker_model,
            "embedding_use_gpu": bool(args.embedding_use_gpu),
            "reranker_use_gpu": bool(args.reranker_use_gpu),
        },
        "output": str(args.output),
        "progress_output": str(args.progress_output) if args.progress_output else None,
        # First few queries so the operator can sanity-check the dataset.
        "dataset_preview": [
            {
                "query": item.get("query"),
                "intent": item.get("intent"),
                "relevant_paths": item.get("relevant_paths"),
            }
            for item in list(dataset)[: min(3, len(dataset))]
        ],
    }
def build_parser() -> argparse.ArgumentParser:
    """Construct the CLI parser for the stage-2 comparison benchmark.

    The module docstring doubles as the parser description.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--source",
        type=Path,
        default=DEFAULT_SOURCE,
        help="Source root to benchmark. Defaults to the repository root so CCW and CodexLens paths resolve together.",
    )
    parser.add_argument(
        "--queries-file",
        type=Path,
        default=DEFAULT_QUERIES_FILE,
        help="Labeled JSONL dataset of CCW smart_search queries",
    )
    parser.add_argument("--query-limit", type=int, default=None, help="Optional query limit")
    parser.add_argument(
        "--query-match",
        type=str,
        default=None,
        help="Optional case-insensitive substring filter for selecting specific benchmark queries.",
    )
    parser.add_argument("--k", type=int, default=10, help="Top-k to evaluate")
    parser.add_argument("--coarse-k", type=int, default=100, help="Stage-1 coarse_k")
    parser.add_argument(
        "--baseline-methods",
        nargs="*",
        default=list(VALID_BASELINE_METHODS),
        help="Requested smart_search baselines to compare before staged modes (valid: auto, fts, hybrid).",
    )
    parser.add_argument(
        "--stage2-modes",
        nargs="*",
        default=list(VALID_STAGE2_MODES),
        help="Stage-2 modes to compare",
    )
    parser.add_argument("--warmup", type=int, default=0, help="Warmup iterations per strategy")
    parser.add_argument(
        "--embedding-backend",
        default="fastembed",
        help="Local embedding backend. This runner only accepts fastembed.",
    )
    parser.add_argument(
        "--embedding-model",
        default="code",
        help="Embedding model/profile for the local embedding backend",
    )
    parser.add_argument(
        "--embedding-use-gpu",
        action="store_true",
        help="Enable GPU acceleration for local embeddings. Off by default for stability.",
    )
    parser.add_argument(
        "--reranker-backend",
        default="onnx",
        help="Local reranker backend. Supported local values: onnx, fastembed, legacy.",
    )
    parser.add_argument(
        "--reranker-model",
        default=DEFAULT_LOCAL_ONNX_RERANKER_MODEL,
        help="Reranker model name for the local reranker backend",
    )
    parser.add_argument(
        "--reranker-use-gpu",
        action="store_true",
        help="Enable GPU acceleration for the local reranker. Off by default for stability.",
    )
    parser.add_argument(
        "--skip-dense-baseline",
        action="store_true",
        help="Only compare staged stage2 modes and skip the dense_rerank baseline.",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Validate dataset/config and print the benchmark plan without running retrieval.",
    )
    parser.add_argument(
        "--self-check",
        action="store_true",
        help="Smoke-check the entrypoint by validating dataset, source paths, and stage matrix wiring.",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=DEFAULT_OUTPUT,
        help="Output JSON path",
    )
    parser.add_argument(
        "--progress-output",
        type=Path,
        default=None,
        help="Optional JSON path updated after each query with partial progress and completed runs.",
    )
    return parser
def main() -> None:
    """Entry point: validate inputs, run the strategy matrix, emit JSON.

    Flow: parse/validate CLI args -> load and filter the labeled dataset ->
    (optionally) print a dry-run/self-check plan -> build one isolated
    runtime per strategy -> run warmups and every (query, strategy) pair ->
    aggregate summaries and write the final payload.
    """
    parser = build_parser()
    args = parser.parse_args()
    source_root = args.source.expanduser().resolve()
    if not source_root.exists():
        raise SystemExit(f"Source path does not exist: {source_root}")
    # Basic numeric sanity checks before any expensive work.
    if int(args.k) <= 0:
        raise SystemExit("--k must be > 0")
    if int(args.coarse_k) <= 0:
        raise SystemExit("--coarse-k must be > 0")
    if int(args.coarse_k) < int(args.k):
        raise SystemExit("--coarse-k must be >= --k")
    if int(args.warmup) < 0:
        raise SystemExit("--warmup must be >= 0")
    embedding_backend = str(args.embedding_backend).strip().lower()
    reranker_backend = str(args.reranker_backend).strip().lower()
    _validate_local_only_backends(embedding_backend, reranker_backend)
    baseline_methods = _validate_baseline_methods(args.baseline_methods)
    stage2_modes = _validate_stage2_modes(args.stage2_modes)
    # Load everything, then filter by substring, then apply the limit.
    dataset = _load_labeled_queries(args.queries_file, None)
    dataset = _filter_dataset_by_query_match(dataset, args.query_match)
    dataset = _apply_query_limit(dataset, args.query_limit)
    if not dataset:
        raise SystemExit("No queries to run")
    # Fail fast if any ground-truth path is missing under the source root.
    missing_paths: List[str] = []
    for item in dataset:
        _, _, item_missing = _resolve_expected_paths(source_root, [str(path) for path in item["relevant_paths"]])
        missing_paths.extend(item_missing)
    if missing_paths:
        preview = ", ".join(missing_paths[:3])
        raise SystemExit(
            "Dataset relevant_paths do not resolve under the selected source root. "
            f"Examples: {preview}"
        )
    strategy_specs = _strategy_specs(
        stage2_modes,
        include_dense_baseline=not args.skip_dense_baseline,
        baseline_methods=baseline_methods,
    )
    # Plan-only modes: print the payload and exit without running retrieval.
    if args.dry_run or args.self_check:
        payload = _make_plan_payload(
            args=args,
            source_root=source_root,
            dataset=dataset,
            baseline_methods=baseline_methods,
            stage2_modes=stage2_modes,
            strategy_specs=strategy_specs,
        )
        if args.self_check:
            payload["status"] = "ok"
            payload["checks"] = {
                "dataset_loaded": True,
                "stage2_matrix_size": len(stage2_modes),
                "local_only_validation": True,
                "source_path_exists": True,
            }
        print(json.dumps(payload, ensure_ascii=False, indent=2))
        return
    # Base config shared by all strategies; each runtime deep-copies it.
    config = Config.load()
    config.cascade_strategy = "staged"
    config.enable_staged_rerank = True
    config.enable_cross_encoder_rerank = True
    config.embedding_backend = embedding_backend
    config.embedding_model = str(args.embedding_model).strip()
    config.embedding_use_gpu = bool(args.embedding_use_gpu)
    config.embedding_auto_embed_missing = False
    config.reranker_backend = reranker_backend
    config.reranker_model = str(args.reranker_model).strip()
    config.reranker_use_gpu = bool(args.reranker_use_gpu)
    strategy_runtimes = {
        spec.strategy_key: _build_strategy_runtime(config, spec)
        for spec in strategy_specs
    }
    evaluations: List[QueryEvaluation] = []
    total_queries = len(dataset)
    total_runs = total_queries * len(strategy_specs)
    completed_runs = 0
    try:
        # Optional warmup against the first query with reduced k/coarse_k.
        if int(args.warmup) > 0:
            warm_query = str(dataset[0]["query"]).strip()
            warm_relevant_paths = [str(path) for path in dataset[0]["relevant_paths"]]
            _, warm_relevant, _ = _resolve_expected_paths(source_root, warm_relevant_paths)
            for spec in strategy_specs:
                runtime = strategy_runtimes[spec.strategy_key]
                for _ in range(int(args.warmup)):
                    _run_strategy(
                        runtime.engine,
                        runtime.config,
                        strategy_spec=spec,
                        query=warm_query,
                        source_path=source_root,
                        k=min(int(args.k), 5),
                        coarse_k=min(int(args.coarse_k), 50),
                        relevant=warm_relevant,
                    )
        for index, item in enumerate(dataset, start=1):
            query = str(item.get("query", "")).strip()
            if not query:
                continue
            print(f"[query {index}/{total_queries}] {query}", flush=True)
            relevant_paths, relevant, _ = _resolve_expected_paths(
                source_root,
                [str(path) for path in item["relevant_paths"]],
            )
            runs: Dict[str, StrategyRun] = {}
            for spec in strategy_specs:
                # Snapshot progress before each run so watchers see activity.
                if args.progress_output is not None:
                    _write_json_payload(
                        args.progress_output,
                        _make_progress_payload(
                            args=args,
                            source_root=source_root,
                            strategy_specs=strategy_specs,
                            evaluations=evaluations,
                            query_index=index - 1,
                            total_queries=total_queries,
                            run_index=completed_runs,
                            total_runs=total_runs,
                            current_query=query,
                            current_strategy_key=spec.strategy_key,
                        ),
                    )
                print(
                    f"[run {completed_runs + 1}/{total_runs}] "
                    f"strategy={spec.strategy_key} query={query}",
                    flush=True,
                )
                runtime = strategy_runtimes[spec.strategy_key]
                runs[spec.strategy_key] = _run_strategy(
                    runtime.engine,
                    runtime.config,
                    strategy_spec=spec,
                    query=query,
                    source_path=source_root,
                    k=int(args.k),
                    coarse_k=int(args.coarse_k),
                    relevant=relevant,
                )
                completed_runs += 1
                run = runs[spec.strategy_key]
                outcome = "error" if run.error else "ok"
                print(
                    f"[done {completed_runs}/{total_runs}] "
                    f"strategy={spec.strategy_key} outcome={outcome} "
                    f"latency_ms={run.latency_ms:.2f} "
                    f"first_hit_rank={run.first_hit_rank}",
                    flush=True,
                )
            evaluations.append(
                QueryEvaluation(
                    query=query,
                    intent=str(item.get("intent")) if item.get("intent") is not None else None,
                    notes=str(item.get("notes")) if item.get("notes") is not None else None,
                    relevant_paths=relevant_paths,
                    runs=runs,
                )
            )
            if args.progress_output is not None:
                _write_json_payload(
                    args.progress_output,
                    _make_progress_payload(
                        args=args,
                        source_root=source_root,
                        strategy_specs=strategy_specs,
                        evaluations=evaluations,
                        query_index=index,
                        total_queries=total_queries,
                        run_index=completed_runs,
                        total_runs=total_runs,
                        current_query=query,
                        current_strategy_key="complete",
                    ),
                )
    finally:
        # Best-effort teardown: close all engines first, then registries.
        for runtime in strategy_runtimes.values():
            try:
                runtime.engine.close()
            except Exception:
                pass
        for runtime in strategy_runtimes.values():
            try:
                runtime.registry.close()
            except Exception:
                pass
    # Aggregate per-strategy metrics and the staged-mode comparison matrix.
    strategy_summaries: Dict[str, Dict[str, Any]] = {}
    for spec in strategy_specs:
        spec_runs = [evaluation.runs[spec.strategy_key] for evaluation in evaluations if spec.strategy_key in evaluation.runs]
        summary = _summarize_runs(spec_runs)
        summary["strategy"] = spec.strategy
        summary["stage2_mode"] = spec.stage2_mode
        strategy_summaries[spec.strategy_key] = summary
    stage2_mode_matrix = {
        mode: strategy_summaries[f"staged:{mode}"]
        for mode in stage2_modes
        if f"staged:{mode}" in strategy_summaries
    }
    pairwise_deltas = [asdict(item) for item in _build_pairwise_deltas(stage2_mode_matrix)]
    payload = {
        "status": "completed",
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        "source": str(source_root),
        "queries_file": str(args.queries_file),
        "query_count": len(evaluations),
        "query_match": args.query_match,
        "k": int(args.k),
        "coarse_k": int(args.coarse_k),
        "local_only": True,
        "strategies": strategy_summaries,
        "stage2_mode_matrix": stage2_mode_matrix,
        "pairwise_stage2_deltas": pairwise_deltas,
        "config": {
            "embedding_backend": config.embedding_backend,
            "embedding_model": config.embedding_model,
            "embedding_use_gpu": bool(config.embedding_use_gpu),
            "reranker_backend": config.reranker_backend,
            "reranker_model": config.reranker_model,
            "reranker_use_gpu": bool(config.reranker_use_gpu),
            "enable_staged_rerank": bool(config.enable_staged_rerank),
            "enable_cross_encoder_rerank": bool(config.enable_cross_encoder_rerank),
        },
        "progress_output": str(args.progress_output) if args.progress_output else None,
        "evaluations": [
            {
                "query": evaluation.query,
                "intent": evaluation.intent,
                "notes": evaluation.notes,
                "relevant_paths": evaluation.relevant_paths,
                "runs": {key: asdict(run) for key, run in evaluation.runs.items()},
            }
            for evaluation in evaluations
        ],
    }
    _write_final_outputs(
        output_path=args.output,
        progress_output=args.progress_output,
        payload=payload,
    )
    print(json.dumps(payload, ensure_ascii=False, indent=2))
# Allow running this benchmark directly as a script.
if __name__ == "__main__":
    main()

View File

@@ -1,405 +0,0 @@
"""Compare Binary Cascade and Vector semantic search methods.
This script compares the two semantic retrieval approaches:
1. Binary Cascade: 256-bit binary vectors for coarse ranking
2. Vector Dense: Full semantic embeddings with cosine similarity
"""
import sys
import time
from pathlib import Path
# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from codexlens.storage.dir_index import DirIndexStore
from codexlens.semantic.vector_store import VectorStore
def get_filename(path: str) -> str:
    """Extract the final path component, handling '/' and '\\' separators.

    The previous implementation split only on the first separator kind it
    detected, so a mixed path such as 'a\\b/c.py' wrongly yielded 'b/c.py'.
    Normalizing all backslashes to forward slashes first handles mixed and
    uniform separators alike; a separator-free input is returned unchanged.
    """
    return path.replace("\\", "/").rsplit("/", 1)[-1]
def find_binary_indexes(index_root: Path):
    """Collect every packed binary-vector index file under *index_root*."""
    pattern = "_index_binary_vectors.bin"
    return [candidate for candidate in index_root.rglob(pattern)]
# Test queries for semantic search comparison
# (mix of natural-language intents over the codex-lens codebase).
TEST_QUERIES = [
    "how to search code semantically",
    "embedding generation for files",
    "hybrid search with multiple backends",
    "parse python source code",
    "database storage for vectors",
]
# Index paths
# NOTE(review): machine-specific absolute path — this only works on the
# original author's Windows box; parameterize (env var / CLI flag) before reuse.
INDEX_ROOT = Path(r"C:\Users\dyw\.codexlens\indexes\D\Claude_dms3\codex-lens")
def test_vector_search(query: str, limit: int = 10):
    """Run dense vector search over indexes under INDEX_ROOT.

    Walks every ``_index.db`` until one populated vector store yields results,
    picking the embedder from the store's persisted model config.

    Returns:
        (results, elapsed_ms, error): top-``limit`` results sorted by score
        descending, the embed+search time in milliseconds, and an error
        string (None on success).
    """
    try:
        from codexlens.semantic.factory import get_embedder
        # Find an index with embeddings
        all_results = []
        total_time = 0
        for index_db in INDEX_ROOT.rglob("_index.db"):
            vector_store = VectorStore(index_db)
            if vector_store.count_chunks() == 0:
                continue
            # Get embedder based on stored config
            model_config = vector_store.get_model_config()
            if model_config:
                backend = model_config.get("backend", "fastembed")
                model_name = model_config["model_name"]
                model_profile = model_config["model_profile"]
                if backend == "litellm":
                    embedder = get_embedder(backend="litellm", model=model_name)
                else:
                    embedder = get_embedder(backend="fastembed", profile=model_profile)
            else:
                # No persisted config: fall back to the local code profile.
                embedder = get_embedder(backend="fastembed", profile="code")
            start = time.perf_counter()
            query_embedding = embedder.embed_single(query)
            results = vector_store.search_similar(
                query_embedding=query_embedding,
                top_k=limit,
                min_score=0.0,
                return_full_content=True,
            )
            total_time += (time.perf_counter() - start) * 1000
            all_results.extend(results)
            # Only need one successful search to get embedder initialized
            if results:
                break
        # Sort by score and limit
        all_results.sort(key=lambda x: x.score, reverse=True)
        return all_results[:limit], total_time, None
    except Exception as e:
        # Best-effort benchmark probe: report failure instead of raising.
        return [], 0, str(e)
def test_binary_cascade_search(query: str, limit: int = 10):
    """Test binary cascade search (binary coarse + dense fine ranking).

    Stage 1 retrieves candidates via the 256-bit binary ANN index; Stage 2
    reranks them by cosine similarity against the stored dense embeddings.

    Returns:
        (results, elapsed_ms, error): top-``limit`` dicts with ``path``,
        ``score`` and truncated ``content``, total latency in milliseconds,
        and an error string (None on success).
    """
    try:
        from codexlens.semantic.ann_index import BinaryANNIndex
        from codexlens.indexing.embedding import CascadeEmbeddingBackend
        import numpy as np
        import sqlite3
        # Find binary indexes
        binary_indexes = find_binary_indexes(INDEX_ROOT)
        if not binary_indexes:
            return [], 0, "No binary indexes found. Run 'codexlens cascade-index' first."
        start = time.perf_counter()
        # Initialize cascade backend for query encoding
        cascade_backend = CascadeEmbeddingBackend()
        # Encode query to binary and dense
        binary_embeddings, dense_embeddings = cascade_backend.encode_cascade([query], batch_size=1)
        query_binary = binary_embeddings[0]
        query_dense = dense_embeddings[0]
        all_results = []
        for binary_index_path in binary_indexes:
            # Find corresponding index.db
            index_db = binary_index_path.parent / "_index.db"
            if not index_db.exists():
                continue
            # Check if cascade embeddings exist
            conn = sqlite3.connect(index_db)
            conn.row_factory = sqlite3.Row
            try:
                cursor = conn.execute(
                    "SELECT COUNT(*) FROM semantic_chunks WHERE embedding_binary IS NOT NULL"
                )
                binary_count = cursor.fetchone()[0]
                if binary_count == 0:
                    conn.close()
                    continue
            except Exception:
                # Table missing or schema mismatch: skip this index.
                conn.close()
                continue
            # Stage 1: Binary coarse search
            binary_index = BinaryANNIndex(index_db, dim=256)
            try:
                binary_index.load()
            except Exception:
                conn.close()
                continue
            # Pack query for binary search
            from codexlens.indexing.embedding import pack_binary_embedding
            query_binary_packed = pack_binary_embedding(query_binary)
            # Get top candidates (10x oversampling, capped at 100)
            coarse_limit = min(limit * 10, 100)
            # search returns (ids, distances) tuple
            coarse_ids, coarse_distances = binary_index.search(query_binary_packed, top_k=coarse_limit)
            if not coarse_ids:
                conn.close()
                continue
            # Stage 2: Dense reranking
            chunk_ids = coarse_ids
            placeholders = ",".join("?" * len(chunk_ids))
            cursor = conn.execute(
                f"""
                SELECT id, file_path, content, embedding_dense
                FROM semantic_chunks
                WHERE id IN ({placeholders}) AND embedding_dense IS NOT NULL
                """,
                chunk_ids
            )
            rows = cursor.fetchall()
            # Compute dense scores
            for row in rows:
                chunk_id = row["id"]
                file_path = row["file_path"]
                content = row["content"]
                dense_blob = row["embedding_dense"]
                if dense_blob:
                    dense_vec = np.frombuffer(dense_blob, dtype=np.float32)
                    # Cosine similarity (epsilon guards zero-norm vectors)
                    score = float(np.dot(query_dense, dense_vec) / (
                        np.linalg.norm(query_dense) * np.linalg.norm(dense_vec) + 1e-8
                    ))
                else:
                    score = 0.0
                all_results.append({
                    "path": file_path,
                    "score": score,
                    "content": content[:200] + "..." if len(content) > 200 else content,
                })
            conn.close()
        # Sort by dense score and limit
        all_results.sort(key=lambda x: x["score"], reverse=True)
        final_results = all_results[:limit]
        elapsed = (time.perf_counter() - start) * 1000
        return final_results, elapsed, None
    except ImportError as e:
        return [], 0, f"Import error: {e}"
    except Exception as e:
        import traceback
        return [], 0, f"{str(e)}\n{traceback.format_exc()}"
def print_results(method_name: str, results, elapsed: float, error: str = None):
    """Print search results in a formatted way.

    Args:
        method_name: Human-readable label for the retrieval method.
        results: Sequence of result dicts (``path``/``score``/``content``
            keys) or objects exposing those attributes.
        elapsed: Total search latency in milliseconds.
        error: Optional error string; when set, results are not printed.
    """
    print(f"\n{'='*60}")
    print(f"Method: {method_name}")
    print(f"{'='*60}")
    if error:
        print(f"ERROR: {error}")
        return
    print(f"Results: {len(results)}, Time: {elapsed:.1f}ms")
    print("-" * 60)
    for i, r in enumerate(results[:5], 1):
        if isinstance(r, dict):
            path = r.get("path", "?")
            score = r.get("score", 0)
            content = r.get("content", "")[:80]
        else:
            path = getattr(r, "path", "?")
            score = getattr(r, "score", 0)
            content = getattr(r, "content", "")[:80] if hasattr(r, "content") else ""
        filename = get_filename(path)
        # Fix: print the computed filename — previously `filename` was unused
        # and a literal "(unknown)" placeholder was printed instead.
        print(f" {i}. [{score:.4f}] {filename}")
        if content:
            # Sanitize content for console output
            safe_content = content.encode('ascii', 'replace').decode('ascii')
            print(f" {safe_content}...")
def compare_overlap(results1, results2, name1: str, name2: str):
    """Report top-10 file overlap between two result lists.

    Prints a one-line summary and returns the Jaccard similarity of the
    two path sets (0.0 when either side is empty).
    """
    def collect_paths(results):
        gathered = set()
        for entry in results[:10]:
            if isinstance(entry, dict):
                gathered.add(entry.get("path", ""))
            else:
                gathered.add(getattr(entry, "path", ""))
        return gathered

    paths1 = collect_paths(results1)
    paths2 = collect_paths(results2)
    if not paths1 or not paths2:
        return 0.0
    common = paths1 & paths2
    combined = paths1 | paths2
    overlap = len(common)
    jaccard = overlap / len(combined) if combined else 0.0
    print(f" {name1} vs {name2}: {overlap} common files (Jaccard: {jaccard:.2f})")
    return jaccard
def main():
    """Run the Binary-Cascade vs Vector-Dense comparison over TEST_QUERIES.

    Prints per-query results, overlap analysis, aggregate statistics, and a
    fusion-strategy recommendation. Pure console report — nothing is saved.
    """
    print("=" * 70)
    print("SEMANTIC SEARCH METHODS COMPARISON")
    print("Binary Cascade vs Vector Dense")
    print("=" * 70)
    # Check prerequisites
    print("\n[Prerequisites Check]")
    print(f" Index Root: {INDEX_ROOT}")
    binary_indexes = find_binary_indexes(INDEX_ROOT)
    print(f" Binary Indexes: {len(binary_indexes)} found")
    for bi in binary_indexes[:3]:
        print(f" - {bi.parent.name}/{bi.name}")
    if len(binary_indexes) > 3:
        print(f" ... and {len(binary_indexes) - 3} more")
    # Aggregate statistics
    all_results = {
        "binary": {"total_results": 0, "total_time": 0, "queries": 0, "errors": []},
        "vector": {"total_results": 0, "total_time": 0, "queries": 0, "errors": []},
    }
    overlap_scores = {"binary_vector": []}
    for query in TEST_QUERIES:
        print(f"\n{'#'*70}")
        print(f"QUERY: \"{query}\"")
        print("#" * 70)
        # Test each method
        binary_results, binary_time, binary_err = test_binary_cascade_search(query)
        vector_results, vector_time, vector_err = test_vector_search(query)
        # Print results
        print_results("Binary Cascade (256-bit + Dense Rerank)", binary_results, binary_time, binary_err)
        print_results("Vector Dense (Semantic Embeddings)", vector_results, vector_time, vector_err)
        # Update statistics
        if not binary_err:
            all_results["binary"]["total_results"] += len(binary_results)
            all_results["binary"]["total_time"] += binary_time
            all_results["binary"]["queries"] += 1
        else:
            all_results["binary"]["errors"].append(binary_err)
        if not vector_err:
            all_results["vector"]["total_results"] += len(vector_results)
            all_results["vector"]["total_time"] += vector_time
            all_results["vector"]["queries"] += 1
        else:
            all_results["vector"]["errors"].append(vector_err)
        # Compare overlap
        print("\n[Result Overlap Analysis]")
        if binary_results and vector_results:
            j = compare_overlap(binary_results, vector_results, "Binary", "Vector")
            overlap_scores["binary_vector"].append(j)
    # Print summary
    print("\n" + "=" * 70)
    print("SUMMARY STATISTICS")
    print("=" * 70)
    for method, stats in all_results.items():
        queries = stats["queries"]
        if queries > 0:
            avg_results = stats["total_results"] / queries
            avg_time = stats["total_time"] / queries
            print(f"\n{method.upper()}:")
            print(f" Successful queries: {queries}/{len(TEST_QUERIES)}")
            print(f" Avg results: {avg_results:.1f}")
            print(f" Avg time: {avg_time:.1f}ms")
        else:
            print(f"\n{method.upper()}: No successful queries")
            if stats["errors"]:
                # Show truncated error
                err = stats["errors"][0]
                if len(err) > 200:
                    err = err[:200] + "..."
                print(f" Error: {err}")
    print("\n[Average Overlap Scores]")
    for pair, scores in overlap_scores.items():
        if scores:
            avg = sum(scores) / len(scores)
            print(f" {pair}: {avg:.3f}")
    print("\n" + "=" * 70)
    print("ANALYSIS")
    print("=" * 70)
    # Analyze working methods
    working_methods = [m for m, s in all_results.items() if s["queries"] > 0]
    if len(working_methods) == 2:
        # All methods working - compare quality
        print("\nBoth methods working. Quality comparison:")
        # Compare avg results
        print("\n Result Coverage (higher = more recall):")
        for m in ["vector", "binary"]:
            stats = all_results[m]
            if stats["queries"] > 0:
                avg = stats["total_results"] / stats["queries"]
                print(f" {m.upper()}: {avg:.1f} results/query")
        # Compare speed
        print("\n Speed (lower = faster):")
        for m in ["binary", "vector"]:
            stats = all_results[m]
            if stats["queries"] > 0:
                avg = stats["total_time"] / stats["queries"]
                print(f" {m.upper()}: {avg:.1f}ms")
        # Recommend fusion strategy
        print("\n Recommended Fusion Strategy:")
        print(" For quality-focused hybrid search:")
        print(" 1. Run both methods in parallel")
        print(" 2. Use RRF fusion with weights:")
        print(" - Vector: 0.6 (best semantic understanding)")
        print(" - Binary: 0.4 (fast coarse filtering)")
        print(" 3. Apply CrossEncoder reranking on top-50")
    elif len(working_methods) >= 2:
        # NOTE(review): unreachable with only two methods — len()==2 is caught
        # above and len() can never exceed 2 here; likely a leftover from a
        # three-method version of this script.
        print(f"\n{len(working_methods)} methods working: {', '.join(working_methods)}")
        print("Consider fixing missing method for complete hybrid search.")
    else:
        print(f"\nOnly {working_methods[0] if working_methods else 'no'} method(s) working.")
        print("Check your index setup.")
if __name__ == "__main__":
main()

View File

@@ -1,393 +0,0 @@
#!/usr/bin/env python
"""Compare staged realtime LSP pipeline vs direct dense->rerank cascade.
This benchmark compares two retrieval pipelines:
1) staged+realtime: coarse (binary or dense fallback) -> realtime LSP graph expand -> clustering -> rerank
2) dense_rerank: dense ANN coarse -> cross-encoder rerank
Because most repos do not have ground-truth labels, this script reports:
- latency statistics
- top-k overlap metrics (Jaccard + RBO)
- diversity proxies (unique files/dirs)
- staged pipeline stage stats (if present)
Usage:
python benchmarks/compare_staged_realtime_vs_dense_rerank.py --source ./src
python benchmarks/compare_staged_realtime_vs_dense_rerank.py --queries-file benchmarks/queries.txt
"""
from __future__ import annotations
import argparse
import gc
import json
import os
import re
import statistics
import sys
import time
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple
# Add src to path (match other benchmark scripts)
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from codexlens.config import Config
from codexlens.search.chain_search import ChainSearchEngine, SearchOptions
from codexlens.storage.path_mapper import PathMapper
from codexlens.storage.registry import RegistryStore
# Fallback query set used when no --queries-file is supplied; mixes exact
# symbol lookups with natural-language intents.
DEFAULT_QUERIES = [
    "class Config",
    "def search",
    "LspBridge",
    "graph expansion",
    "clustering strategy",
    "error handling",
    "how to parse json",
]
def _now_ms() -> float:
return time.perf_counter() * 1000.0
def _safe_relpath(path: str, root: Path) -> str:
try:
return str(Path(path).resolve().relative_to(root.resolve()))
except Exception:
return path
def _normalize_path_key(path: str) -> str:
"""Normalize file paths for overlap/dedup metrics (Windows-safe)."""
try:
p = Path(path)
# Don't explode on non-files like "<memory>".
if str(p) and (p.is_absolute() or re.match(r"^[A-Za-z]:", str(p))):
norm = str(p.resolve())
else:
norm = str(p)
except Exception:
norm = path
norm = norm.replace("/", "\\")
if os.name == "nt":
norm = norm.lower()
return norm
def _extract_stage_stats(errors: List[str]) -> Optional[Dict[str, Any]]:
"""Extract STAGE_STATS JSON blob from SearchStats.errors."""
for item in errors or []:
if not isinstance(item, str):
continue
if not item.startswith("STAGE_STATS:"):
continue
payload = item[len("STAGE_STATS:") :]
try:
return json.loads(payload)
except Exception:
return None
return None
def jaccard_topk(a: List[str], b: List[str]) -> float:
    """Jaccard similarity of two result lists (1.0 when both are empty)."""
    set_a = set(a)
    set_b = set(b)
    if not set_a and not set_b:
        return 1.0
    if not set_a or not set_b:
        return 0.0
    union = set_a | set_b
    return len(set_a & set_b) / len(union)
def rbo(a: List[str], b: List[str], p: float = 0.9) -> float:
    """Rank-biased overlap for two ranked lists.

    Higher *p* weights deeper ranks more heavily. Raises ValueError when
    *p* falls outside the open interval (0, 1).
    """
    if p <= 0.0 or p >= 1.0:
        raise ValueError("p must be in (0, 1)")
    if not a and not b:
        return 1.0
    prefix_a: set[str] = set()
    prefix_b: set[str] = set()
    total = 0.0
    for depth in range(1, max(len(a), len(b)) + 1):
        if depth <= len(a):
            prefix_a.add(a[depth - 1])
        if depth <= len(b):
            prefix_b.add(b[depth - 1])
        agreement = len(prefix_a & prefix_b) / depth
        total += agreement * (1.0 - p) * p ** (depth - 1)
    return total
def _unique_parent_dirs(paths: Iterable[str]) -> int:
dirs = set()
for p in paths:
try:
dirs.add(str(Path(p).parent))
except Exception:
continue
return len(dirs)
@dataclass
class RunDetail:
    """Outcome of one cascade_search call for a (strategy, query) pair."""

    strategy: str  # cascade strategy used ("staged" or "dense_rerank")
    query: str  # the search query text
    latency_ms: float  # wall-clock latency of the call in milliseconds
    num_results: int  # number of result paths returned
    topk_paths: List[str]  # normalized, de-duplicated top-k result paths
    stage_stats: Optional[Dict[str, Any]] = None  # parsed STAGE_STATS blob, when present
    error: Optional[str] = None  # repr() of the exception when the run failed
@dataclass
class CompareDetail:
    """Per-query comparison between the staged and dense_rerank runs."""

    query: str  # the search query text
    staged: RunDetail  # staged+realtime pipeline run
    dense_rerank: RunDetail  # direct dense->rerank run
    jaccard_topk: float  # set overlap of the two top-k path lists
    rbo_topk: float  # rank-biased overlap (p=0.9) of the two lists
    staged_unique_files_topk: int  # diversity proxy: distinct files in staged top-k
    dense_unique_files_topk: int  # diversity proxy: distinct files in dense top-k
    staged_unique_dirs_topk: int  # diversity proxy: distinct parent dirs (staged)
    dense_unique_dirs_topk: int  # diversity proxy: distinct parent dirs (dense)
def _run_once(
    engine: ChainSearchEngine,
    query: str,
    source_path: Path,
    *,
    strategy: str,
    k: int,
    coarse_k: int,
    options: Optional[SearchOptions] = None,
) -> RunDetail:
    """Execute one cascade_search and capture latency, top-k paths and stats.

    Never raises: failures are recorded in RunDetail.error so one bad query
    does not abort the whole benchmark.
    """
    # Collect garbage before timing so allocator noise from the previous run
    # does not leak into this measurement.
    gc.collect()
    start_ms = _now_ms()
    try:
        result = engine.cascade_search(
            query=query,
            source_path=source_path,
            k=k,
            coarse_k=coarse_k,
            options=options,
            strategy=strategy,
        )
        latency_ms = _now_ms() - start_ms
        paths_raw = [r.path for r in (result.results or []) if getattr(r, "path", None)]
        paths = [_normalize_path_key(p) for p in paths_raw]
        # De-duplicate while preserving rank order, truncated to k.
        topk: List[str] = []
        seen: set[str] = set()
        for p in paths:
            if p in seen:
                continue
            seen.add(p)
            topk.append(p)
            if len(topk) >= k:
                break
        stage_stats = _extract_stage_stats(getattr(result.stats, "errors", []))
        return RunDetail(
            strategy=strategy,
            query=query,
            latency_ms=latency_ms,
            num_results=len(paths),
            topk_paths=topk,
            stage_stats=stage_stats,
        )
    except Exception as exc:
        latency_ms = _now_ms() - start_ms
        return RunDetail(
            strategy=strategy,
            query=query,
            latency_ms=latency_ms,
            num_results=0,
            topk_paths=[],
            stage_stats=None,
            error=repr(exc),
        )
def _load_queries(path: Optional[Path], limit: Optional[int]) -> List[str]:
if path is None:
queries = list(DEFAULT_QUERIES)
else:
raw = path.read_text(encoding="utf-8", errors="ignore").splitlines()
queries = []
for line in raw:
line = line.strip()
if not line or line.startswith("#"):
continue
queries.append(line)
if limit is not None:
return queries[:limit]
return queries
def main() -> None:
    """CLI entry point: benchmark staged vs dense_rerank and write a JSON report."""
    parser = argparse.ArgumentParser(
        description="Compare staged realtime LSP pipeline vs direct dense_rerank cascade"
    )
    parser.add_argument(
        "--source",
        type=Path,
        default=Path(__file__).parent.parent / "src",
        help="Source directory to search (default: ./src)",
    )
    parser.add_argument(
        "--queries-file",
        type=Path,
        default=None,
        help="Optional file with one query per line (# comments supported)",
    )
    parser.add_argument("--queries", type=int, default=None, help="Limit number of queries")
    parser.add_argument("--k", type=int, default=10, help="Final result count (default 10)")
    parser.add_argument("--coarse-k", type=int, default=100, help="Coarse candidates (default 100)")
    parser.add_argument("--warmup", type=int, default=1, help="Warmup runs per strategy (default 1)")
    parser.add_argument(
        "--staged-cluster-strategy",
        type=str,
        default=None,
        help="Override Config.staged_clustering_strategy for staged pipeline (e.g. auto, dir_rr, score, path)",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=Path(__file__).parent / "results" / "staged_realtime_vs_dense_rerank.json",
        help="Output JSON path",
    )
    args = parser.parse_args()
    if not args.source.exists():
        raise SystemExit(f"Source path does not exist: {args.source}")
    queries = _load_queries(args.queries_file, args.queries)
    if not queries:
        raise SystemExit("No queries to run")
    # Match CLI behavior: load settings + apply global/workspace .env overrides.
    # This is important on Windows where ONNX/DirectML can sometimes crash under load;
    # many users pin EMBEDDING_BACKEND=litellm in ~/.codexlens/.env for stability.
    config = Config.load()
    config.cascade_strategy = "staged"
    config.staged_stage2_mode = "realtime"
    config.enable_staged_rerank = True
    if args.staged_cluster_strategy:
        config.staged_clustering_strategy = str(args.staged_cluster_strategy)
    # Stability: on some Windows setups, fastembed + DirectML can crash under load.
    # Force local embeddings and reranking onto CPU for reproducible benchmark runs.
    config.embedding_use_gpu = False
    config.reranker_use_gpu = False
    registry = RegistryStore()
    registry.initialize()
    mapper = PathMapper()
    engine = ChainSearchEngine(registry=registry, mapper=mapper, config=config)
    try:
        strategies = ["staged", "dense_rerank"]
        # Warmup: prime model/index caches so timed runs are comparable.
        if args.warmup > 0:
            warm_query = queries[0]
            for s in strategies:
                for _ in range(args.warmup):
                    try:
                        _run_once(
                            engine,
                            warm_query,
                            args.source,
                            strategy=s,
                            k=min(args.k, 5),
                            coarse_k=min(args.coarse_k, 50),
                        )
                    except Exception:
                        # Warmup failures are irrelevant to the measurements.
                        pass
        comparisons: List[CompareDetail] = []
        for i, query in enumerate(queries, start=1):
            print(f"[{i}/{len(queries)}] {query}")
            staged = _run_once(
                engine,
                query,
                args.source,
                strategy="staged",
                k=args.k,
                coarse_k=args.coarse_k,
            )
            dense = _run_once(
                engine,
                query,
                args.source,
                strategy="dense_rerank",
                k=args.k,
                coarse_k=args.coarse_k,
            )
            staged_paths = staged.topk_paths
            dense_paths = dense.topk_paths
            comparisons.append(
                CompareDetail(
                    query=query,
                    staged=staged,
                    dense_rerank=dense,
                    jaccard_topk=jaccard_topk(staged_paths, dense_paths),
                    rbo_topk=rbo(staged_paths, dense_paths, p=0.9),
                    staged_unique_files_topk=len(set(staged_paths)),
                    dense_unique_files_topk=len(set(dense_paths)),
                    staged_unique_dirs_topk=_unique_parent_dirs(staged_paths),
                    dense_unique_dirs_topk=_unique_parent_dirs(dense_paths),
                )
            )
        def _latencies(details: List[RunDetail]) -> List[float]:
            # Failed runs are excluded so errors don't skew latency averages.
            return [d.latency_ms for d in details if not d.error]
        staged_runs = [c.staged for c in comparisons]
        dense_runs = [c.dense_rerank for c in comparisons]
        summary = {
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "source": str(args.source),
            "k": args.k,
            "coarse_k": args.coarse_k,
            "query_count": len(comparisons),
            "avg_jaccard_topk": statistics.mean([c.jaccard_topk for c in comparisons]) if comparisons else 0.0,
            "avg_rbo_topk": statistics.mean([c.rbo_topk for c in comparisons]) if comparisons else 0.0,
            "staged": {
                "success": sum(1 for r in staged_runs if not r.error),
                "avg_latency_ms": statistics.mean(_latencies(staged_runs)) if _latencies(staged_runs) else 0.0,
            },
            "dense_rerank": {
                "success": sum(1 for r in dense_runs if not r.error),
                "avg_latency_ms": statistics.mean(_latencies(dense_runs)) if _latencies(dense_runs) else 0.0,
            },
        }
        args.output.parent.mkdir(parents=True, exist_ok=True)
        payload = {
            "summary": summary,
            "comparisons": [asdict(c) for c in comparisons],
        }
        args.output.write_text(json.dumps(payload, indent=2), encoding="utf-8")
        print(f"\nSaved: {args.output}")
    finally:
        # Best-effort teardown: report close() failures but never mask the
        # benchmark's own exception.
        try:
            engine.close()
        except Exception as exc:
            print(f"WARNING engine.close() failed: {exc!r}", file=sys.stderr)
        try:
            registry.close()
        except Exception as exc:
            print(f"WARNING registry.close() failed: {exc!r}", file=sys.stderr)
if __name__ == "__main__":
main()

View File

@@ -1,391 +0,0 @@
#!/usr/bin/env python
"""Compare staged cascade Stage-2 modes (precomputed vs realtime vs static graph).
This benchmark compares the *same* staged cascade strategy with different Stage-2
expansion sources:
1) precomputed: per-dir `graph_neighbors` expansion (fast, index-local)
2) realtime: live LSP graph expansion (contextual, requires LSP availability)
3) static_global_graph: global_relationships expansion (project-wide, requires static graph indexing)
Because most repos do not have ground-truth labels, this script reports:
- latency statistics per mode
- top-k overlap metrics (Jaccard + RBO) between modes
- diversity proxies (unique files/dirs)
- staged pipeline stage stats (when present)
Usage:
python benchmarks/compare_staged_stage2_modes.py --source ./src
python benchmarks/compare_staged_stage2_modes.py --queries-file benchmarks/queries.txt
"""
from __future__ import annotations
import argparse
import gc
import json
import os
import re
import statistics
import sys
import time
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple
# Add src to path (match other benchmark scripts)
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from codexlens.config import Config
from codexlens.search.chain_search import ChainSearchEngine
from codexlens.storage.path_mapper import PathMapper
from codexlens.storage.registry import RegistryStore
# Fallback query set used when neither --queries nor --queries-file is given;
# mixes exact symbol lookups with natural-language intents.
DEFAULT_QUERIES = [
    "class Config",
    "def search",
    "LspBridge",
    "graph expansion",
    "static graph relationships",
    "clustering strategy",
    "error handling",
]
# Accepted values for --stage2-modes / Config.staged_stage2_mode.
VALID_STAGE2_MODES = ("precomputed", "realtime", "static_global_graph")
def _now_ms() -> float:
return time.perf_counter() * 1000.0
def _normalize_path_key(path: str) -> str:
    """Normalize file paths for overlap/dedup metrics (Windows-safe).

    Real-looking (absolute or drive-prefixed) paths are resolved; placeholders
    such as "<memory>" pass through untouched. Separators are canonicalized to
    backslashes and, on Windows only, the result is lower-cased.
    """
    try:
        p = Path(path)
        # Only resolve paths that look rooted, so "<memory>"-style markers
        # are not expanded against the current working directory.
        if str(p) and (p.is_absolute() or re.match(r"^[A-Za-z]:", str(p))):
            norm = str(p.resolve())
        else:
            norm = str(p)
    except Exception:
        norm = path
    norm = norm.replace("/", "\\")
    if os.name == "nt":
        norm = norm.lower()
    return norm
def _extract_stage_stats(errors: List[str]) -> Optional[Dict[str, Any]]:
"""Extract STAGE_STATS JSON blob from SearchStats.errors."""
for item in errors or []:
if not isinstance(item, str):
continue
if not item.startswith("STAGE_STATS:"):
continue
payload = item[len("STAGE_STATS:") :]
try:
return json.loads(payload)
except Exception:
return None
return None
def jaccard_topk(a: List[str], b: List[str]) -> float:
    """Jaccard similarity of two result lists (1.0 when both are empty)."""
    sa, sb = set(a), set(b)
    if not sa and not sb:
        # Two empty lists agree perfectly by convention.
        return 1.0
    if not sa or not sb:
        return 0.0
    return len(sa & sb) / len(sa | sb)
def rbo(a: List[str], b: List[str], p: float = 0.9) -> float:
    """Rank-biased overlap for two ranked lists.

    Higher *p* gives deeper ranks more weight; raises ValueError when *p*
    falls outside the open interval (0, 1).
    """
    if p <= 0.0 or p >= 1.0:
        raise ValueError("p must be in (0, 1)")
    if not a and not b:
        # Two empty rankings agree perfectly by convention.
        return 1.0
    depth = max(len(a), len(b))
    seen_a: set[str] = set()
    seen_b: set[str] = set()
    score = 0.0
    for d in range(1, depth + 1):
        if d <= len(a):
            seen_a.add(a[d - 1])
        if d <= len(b):
            seen_b.add(b[d - 1])
        overlap = len(seen_a & seen_b)
        # Agreement at depth d, weighted geometrically by p.
        score += (overlap / d) * ((1.0 - p) * (p ** (d - 1)))
    return score
def _unique_parent_dirs(paths: Iterable[str]) -> int:
    """Count distinct parent directories among *paths* (diversity proxy)."""
    dirs = set()
    for p in paths:
        try:
            dirs.add(str(Path(p).parent))
        except Exception:
            # Skip entries that cannot be interpreted as paths.
            continue
    return len(dirs)
def _load_queries(path: Optional[Path], inline: Optional[List[str]]) -> List[str]:
if inline:
return [q.strip() for q in inline if isinstance(q, str) and q.strip()]
if path:
if not path.exists():
raise SystemExit(f"Queries file does not exist: {path}")
raw = path.read_text(encoding="utf-8", errors="ignore")
queries = [line.strip() for line in raw.splitlines() if line.strip() and not line.strip().startswith("#")]
return queries
return list(DEFAULT_QUERIES)
@dataclass
class RunDetail:
    """Outcome of one staged cascade_search call for a (stage2_mode, query) pair."""

    stage2_mode: str  # Stage-2 expansion source: precomputed / realtime / static_global_graph
    query: str  # the search query text
    latency_ms: float  # wall-clock latency of the call in milliseconds
    num_results: int  # raw result count returned by the engine
    topk_paths: List[str]  # normalized, de-duplicated top-k result paths
    stage_stats: Optional[Dict[str, Any]] = None  # parsed STAGE_STATS blob, when present
    error: Optional[str] = None  # stringified exception when the run failed
@dataclass
class PairwiseCompare:
    """Overlap/diversity metrics between two Stage-2 modes for one query."""

    query: str  # the search query text
    mode_a: str  # first Stage-2 mode in the pair
    mode_b: str  # second Stage-2 mode in the pair
    jaccard_topk: float  # set overlap of the two top-k path lists
    rbo_topk: float  # rank-biased overlap (p=0.9) of the two lists
    a_unique_files_topk: int  # distinct files in mode_a's top-k
    b_unique_files_topk: int  # distinct files in mode_b's top-k
    a_unique_dirs_topk: int  # distinct parent dirs in mode_a's top-k
    b_unique_dirs_topk: int  # distinct parent dirs in mode_b's top-k
def _run_once(
    engine: ChainSearchEngine,
    config: Config,
    query: str,
    source_path: Path,
    *,
    stage2_mode: str,
    k: int,
    coarse_k: int,
) -> RunDetail:
    """Execute one staged search under the given Stage-2 mode.

    Never raises for search failures: errors are captured in RunDetail.error
    so one bad query/mode does not abort the benchmark.
    """
    if stage2_mode not in VALID_STAGE2_MODES:
        raise ValueError(f"Invalid stage2_mode: {stage2_mode}")
    # Mutate config for this run; ChainSearchEngine reads config fields per-call.
    config.staged_stage2_mode = stage2_mode
    # Collect garbage before timing to reduce allocator noise in latency.
    gc.collect()
    start_ms = _now_ms()
    try:
        result = engine.cascade_search(
            query=query,
            source_path=source_path,
            k=k,
            coarse_k=coarse_k,
            strategy="staged",
        )
        latency_ms = _now_ms() - start_ms
        paths_raw = [r.path for r in (result.results or []) if getattr(r, "path", None)]
        paths = [_normalize_path_key(p) for p in paths_raw]
        # De-duplicate while preserving rank order, truncated to k.
        topk: List[str] = []
        seen: set[str] = set()
        for p in paths:
            if p in seen:
                continue
            seen.add(p)
            topk.append(p)
            if len(topk) >= k:
                break
        stage_stats = None
        try:
            stage_stats = _extract_stage_stats(getattr(result.stats, "errors", []) or [])
        except Exception:
            # Stage stats are diagnostic only; never fail the run over them.
            stage_stats = None
        return RunDetail(
            stage2_mode=stage2_mode,
            query=query,
            latency_ms=latency_ms,
            num_results=len(result.results or []),
            topk_paths=topk,
            stage_stats=stage_stats,
            error=None,
        )
    except Exception as exc:
        return RunDetail(
            stage2_mode=stage2_mode,
            query=query,
            latency_ms=_now_ms() - start_ms,
            num_results=0,
            topk_paths=[],
            stage_stats=None,
            error=str(exc),
        )
def main() -> None:
    """CLI entry point: benchmark each Stage-2 mode and write a JSON report."""
    parser = argparse.ArgumentParser(description="Compare staged Stage-2 expansion modes.")
    parser.add_argument("--source", type=Path, default=Path.cwd(), help="Project path to search")
    parser.add_argument("--queries-file", type=Path, default=None, help="Optional newline-delimited queries file")
    parser.add_argument("--queries", nargs="*", default=None, help="Inline queries (overrides queries-file)")
    parser.add_argument("--k", type=int, default=20, help="Top-k to evaluate")
    parser.add_argument("--coarse-k", type=int, default=100, help="Stage-1 coarse_k")
    parser.add_argument(
        "--stage2-modes",
        nargs="*",
        default=list(VALID_STAGE2_MODES),
        help="Stage-2 modes to compare",
    )
    parser.add_argument("--warmup", type=int, default=0, help="Warmup iterations per mode")
    parser.add_argument(
        "--output",
        type=Path,
        default=Path(__file__).parent / "results" / "staged_stage2_modes.json",
        help="Output JSON path",
    )
    args = parser.parse_args()
    if not args.source.exists():
        raise SystemExit(f"Source path does not exist: {args.source}")
    # Validate requested modes up front so a typo fails fast.
    stage2_modes = [str(m).strip().lower() for m in (args.stage2_modes or []) if str(m).strip()]
    for m in stage2_modes:
        if m not in VALID_STAGE2_MODES:
            raise SystemExit(f"Invalid --stage2-modes entry: {m} (valid: {', '.join(VALID_STAGE2_MODES)})")
    queries = _load_queries(args.queries_file, args.queries)
    if not queries:
        raise SystemExit("No queries to run")
    # Match CLI behavior: load settings + apply global/workspace .env overrides.
    config = Config.load()
    config.cascade_strategy = "staged"
    config.enable_staged_rerank = True
    config.embedding_use_gpu = False  # stability on some Windows setups
    registry = RegistryStore()
    registry.initialize()
    mapper = PathMapper()
    engine = ChainSearchEngine(registry=registry, mapper=mapper, config=config)
    try:
        # Warmup: prime model/index caches so timed runs are comparable.
        if args.warmup > 0:
            warm_query = queries[0]
            for mode in stage2_modes:
                for _ in range(args.warmup):
                    try:
                        _run_once(
                            engine,
                            config,
                            warm_query,
                            args.source,
                            stage2_mode=mode,
                            k=min(args.k, 5),
                            coarse_k=min(args.coarse_k, 50),
                        )
                    except Exception:
                        # Warmup failures are irrelevant to the measurements.
                        pass
        per_query: Dict[str, Dict[str, RunDetail]] = {}
        runs: List[RunDetail] = []
        comparisons: List[PairwiseCompare] = []
        for i, query in enumerate(queries, start=1):
            print(f"[{i}/{len(queries)}] {query}")
            per_query[query] = {}
            for mode in stage2_modes:
                detail = _run_once(
                    engine,
                    config,
                    query,
                    args.source,
                    stage2_mode=mode,
                    k=args.k,
                    coarse_k=args.coarse_k,
                )
                per_query[query][mode] = detail
                runs.append(detail)
            # Pairwise overlaps for this query
            for a_idx in range(len(stage2_modes)):
                for b_idx in range(a_idx + 1, len(stage2_modes)):
                    mode_a = stage2_modes[a_idx]
                    mode_b = stage2_modes[b_idx]
                    a = per_query[query][mode_a]
                    b = per_query[query][mode_b]
                    comparisons.append(
                        PairwiseCompare(
                            query=query,
                            mode_a=mode_a,
                            mode_b=mode_b,
                            jaccard_topk=jaccard_topk(a.topk_paths, b.topk_paths),
                            rbo_topk=rbo(a.topk_paths, b.topk_paths, p=0.9),
                            a_unique_files_topk=len(set(a.topk_paths)),
                            b_unique_files_topk=len(set(b.topk_paths)),
                            a_unique_dirs_topk=_unique_parent_dirs(a.topk_paths),
                            b_unique_dirs_topk=_unique_parent_dirs(b.topk_paths),
                        )
                    )
        def _latencies(details: List[RunDetail]) -> List[float]:
            # Failed runs are excluded so errors don't skew latency stats.
            return [d.latency_ms for d in details if not d.error]
        mode_summaries: Dict[str, Dict[str, Any]] = {}
        for mode in stage2_modes:
            mode_runs = [r for r in runs if r.stage2_mode == mode]
            lat = _latencies(mode_runs)
            mode_summaries[mode] = {
                "success": sum(1 for r in mode_runs if not r.error),
                "avg_latency_ms": statistics.mean(lat) if lat else 0.0,
                "p50_latency_ms": statistics.median(lat) if lat else 0.0,
                # quantiles(n=20)[18] is the 95th percentile; needs >= 2 samples.
                "p95_latency_ms": statistics.quantiles(lat, n=20)[18] if len(lat) >= 2 else (lat[0] if lat else 0.0),
            }
        summary = {
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "source": str(args.source),
            "k": args.k,
            "coarse_k": args.coarse_k,
            "query_count": len(queries),
            "stage2_modes": stage2_modes,
            "modes": mode_summaries,
            "avg_pairwise_jaccard_topk": statistics.mean([c.jaccard_topk for c in comparisons]) if comparisons else 0.0,
            "avg_pairwise_rbo_topk": statistics.mean([c.rbo_topk for c in comparisons]) if comparisons else 0.0,
        }
        args.output.parent.mkdir(parents=True, exist_ok=True)
        payload = {
            "summary": summary,
            "runs": [asdict(r) for r in runs],
            "comparisons": [asdict(c) for c in comparisons],
        }
        args.output.write_text(json.dumps(payload, indent=2), encoding="utf-8")
        print(f"\nSaved: {args.output}")
    finally:
        # Best-effort teardown: report close() failures without masking the
        # benchmark's own exception.
        try:
            engine.close()
        except Exception as exc:
            print(f"WARNING engine.close() failed: {exc!r}", file=sys.stderr)
        try:
            registry.close()
        except Exception as exc:
            print(f"WARNING registry.close() failed: {exc!r}", file=sys.stderr)
if __name__ == "__main__":
main()

View File

@@ -1,527 +0,0 @@
"""Analysis script for hybrid search method contribution and storage architecture.
This script analyzes:
1. Individual method contribution in hybrid search (FTS/Vector)
2. Storage architecture conflicts between different retrieval methods
3. FTS + Rerank fusion experiment
"""
import json
import sqlite3
import time
from pathlib import Path
from typing import Dict, List, Tuple, Any
from collections import defaultdict
# Add project root to path
import sys
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from codexlens.storage.registry import RegistryStore
from codexlens.storage.path_mapper import PathMapper
from codexlens.search.hybrid_search import HybridSearchEngine
from codexlens.search.ranking import (
reciprocal_rank_fusion,
cross_encoder_rerank,
DEFAULT_WEIGHTS,
)
from codexlens.entities import SearchResult
def find_project_index(source_path: Path) -> Path:
    """Locate the index database for a project.

    Tries the direct source->index mapping first, then falls back to the
    nearest ancestor index registered in the registry. The returned path is
    not guaranteed to exist when neither lookup succeeds.
    """
    registry = RegistryStore()
    registry.initialize()
    candidate = PathMapper().source_to_index_db(source_path)
    if not candidate.exists():
        nearest = registry.find_nearest_index(source_path)
        if nearest is not None:
            candidate = nearest.index_path
    registry.close()
    return candidate
def analyze_storage_architecture(index_path: Path) -> Dict[str, Any]:
    """Analyze storage tables and check for conflicts.

    Inspects every table in the index database, recording row counts and
    column names, then flags the known data-overlap conflict between the
    ``chunks`` and ``semantic_chunks`` tables (different retrieval paths
    read from different tables).

    Args:
        index_path: Path to the SQLite index database.

    Returns:
        Dictionary with keys:
          - ``tables``: per-table ``{"row_count", "columns"}`` (or ``{"error"}``)
          - ``conflicts``: detected overlap issues
          - ``recommendations``: advisory strings
    """
    results = {
        "tables": {},
        "conflicts": [],
        "recommendations": [],
    }
    with sqlite3.connect(index_path) as conn:
        # Enumerate all user tables.
        cursor = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name"
        )
        tables = [row[0] for row in cursor.fetchall()]
        for table in tables:
            # Quote the identifier: names come from sqlite_master and may
            # contain characters that break unquoted SQL.
            try:
                count = conn.execute(f'SELECT COUNT(*) FROM "{table}"').fetchone()[0]
                cols = conn.execute(f'PRAGMA table_info("{table}")').fetchall()
                col_names = [c[1] for c in cols]
                results["tables"][table] = {
                    "row_count": count,
                    "columns": col_names,
                }
            except Exception as e:
                results["tables"][table] = {"error": str(e)}
        # 1. chunks vs semantic_chunks: both populated means two retrieval
        #    code paths read the "same" data from different tables.
        if "chunks" in tables and "semantic_chunks" in tables:
            # .get() guards against tables that failed inspection above.
            chunks_count = results["tables"]["chunks"].get("row_count", 0)
            semantic_count = results["tables"]["semantic_chunks"].get("row_count", 0)
            if chunks_count > 0 and semantic_count > 0:
                # Measure how many chunk IDs exist in both tables.
                overlap = conn.execute(
                    """
                    SELECT COUNT(*) FROM chunks c
                    JOIN semantic_chunks sc ON c.id = sc.id
                    """
                ).fetchone()[0]
                results["conflicts"].append({
                    "type": "table_overlap",
                    "tables": ["chunks", "semantic_chunks"],
                    "chunks_count": chunks_count,
                    "semantic_count": semantic_count,
                    "id_overlap": overlap,
                    "description": (
                        f"Both chunks ({chunks_count}) and semantic_chunks ({semantic_count}) "
                        f"have data. ID overlap: {overlap}. "
                        "This can cause confusion - binary_cascade reads from semantic_chunks "
                        "but SQLiteStore reads from chunks."
                    ),
                })
            elif chunks_count == 0 and semantic_count > 0:
                results["recommendations"].append(
                    "chunks table is empty but semantic_chunks has data. "
                    "Use cascade-index (semantic_chunks) for better semantic search."
                )
            elif chunks_count > 0 and semantic_count == 0:
                results["recommendations"].append(
                    "semantic_chunks is empty. Run 'codexlens cascade-index' to enable "
                    "binary cascade search."
                )
        # 2. Dual FTS configuration check (exact + fuzzy variants).
        fts_tables = [t for t in tables if t.startswith("files_fts")]
        if len(fts_tables) >= 2:
            results["recommendations"].append(
                f"Found {len(fts_tables)} FTS tables: {fts_tables}. "
                "Dual FTS (exact + fuzzy) is properly configured."
            )
    return results
def analyze_method_contributions(
    index_path: Path,
    queries: List[str],
    limit: int = 20,
) -> Dict[str, Any]:
    """Analyze contribution of each retrieval method.

    Runs FTS-exact, FTS-fuzzy, and pure-vector search independently for
    each query and records, per method: result count, latency, score
    distribution, pairwise result overlap (Jaccard), and which sources
    populate the top of an RRF fusion.

    Args:
        index_path: Path to the SQLite index database.
        queries: Queries to benchmark.
        limit: Maximum results requested per method.

    Returns:
        Dictionary with ``per_query`` details and a ``summary`` of average
        result counts and latencies per method.
    """
    results = {
        "per_query": [],
        "summary": {},
    }
    for query in queries:
        query_result = {
            "query": query,
            "methods": {},
            "fusion_analysis": {},
        }
        # NOTE: the config dicts are descriptive only; the branches below
        # pass explicit flags to engine.search().
        methods = {
            "fts_exact": {"fuzzy": False, "vector": False},
            "fts_fuzzy": {"fuzzy": True, "vector": False},
            "vector": {"fuzzy": False, "vector": True},
        }
        method_results: Dict[str, List[SearchResult]] = {}
        for method_name, config in methods.items():
            engine = None
            try:
                engine = HybridSearchEngine()
                # Monkey-patch a minimal config object to steer backend selection.
                engine._config = type('obj', (object,), {
                    'use_fts_fallback': method_name.startswith("fts"),
                    'embedding_use_gpu': True,
                })()
                start = time.perf_counter()
                if method_name == "fts_exact":
                    engine.weights = DEFAULT_WEIGHTS.copy()
                    results_list = engine.search(
                        index_path, query, limit=limit,
                        enable_fuzzy=False, enable_vector=False, pure_vector=False,
                    )
                elif method_name == "fts_fuzzy":
                    engine.weights = DEFAULT_WEIGHTS.copy()
                    results_list = engine.search(
                        index_path, query, limit=limit,
                        enable_fuzzy=True, enable_vector=False, pure_vector=False,
                    )
                elif method_name == "vector":
                    results_list = engine.search(
                        index_path, query, limit=limit,
                        enable_fuzzy=False, enable_vector=True, pure_vector=True,
                    )
                else:
                    results_list = []
                latency = (time.perf_counter() - start) * 1000
                method_results[method_name] = results_list
                scores = [r.score for r in results_list]
                query_result["methods"][method_name] = {
                    "count": len(results_list),
                    "latency_ms": latency,
                    "avg_score": sum(scores) / len(scores) if scores else 0,
                    "max_score": max(scores) if scores else 0,
                    "min_score": min(scores) if scores else 0,
                    # Separator-agnostic basename so POSIX-style paths are
                    # reported correctly too (not only Windows backslashes).
                    "top_3_files": [
                        r.path.replace("\\", "/").split("/")[-1]
                        for r in results_list[:3]
                    ],
                }
            except Exception as e:
                query_result["methods"][method_name] = {
                    "error": str(e),
                    "count": 0,
                }
            finally:
                # Each engine holds DB/model resources; release per iteration.
                if engine is not None:
                    try:
                        engine.close()
                    except Exception:
                        pass
        # Pairwise overlap between methods (Jaccard over result paths).
        method_paths = {
            name: set(r.path for r in hits)
            for name, hits in method_results.items()
            if hits
        }
        overlaps = {}
        method_names = list(method_paths.keys())
        for i, m1 in enumerate(method_names):
            for m2 in method_names[i + 1:]:
                overlap = len(method_paths[m1] & method_paths[m2])
                union = len(method_paths[m1] | method_paths[m2])
                jaccard = overlap / union if union > 0 else 0
                overlaps[f"{m1}_vs_{m2}"] = {
                    "overlap_count": overlap,
                    "jaccard": jaccard,
                    f"{m1}_unique": len(method_paths[m1] - method_paths[m2]),
                    f"{m2}_unique": len(method_paths[m2] - method_paths[m1]),
                }
        query_result["overlaps"] = overlaps
        # RRF fusion: count which sources populate the fused top-10.
        if len(method_results) >= 2:
            rrf_map = {}
            for name, hits in method_results.items():
                if hits and name in ["fts_exact", "vector"]:
                    # Rename to the key RRF expects for the exact FTS channel.
                    rrf_name = name.replace("fts_exact", "exact")
                    rrf_map[rrf_name] = hits
            if rrf_map:
                fused = reciprocal_rank_fusion(rrf_map, k=60)
                source_contributions = defaultdict(int)
                for r in fused[:10]:
                    source_ranks = r.metadata.get("source_ranks", {})
                    for source in source_ranks:
                        source_contributions[source] += 1
                query_result["fusion_analysis"] = {
                    "total_fused": len(fused),
                    "top_10_source_distribution": dict(source_contributions),
                }
        results["per_query"].append(query_result)
    # Aggregate per-method averages across all queries.
    method_stats = defaultdict(lambda: {"counts": [], "latencies": []})
    for qr in results["per_query"]:
        for method, data in qr["methods"].items():
            if "count" in data:
                method_stats[method]["counts"].append(data["count"])
            if "latency_ms" in data:
                method_stats[method]["latencies"].append(data["latency_ms"])
    results["summary"] = {
        method: {
            "avg_count": sum(s["counts"]) / len(s["counts"]) if s["counts"] else 0,
            "avg_latency_ms": sum(s["latencies"]) / len(s["latencies"]) if s["latencies"] else 0,
        }
        for method, s in method_stats.items()
    }
    return results
def experiment_fts_rerank_fusion(
    index_path: Path,
    queries: List[str],
    limit: int = 10,
    coarse_k: int = 50,
) -> Dict[str, Any]:
    """Experiment: FTS + Rerank fusion vs standard hybrid.

    Compares, per query:
      1. Standard hybrid search (FTS + vector fused via RRF).
      2. Coarse FTS retrieval -> cross-encoder rerank -> RRF fusion with
         a pure-vector result list.

    Args:
        index_path: Path to the SQLite index database.
        queries: Queries to evaluate.
        limit: Final result count for the standard strategy.
        coarse_k: Coarse FTS candidate pool size before reranking.

    Returns:
        Dictionary with ``per_query`` strategy metrics and overlap comparison.
    """
    results = {
        "per_query": [],
        "summary": {},
    }
    # The reranker is optional; when unavailable, strategy 2 degrades to
    # plain FTS ordering truncated to the rerank pool size.
    try:
        from codexlens.semantic.reranker import get_reranker, check_reranker_available
        ok, _ = check_reranker_available("onnx")
        if ok:
            reranker = get_reranker(backend="onnx", use_gpu=True)
        else:
            reranker = None
    except Exception as e:
        print(f"Reranker unavailable: {e}")
        reranker = None
    for query in queries:
        query_result = {
            "query": query,
            "strategies": {},
        }
        # Strategy 1: standard hybrid (FTS + vector RRF).
        try:
            engine = HybridSearchEngine(weights=DEFAULT_WEIGHTS)
            engine._config = type('obj', (object,), {
                'use_fts_fallback': False,
                'embedding_use_gpu': True,
            })()
            start = time.perf_counter()
            standard_results = engine.search(
                index_path, query, limit=limit,
                enable_vector=True,
            )
            standard_latency = (time.perf_counter() - start) * 1000
            query_result["strategies"]["standard_hybrid"] = {
                "count": len(standard_results),
                "latency_ms": standard_latency,
                # Separator-agnostic basename (handles POSIX and Windows paths).
                "top_5": [r.path.replace("\\", "/").split("/")[-1] for r in standard_results[:5]],
                "scores": [r.score for r in standard_results[:5]],
            }
        except Exception as e:
            query_result["strategies"]["standard_hybrid"] = {"error": str(e)}
        # Strategy 2: FTS coarse retrieval -> rerank -> fuse with vector.
        try:
            # Step 1: coarse FTS candidates.
            fts_engine = HybridSearchEngine(weights=DEFAULT_WEIGHTS)
            fts_engine._config = type('obj', (object,), {
                'use_fts_fallback': True,
                'embedding_use_gpu': True,
            })()
            start = time.perf_counter()
            fts_results = fts_engine.search(
                index_path, query, limit=coarse_k,
                enable_fuzzy=True, enable_vector=False,
            )
            fts_latency = (time.perf_counter() - start) * 1000
            # Step 2: cross-encoder rerank of the coarse pool (when available).
            if reranker and fts_results:
                rerank_start = time.perf_counter()
                reranked_fts = cross_encoder_rerank(
                    query, fts_results, reranker, top_k=20,
                )
                rerank_latency = (time.perf_counter() - rerank_start) * 1000
            else:
                reranked_fts = fts_results[:20]
                rerank_latency = 0
            # Step 3: independent pure-vector retrieval.
            vector_engine = HybridSearchEngine()
            vector_results = vector_engine.search(
                index_path, query, limit=20,
                enable_vector=True, pure_vector=True,
            )
            # Step 4: fuse the reranked FTS list with the vector list.
            if reranked_fts and vector_results:
                fusion_map = {
                    "fts_reranked": reranked_fts,
                    "vector": vector_results,
                }
                fused_results = reciprocal_rank_fusion(
                    fusion_map,
                    weights={"fts_reranked": 0.5, "vector": 0.5},
                    k=60,
                )
            else:
                fused_results = reranked_fts or vector_results or []
            # Reported latency covers FTS + rerank only; vector retrieval and
            # fusion time are intentionally excluded from total_latency_ms.
            query_result["strategies"]["fts_rerank_fusion"] = {
                "count": len(fused_results),
                "total_latency_ms": fts_latency + rerank_latency,
                "fts_latency_ms": fts_latency,
                "rerank_latency_ms": rerank_latency,
                "top_5": [r.path.replace("\\", "/").split("/")[-1] for r in fused_results[:5]],
                "scores": [r.score for r in fused_results[:5]],
            }
        except Exception as e:
            query_result["strategies"]["fts_rerank_fusion"] = {"error": str(e)}
        # Compare the two strategies only when both succeeded this iteration
        # (guarantees standard_results / fused_results are bound and fresh).
        if (
            "error" not in query_result["strategies"].get("standard_hybrid", {})
            and "error" not in query_result["strategies"].get("fts_rerank_fusion", {})
        ):
            standard_paths = set(r.path.replace("\\", "/").split("/")[-1] for r in standard_results[:10])
            fts_rerank_paths = set(r.path.replace("\\", "/").split("/")[-1] for r in fused_results[:10])
            overlap = len(standard_paths & fts_rerank_paths)
            query_result["comparison"] = {
                "top_10_overlap": overlap,
                "standard_unique": list(standard_paths - fts_rerank_paths)[:3],
                "fts_rerank_unique": list(fts_rerank_paths - standard_paths)[:3],
            }
        results["per_query"].append(query_result)
    return results
def main():
    """Run all analyses."""
    # Hard-coded project under test; index is resolved via the registry.
    source_path = Path("D:/Claude_dms3/codex-lens/src")
    index_path = find_project_index(source_path)
    print(f"Using index: {index_path}")
    print(f"Index exists: {index_path.exists()}")
    print()

    # Benchmark query set: a mix of code-literal and conceptual queries.
    queries = [
        "binary quantization",
        "hamming distance search",
        "embeddings generation",
        "reranking algorithm",
        "database connection handling",
    ]

    def banner(title, leading_newline):
        # Section separator; first section has no leading blank line.
        print(("\n" + "=" * 60) if leading_newline else "=" * 60)
        print(title)
        print("=" * 60)

    # 1. Storage architecture analysis.
    banner("1. STORAGE ARCHITECTURE ANALYSIS", False)
    storage = analyze_storage_architecture(index_path)
    print("\nTable Overview:")
    for table, info in sorted(storage["tables"].items()):
        if "row_count" in info:
            print(f" {table}: {info['row_count']} rows")
    print("\nConflicts Detected:")
    for conflict in storage["conflicts"]:
        print(f" - {conflict['description']}")
    print("\nRecommendations:")
    for rec in storage["recommendations"]:
        print(f" - {rec}")

    # 2. Per-method contribution analysis.
    banner("2. METHOD CONTRIBUTION ANALYSIS", True)
    contributions = analyze_method_contributions(index_path, queries)
    print("\nPer-Query Results:")
    for qr in contributions["per_query"]:
        print(f"\n Query: '{qr['query']}'")
        for method, data in qr["methods"].items():
            if "error" not in data:
                print(f" {method}: {data['count']} results, {data['latency_ms']:.1f}ms")
                if data.get("top_3_files"):
                    print(f" Top 3: {', '.join(data['top_3_files'])}")
        if qr.get("overlaps"):
            print(" Overlaps:")
            for pair, info in qr["overlaps"].items():
                print(f" {pair}: {info['overlap_count']} common (Jaccard: {info['jaccard']:.2f})")
    print("\nSummary:")
    for method, stats in contributions["summary"].items():
        print(f" {method}: avg {stats['avg_count']:.1f} results, {stats['avg_latency_ms']:.1f}ms")

    # 3. FTS + rerank fusion experiment.
    banner("3. FTS + RERANK FUSION EXPERIMENT", True)
    fusion = experiment_fts_rerank_fusion(index_path, queries)
    print("\nPer-Query Comparison:")
    for qr in fusion["per_query"]:
        print(f"\n Query: '{qr['query']}'")
        for strategy, data in qr["strategies"].items():
            if "error" not in data:
                latency = data.get("total_latency_ms") or data.get("latency_ms", 0)
                print(f" {strategy}: {data['count']} results, {latency:.1f}ms")
                if data.get("top_5"):
                    print(f" Top 5: {', '.join(data['top_5'][:3])}...")
        if qr.get("comparison"):
            comp = qr["comparison"]
            print(f" Top-10 Overlap: {comp['top_10_overlap']}/10")

    # Persist everything next to the script for later inspection.
    output_path = Path(__file__).parent / "results" / "method_contribution_analysis.json"
    output_path.parent.mkdir(exist_ok=True)
    full_results = {
        "storage_analysis": storage,
        "contribution_analysis": contributions,
        "fusion_experiment": fusion,
    }
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(full_results, f, indent=2, default=str)
    print(f"\n\nFull results saved to: {output_path}")


if __name__ == "__main__":
    main()

View File

@@ -1,277 +0,0 @@
{
"timestamp": "2026-01-02 11:48:33",
"summaries": {
"binary": {
"strategy": "binary",
"total_queries": 15,
"successful_queries": 15,
"avg_latency_ms": 1133.4008666667312,
"min_latency_ms": 959.5361000028788,
"max_latency_ms": 1330.8978999993997,
"p50_latency_ms": 1125.8439999946859,
"p95_latency_ms": 1330.0081999987015,
"p99_latency_ms": 1330.71995999926,
"avg_results": 10,
"errors": []
},
"hybrid": {
"strategy": "hybrid",
"total_queries": 15,
"successful_queries": 15,
"avg_latency_ms": 1111.1401133336283,
"min_latency_ms": 857.0021999985329,
"max_latency_ms": 1278.8890000010724,
"p50_latency_ms": 1130.696000000171,
"p95_latency_ms": 1254.2417899981956,
"p99_latency_ms": 1273.959558000497,
"avg_results": 10,
"errors": []
}
},
"details": {
"binary": [
{
"strategy": "binary",
"query": "def search",
"latency_ms": 1044.525999997859,
"num_results": 10,
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py:0",
"error": null
},
{
"strategy": "binary",
"query": "class Engine",
"latency_ms": 1052.5979999947594,
"num_results": 10,
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py:0",
"error": null
},
{
"strategy": "binary",
"query": "import numpy",
"latency_ms": 1217.217100005655,
"num_results": 10,
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\__main__.py:0",
"error": null
},
{
"strategy": "binary",
"query": "async def",
"latency_ms": 1276.9802000038908,
"num_results": 10,
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\api_reranker.py:0",
"error": null
},
{
"strategy": "binary",
"query": "raise ValueError",
"latency_ms": 1005.9053000004496,
"num_results": 10,
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py:0",
"error": null
},
{
"strategy": "binary",
"query": "how to parse json",
"latency_ms": 1330.8978999993997,
"num_results": 10,
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py:0",
"error": null
},
{
"strategy": "binary",
"query": "database connection",
"latency_ms": 1041.6685000018333,
"num_results": 10,
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py:0",
"error": null
},
{
"strategy": "binary",
"query": "error handling",
"latency_ms": 959.5361000028788,
"num_results": 10,
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_004_dual_fts.py:0",
"error": null
},
{
"strategy": "binary",
"query": "authentication logic",
"latency_ms": 1060.9395999999833,
"num_results": 10,
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py:0",
"error": null
},
{
"strategy": "binary",
"query": "file read write",
"latency_ms": 971.8680000005406,
"num_results": 10,
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\watcher\\incremental_indexer.py:0",
"error": null
},
{
"strategy": "binary",
"query": "embedding vector",
"latency_ms": 1135.879900000873,
"num_results": 10,
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\embedder.py:0",
"error": null
},
{
"strategy": "binary",
"query": "cosine similarity",
"latency_ms": 1188.1732000038028,
"num_results": 10,
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py:0",
"error": null
},
{
"strategy": "binary",
"query": "binary quantization",
"latency_ms": 1259.3522999959532,
"num_results": 10,
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py:0",
"error": null
},
{
"strategy": "binary",
"query": "hamming distance",
"latency_ms": 1329.6268999984022,
"num_results": 10,
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py:0",
"error": null
},
{
"strategy": "binary",
"query": "reranking",
"latency_ms": 1125.8439999946859,
"num_results": 10,
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py:0",
"error": null
}
],
"hybrid": [
{
"strategy": "hybrid",
"query": "def search",
"latency_ms": 1117.0937999995658,
"num_results": 10,
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py:0",
"error": null
},
{
"strategy": "hybrid",
"query": "class Engine",
"latency_ms": 1039.3984000038472,
"num_results": 10,
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py:0",
"error": null
},
{
"strategy": "hybrid",
"query": "import numpy",
"latency_ms": 1144.7916999968584,
"num_results": 10,
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\__main__.py:0",
"error": null
},
{
"strategy": "hybrid",
"query": "async def",
"latency_ms": 857.0021999985329,
"num_results": 10,
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\api_reranker.py:0",
"error": null
},
{
"strategy": "hybrid",
"query": "raise ValueError",
"latency_ms": 957.5578000003588,
"num_results": 10,
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py:0",
"error": null
},
{
"strategy": "hybrid",
"query": "how to parse json",
"latency_ms": 1216.5708000029554,
"num_results": 10,
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py:0",
"error": null
},
{
"strategy": "hybrid",
"query": "database connection",
"latency_ms": 1154.8929000055068,
"num_results": 10,
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py:0",
"error": null
},
{
"strategy": "hybrid",
"query": "error handling",
"latency_ms": 1130.696000000171,
"num_results": 10,
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_004_dual_fts.py:0",
"error": null
},
{
"strategy": "hybrid",
"query": "authentication logic",
"latency_ms": 1112.8943000003346,
"num_results": 10,
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py:0",
"error": null
},
{
"strategy": "hybrid",
"query": "file read write",
"latency_ms": 1172.5986000019475,
"num_results": 10,
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\watcher\\incremental_indexer.py:0",
"error": null
},
{
"strategy": "hybrid",
"query": "embedding vector",
"latency_ms": 1278.8890000010724,
"num_results": 10,
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\embedder.py:0",
"error": null
},
{
"strategy": "hybrid",
"query": "cosine similarity",
"latency_ms": 1024.2393000007723,
"num_results": 10,
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py:0",
"error": null
},
{
"strategy": "hybrid",
"query": "binary quantization",
"latency_ms": 1243.6786999969627,
"num_results": 10,
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py:0",
"error": null
},
{
"strategy": "hybrid",
"query": "hamming distance",
"latency_ms": 1081.3100999948801,
"num_results": 10,
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py:0",
"error": null
},
{
"strategy": "hybrid",
"query": "reranking",
"latency_ms": 1135.4881000006571,
"num_results": 10,
"top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py:0",
"error": null
}
]
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,526 +0,0 @@
{
"timestamp": "2026-03-14 23:16:55",
"source": "D:\\Claude_dms3",
"queries_file": "D:\\Claude_dms3\\codex-lens\\benchmarks\\accuracy_queries_ccw_smart_search.jsonl",
"query_count": 4,
"k": 10,
"coarse_k": 100,
"local_only": true,
"strategies": {
"dense_rerank": {
"query_count": 4,
"hit_at_k": 0.0,
"mrr_at_k": 0.0,
"avg_recall_at_k": 0.0,
"avg_latency_ms": 20171.940174996853,
"p50_latency_ms": 14222.247749984264,
"p95_latency_ms": 35222.31535999476,
"errors": 0,
"strategy": "dense_rerank",
"stage2_mode": null
},
"staged:precomputed": {
"query_count": 4,
"hit_at_k": 0.0,
"mrr_at_k": 0.0,
"avg_recall_at_k": 0.0,
"avg_latency_ms": 13679.793299987912,
"p50_latency_ms": 12918.63379997015,
"p95_latency_ms": 16434.964765003322,
"errors": 0,
"strategy": "staged",
"stage2_mode": "precomputed"
},
"staged:realtime": {
"query_count": 4,
"hit_at_k": 0.0,
"mrr_at_k": 0.0,
"avg_recall_at_k": 0.0,
"avg_latency_ms": 13885.101849973202,
"p50_latency_ms": 13826.323699980974,
"p95_latency_ms": 14867.712269958853,
"errors": 0,
"strategy": "staged",
"stage2_mode": "realtime"
},
"staged:static_global_graph": {
"query_count": 4,
"hit_at_k": 0.0,
"mrr_at_k": 0.0,
"avg_recall_at_k": 0.0,
"avg_latency_ms": 13336.124025002122,
"p50_latency_ms": 13415.476950019598,
"p95_latency_ms": 13514.329230004549,
"errors": 0,
"strategy": "staged",
"stage2_mode": "static_global_graph"
}
},
"stage2_mode_matrix": {
"precomputed": {
"query_count": 4,
"hit_at_k": 0.0,
"mrr_at_k": 0.0,
"avg_recall_at_k": 0.0,
"avg_latency_ms": 13679.793299987912,
"p50_latency_ms": 12918.63379997015,
"p95_latency_ms": 16434.964765003322,
"errors": 0,
"strategy": "staged",
"stage2_mode": "precomputed"
},
"realtime": {
"query_count": 4,
"hit_at_k": 0.0,
"mrr_at_k": 0.0,
"avg_recall_at_k": 0.0,
"avg_latency_ms": 13885.101849973202,
"p50_latency_ms": 13826.323699980974,
"p95_latency_ms": 14867.712269958853,
"errors": 0,
"strategy": "staged",
"stage2_mode": "realtime"
},
"static_global_graph": {
"query_count": 4,
"hit_at_k": 0.0,
"mrr_at_k": 0.0,
"avg_recall_at_k": 0.0,
"avg_latency_ms": 13336.124025002122,
"p50_latency_ms": 13415.476950019598,
"p95_latency_ms": 13514.329230004549,
"errors": 0,
"strategy": "staged",
"stage2_mode": "static_global_graph"
}
},
"pairwise_stage2_deltas": [
{
"mode_a": "precomputed",
"mode_b": "realtime",
"hit_at_k_delta": 0.0,
"mrr_at_k_delta": 0.0,
"avg_recall_at_k_delta": 0.0,
"avg_latency_ms_delta": -205.30854998528957
},
{
"mode_a": "precomputed",
"mode_b": "static_global_graph",
"hit_at_k_delta": 0.0,
"mrr_at_k_delta": 0.0,
"avg_recall_at_k_delta": 0.0,
"avg_latency_ms_delta": 343.66927498579025
},
{
"mode_a": "realtime",
"mode_b": "static_global_graph",
"hit_at_k_delta": 0.0,
"mrr_at_k_delta": 0.0,
"avg_recall_at_k_delta": 0.0,
"avg_latency_ms_delta": 548.9778249710798
}
],
"config": {
"embedding_backend": "fastembed",
"embedding_model": "code",
"embedding_use_gpu": false,
"reranker_backend": "onnx",
"reranker_model": "cross-encoder/ms-marco-MiniLM-L-6-v2",
"enable_staged_rerank": true,
"enable_cross_encoder_rerank": true
},
"evaluations": [
{
"query": "executeHybridMode dense_rerank semantic smart_search",
"intent": "ccw-semantic-routing",
"notes": "CCW semantic mode delegates to CodexLens dense_rerank.",
"relevant_paths": [
"D:\\Claude_dms3\\ccw\\src\\tools\\smart-search.ts"
],
"runs": {
"dense_rerank": {
"strategy_key": "dense_rerank",
"strategy": "dense_rerank",
"stage2_mode": null,
"latency_ms": 38829.27079999447,
"topk_paths": [
"d:\\claude_dms3\\ccw\\src\\core\\routes\\issue-routes.ts",
"d:\\claude_dms3\\ccw\\src\\tools\\session-manager.ts",
"d:\\claude_dms3\\ccw\\src\\types\\queue-types.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\nativesessionpanel.tsx",
"d:\\claude_dms3\\ccw\\src\\core\\history-importer.ts",
"d:\\claude_dms3\\ccw\\src\\core\\memory-extraction-pipeline.ts",
"d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\skills-page.spec.ts",
"d:\\claude_dms3\\ccw\\dist\\tools\\discover-design-files.js",
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\api-settings\\clisettingsmodal.tsx",
"d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\api-settings.spec.ts"
],
"first_hit_rank": null,
"hit_at_k": false,
"recall_at_k": 0.0,
"error": null
},
"staged:precomputed": {
"strategy_key": "staged:precomputed",
"strategy": "staged",
"stage2_mode": "precomputed",
"latency_ms": 16915.833400011063,
"topk_paths": [
"d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts",
"d:\\claude_dms3\\ccw\\src\\commands\\memory.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\useissues.test.tsx",
"d:\\claude_dms3\\ccw\\src\\core\\routes\\cli-sessions-routes.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\filepreview.tsx",
"d:\\claude_dms3\\ccw\\src\\core\\hooks\\hook-templates.ts",
"d:\\claude_dms3\\ccw\\src\\utils\\file-reader.ts",
"d:\\claude_dms3\\ccw\\dist\\core\\routes\\cli-sessions-routes.js",
"d:\\claude_dms3\\ccw\\src\\core\\history-importer.ts"
],
"first_hit_rank": null,
"hit_at_k": false,
"recall_at_k": 0.0,
"error": null
},
"staged:realtime": {
"strategy_key": "staged:realtime",
"strategy": "staged",
"stage2_mode": "realtime",
"latency_ms": 13961.2567999959,
"topk_paths": [
"d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts",
"d:\\claude_dms3\\ccw\\src\\commands\\memory.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\useissues.test.tsx",
"d:\\claude_dms3\\ccw\\src\\core\\routes\\cli-sessions-routes.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\filepreview.tsx",
"d:\\claude_dms3\\ccw\\src\\core\\hooks\\hook-templates.ts",
"d:\\claude_dms3\\ccw\\src\\utils\\file-reader.ts",
"d:\\claude_dms3\\ccw\\dist\\core\\routes\\cli-sessions-routes.js",
"d:\\claude_dms3\\ccw\\src\\core\\history-importer.ts"
],
"first_hit_rank": null,
"hit_at_k": false,
"recall_at_k": 0.0,
"error": null
},
"staged:static_global_graph": {
"strategy_key": "staged:static_global_graph",
"strategy": "staged",
"stage2_mode": "static_global_graph",
"latency_ms": 12986.330999970436,
"topk_paths": [
"d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts",
"d:\\claude_dms3\\ccw\\src\\commands\\memory.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\useissues.test.tsx",
"d:\\claude_dms3\\ccw\\src\\core\\routes\\cli-sessions-routes.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\filepreview.tsx",
"d:\\claude_dms3\\ccw\\src\\core\\hooks\\hook-templates.ts",
"d:\\claude_dms3\\ccw\\src\\utils\\file-reader.ts",
"d:\\claude_dms3\\ccw\\dist\\core\\routes\\cli-sessions-routes.js",
"d:\\claude_dms3\\ccw\\src\\core\\history-importer.ts"
],
"first_hit_rank": null,
"hit_at_k": false,
"recall_at_k": 0.0,
"error": null
}
}
},
{
"query": "parse CodexLens JSON output strip ANSI smart_search",
"intent": "ccw-json-fallback",
"notes": "Covers JSON/plain-text fallback handling for CodexLens output.",
"relevant_paths": [
"D:\\Claude_dms3\\ccw\\src\\tools\\smart-search.ts"
],
"runs": {
"dense_rerank": {
"strategy_key": "dense_rerank",
"strategy": "dense_rerank",
"stage2_mode": null,
"latency_ms": 14782.901199996471,
"topk_paths": [
"d:\\claude_dms3\\ccw\\src\\tools\\codex-lens-lsp.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\issue\\queue\\queueexecuteinsession.tsx",
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\terminal-dashboard\\queuepanel.tsx",
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\usewebsocket.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\useflows.ts",
"d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\api-error-monitoring.spec.ts",
"d:\\claude_dms3\\ccw\\tests\\native-session-discovery.test.ts",
"d:\\claude_dms3\\ccw\\src\\core\\services\\checkpoint-service.ts",
"d:\\claude_dms3\\ccw\\tests\\integration\\system-routes.test.ts",
"d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts"
],
"first_hit_rank": null,
"hit_at_k": false,
"recall_at_k": 0.0,
"error": null
},
"staged:precomputed": {
"strategy_key": "staged:precomputed",
"strategy": "staged",
"stage2_mode": "precomputed",
"latency_ms": 13710.042499959469,
"topk_paths": [
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\userealtimeupdates.ts",
"d:\\claude_dms3\\ccw\\src\\core\\routes\\cli-routes.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\stores\\queueexecutionstore.ts",
"d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\themeshare.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\clistreampanel.tsx",
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\terminal-panel\\queueexecutionlistview.tsx",
"d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\api-settings.spec.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\test\\i18n.tsx",
"d:\\claude_dms3\\ccw\\dist\\core\\history-importer.js"
],
"first_hit_rank": null,
"hit_at_k": false,
"recall_at_k": 0.0,
"error": null
},
"staged:realtime": {
"strategy_key": "staged:realtime",
"strategy": "staged",
"stage2_mode": "realtime",
"latency_ms": 15027.674999952316,
"topk_paths": [
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\userealtimeupdates.ts",
"d:\\claude_dms3\\ccw\\src\\core\\routes\\cli-routes.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\stores\\queueexecutionstore.ts",
"d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\themeshare.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\clistreampanel.tsx",
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\terminal-panel\\queueexecutionlistview.tsx",
"d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\api-settings.spec.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\test\\i18n.tsx",
"d:\\claude_dms3\\ccw\\dist\\core\\history-importer.js"
],
"first_hit_rank": null,
"hit_at_k": false,
"recall_at_k": 0.0,
"error": null
},
"staged:static_global_graph": {
"strategy_key": "staged:static_global_graph",
"strategy": "staged",
"stage2_mode": "static_global_graph",
"latency_ms": 13389.622500002384,
"topk_paths": [
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\userealtimeupdates.ts",
"d:\\claude_dms3\\ccw\\src\\core\\routes\\cli-routes.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\stores\\queueexecutionstore.ts",
"d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\themeshare.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\clistreampanel.tsx",
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\terminal-panel\\queueexecutionlistview.tsx",
"d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\api-settings.spec.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\test\\i18n.tsx",
"d:\\claude_dms3\\ccw\\dist\\core\\history-importer.js"
],
"first_hit_rank": null,
"hit_at_k": false,
"recall_at_k": 0.0,
"error": null
}
}
},
{
"query": "smart_search init embed search action schema",
"intent": "ccw-action-schema",
"notes": "Find the Zod schema that defines init/embed/search actions.",
"relevant_paths": [
"D:\\Claude_dms3\\ccw\\src\\tools\\smart-search.ts"
],
"runs": {
"dense_rerank": {
"strategy_key": "dense_rerank",
"strategy": "dense_rerank",
"stage2_mode": null,
"latency_ms": 13661.594299972057,
"topk_paths": [
"d:\\claude_dms3\\ccw\\src\\tools\\ask-question.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\a2ui\\a2uipopupcard.tsx",
"d:\\claude_dms3\\ccw\\src\\core\\routes\\discovery-routes.ts",
"d:\\claude_dms3\\ccw\\src\\core\\a2ui\\a2uiwebsockethandler.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\useissues.test.tsx",
"d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\discovery.spec.ts",
"d:\\claude_dms3\\ccw\\src\\tools\\__tests__\\ask-question.test.ts",
"d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\api-settings.spec.ts",
"d:\\claude_dms3\\ccw\\dist\\core\\a2ui\\a2uiwebsockethandler.js",
"d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\dashboard.spec.ts"
],
"first_hit_rank": null,
"hit_at_k": false,
"recall_at_k": 0.0,
"error": null
},
"staged:precomputed": {
"strategy_key": "staged:precomputed",
"strategy": "staged",
"stage2_mode": "precomputed",
"latency_ms": 12127.225099980831,
"topk_paths": [
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts",
"d:\\claude_dms3\\ccw\\src\\core\\lite-scanner-complete.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\themeselector.tsx",
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\team\\teamheader.tsx",
"d:\\claude_dms3\\ccw\\src\\tools\\ask-question.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\a2ui\\a2uipopupcard.tsx",
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\issue\\discovery\\findinglist.tsx",
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\api-settings\\clisettingsmodal.tsx",
"d:\\claude_dms3\\ccw\\src\\core\\routes\\discovery-routes.ts",
"d:\\claude_dms3\\ccw\\src\\core\\a2ui\\a2uiwebsockethandler.ts"
],
"first_hit_rank": null,
"hit_at_k": false,
"recall_at_k": 0.0,
"error": null
},
"staged:realtime": {
"strategy_key": "staged:realtime",
"strategy": "staged",
"stage2_mode": "realtime",
"latency_ms": 12860.084999978542,
"topk_paths": [
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts",
"d:\\claude_dms3\\ccw\\src\\core\\lite-scanner-complete.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\themeselector.tsx",
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\team\\teamheader.tsx",
"d:\\claude_dms3\\ccw\\src\\tools\\ask-question.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\a2ui\\a2uipopupcard.tsx",
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\issue\\discovery\\findinglist.tsx",
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\api-settings\\clisettingsmodal.tsx",
"d:\\claude_dms3\\ccw\\src\\core\\routes\\discovery-routes.ts",
"d:\\claude_dms3\\ccw\\src\\core\\a2ui\\a2uiwebsockethandler.ts"
],
"first_hit_rank": null,
"hit_at_k": false,
"recall_at_k": 0.0,
"error": null
},
"staged:static_global_graph": {
"strategy_key": "staged:static_global_graph",
"strategy": "staged",
"stage2_mode": "static_global_graph",
"latency_ms": 13441.331400036812,
"topk_paths": [
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts",
"d:\\claude_dms3\\ccw\\src\\core\\lite-scanner-complete.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\themeselector.tsx",
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\team\\teamheader.tsx",
"d:\\claude_dms3\\ccw\\src\\tools\\ask-question.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\a2ui\\a2uipopupcard.tsx",
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\issue\\discovery\\findinglist.tsx",
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\api-settings\\clisettingsmodal.tsx",
"d:\\claude_dms3\\ccw\\src\\core\\routes\\discovery-routes.ts",
"d:\\claude_dms3\\ccw\\src\\core\\a2ui\\a2uiwebsockethandler.ts"
],
"first_hit_rank": null,
"hit_at_k": false,
"recall_at_k": 0.0,
"error": null
}
}
},
{
"query": "auto init missing job dedupe smart_search",
"intent": "ccw-auto-init",
"notes": "Targets background init/embed warmup and dedupe state.",
"relevant_paths": [
"D:\\Claude_dms3\\ccw\\src\\tools\\smart-search.ts"
],
"runs": {
"dense_rerank": {
"strategy_key": "dense_rerank",
"strategy": "dense_rerank",
"stage2_mode": null,
"latency_ms": 13413.994400024414,
"topk_paths": [
"d:\\claude_dms3\\ccw\\frontend\\src\\pages\\memorypage.tsx",
"d:\\claude_dms3\\ccw\\src\\core\\routes\\memory-routes.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\usememory.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\batchoperationtoolbar.tsx",
"d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\memory.spec.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\useprompthistory.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\stores\\flowstore.ts",
"d:\\claude_dms3\\ccw\\src\\services\\deepwiki-service.ts",
"d:\\claude_dms3\\ccw\\src\\core\\routes\\claude-routes.ts"
],
"first_hit_rank": null,
"hit_at_k": false,
"recall_at_k": 0.0,
"error": null
},
"staged:precomputed": {
"strategy_key": "staged:precomputed",
"strategy": "staged",
"stage2_mode": "precomputed",
"latency_ms": 11966.072200000286,
"topk_paths": [
"d:\\claude_dms3\\ccw\\src\\commands\\memory.ts",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\handlers.py",
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\ui\\commandcombobox.tsx",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\global_graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\definition.py",
"d:\\claude_dms3\\ccw\\frontend\\src\\orchestrator\\orchestrationplanbuilder.ts",
"d:\\claude_dms3\\codex-lens\\build\\lib\\codexlens\\lsp\\handlers.py",
"d:\\claude_dms3\\codex-lens\\build\\lib\\codexlens\\search\\global_graph_expander.py",
"d:\\claude_dms3\\codex-lens\\build\\lib\\codexlens\\api\\definition.py",
"d:\\claude_dms3\\ccw\\frontend\\src\\pages\\memorypage.tsx"
],
"first_hit_rank": null,
"hit_at_k": false,
"recall_at_k": 0.0,
"error": null
},
"staged:realtime": {
"strategy_key": "staged:realtime",
"strategy": "staged",
"stage2_mode": "realtime",
"latency_ms": 13691.39059996605,
"topk_paths": [
"d:\\claude_dms3\\ccw\\src\\commands\\memory.ts",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\handlers.py",
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\ui\\commandcombobox.tsx",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\global_graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\definition.py",
"d:\\claude_dms3\\ccw\\frontend\\src\\orchestrator\\orchestrationplanbuilder.ts",
"d:\\claude_dms3\\codex-lens\\build\\lib\\codexlens\\lsp\\handlers.py",
"d:\\claude_dms3\\codex-lens\\build\\lib\\codexlens\\search\\global_graph_expander.py",
"d:\\claude_dms3\\codex-lens\\build\\lib\\codexlens\\api\\definition.py",
"d:\\claude_dms3\\ccw\\frontend\\src\\pages\\memorypage.tsx"
],
"first_hit_rank": null,
"hit_at_k": false,
"recall_at_k": 0.0,
"error": null
},
"staged:static_global_graph": {
"strategy_key": "staged:static_global_graph",
"strategy": "staged",
"stage2_mode": "static_global_graph",
"latency_ms": 13527.211199998856,
"topk_paths": [
"d:\\claude_dms3\\ccw\\src\\commands\\memory.ts",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\handlers.py",
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\ui\\commandcombobox.tsx",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\global_graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\definition.py",
"d:\\claude_dms3\\ccw\\frontend\\src\\orchestrator\\orchestrationplanbuilder.ts",
"d:\\claude_dms3\\codex-lens\\build\\lib\\codexlens\\lsp\\handlers.py",
"d:\\claude_dms3\\codex-lens\\build\\lib\\codexlens\\search\\global_graph_expander.py",
"d:\\claude_dms3\\codex-lens\\build\\lib\\codexlens\\api\\definition.py",
"d:\\claude_dms3\\ccw\\frontend\\src\\pages\\memorypage.tsx"
],
"first_hit_rank": null,
"hit_at_k": false,
"recall_at_k": 0.0,
"error": null
}
}
}
]
}

View File

@@ -1,415 +0,0 @@
{
"timestamp": "2026-03-15 00:19:16",
"source": "D:\\Claude_dms3",
"queries_file": "D:\\Claude_dms3\\codex-lens\\benchmarks\\accuracy_queries_ccw_smart_search.jsonl",
"query_count": 1,
"k": 10,
"coarse_k": 100,
"local_only": true,
"strategies": {
"auto": {
"query_count": 1,
"hit_at_k": 1.0,
"mrr_at_k": 1.0,
"avg_recall_at_k": 1.0,
"avg_latency_ms": 1377.3565999865532,
"p50_latency_ms": 1377.3565999865532,
"p95_latency_ms": 1377.3565999865532,
"avg_generated_artifact_count": 0.0,
"avg_test_file_count": 0.0,
"runs_with_generated_artifacts": 0,
"runs_with_test_files": 0,
"effective_methods": {
"fts": 1
},
"errors": 0,
"strategy": "auto",
"stage2_mode": null
},
"fts": {
"query_count": 1,
"hit_at_k": 1.0,
"mrr_at_k": 1.0,
"avg_recall_at_k": 1.0,
"avg_latency_ms": 1460.0819000601768,
"p50_latency_ms": 1460.0819000601768,
"p95_latency_ms": 1460.0819000601768,
"avg_generated_artifact_count": 0.0,
"avg_test_file_count": 0.0,
"runs_with_generated_artifacts": 0,
"runs_with_test_files": 0,
"effective_methods": {
"fts": 1
},
"errors": 0,
"strategy": "fts",
"stage2_mode": null
},
"hybrid": {
"query_count": 1,
"hit_at_k": 0.0,
"mrr_at_k": 0.0,
"avg_recall_at_k": 0.0,
"avg_latency_ms": 45991.74140000343,
"p50_latency_ms": 45991.74140000343,
"p95_latency_ms": 45991.74140000343,
"avg_generated_artifact_count": 0.0,
"avg_test_file_count": 0.0,
"runs_with_generated_artifacts": 0,
"runs_with_test_files": 0,
"effective_methods": {
"hybrid": 1
},
"errors": 0,
"strategy": "hybrid",
"stage2_mode": null
},
"dense_rerank": {
"query_count": 1,
"hit_at_k": 0.0,
"mrr_at_k": 0.0,
"avg_recall_at_k": 0.0,
"avg_latency_ms": 22739.62610000372,
"p50_latency_ms": 22739.62610000372,
"p95_latency_ms": 22739.62610000372,
"avg_generated_artifact_count": 1.0,
"avg_test_file_count": 2.0,
"runs_with_generated_artifacts": 1,
"runs_with_test_files": 1,
"effective_methods": {
"dense_rerank": 1
},
"errors": 0,
"strategy": "dense_rerank",
"stage2_mode": null
},
"staged:precomputed": {
"query_count": 1,
"hit_at_k": 0.0,
"mrr_at_k": 0.0,
"avg_recall_at_k": 0.0,
"avg_latency_ms": 14900.017599999905,
"p50_latency_ms": 14900.017599999905,
"p95_latency_ms": 14900.017599999905,
"avg_generated_artifact_count": 1.0,
"avg_test_file_count": 0.0,
"runs_with_generated_artifacts": 1,
"runs_with_test_files": 0,
"effective_methods": {
"staged": 1
},
"errors": 0,
"strategy": "staged",
"stage2_mode": "precomputed"
},
"staged:realtime": {
"query_count": 1,
"hit_at_k": 0.0,
"mrr_at_k": 0.0,
"avg_recall_at_k": 0.0,
"avg_latency_ms": 14104.314599990845,
"p50_latency_ms": 14104.314599990845,
"p95_latency_ms": 14104.314599990845,
"avg_generated_artifact_count": 1.0,
"avg_test_file_count": 0.0,
"runs_with_generated_artifacts": 1,
"runs_with_test_files": 0,
"effective_methods": {
"staged": 1
},
"errors": 0,
"strategy": "staged",
"stage2_mode": "realtime"
},
"staged:static_global_graph": {
"query_count": 1,
"hit_at_k": 0.0,
"mrr_at_k": 0.0,
"avg_recall_at_k": 0.0,
"avg_latency_ms": 11906.852500021458,
"p50_latency_ms": 11906.852500021458,
"p95_latency_ms": 11906.852500021458,
"avg_generated_artifact_count": 1.0,
"avg_test_file_count": 0.0,
"runs_with_generated_artifacts": 1,
"runs_with_test_files": 0,
"effective_methods": {
"staged": 1
},
"errors": 0,
"strategy": "staged",
"stage2_mode": "static_global_graph"
}
},
"stage2_mode_matrix": {
"precomputed": {
"query_count": 1,
"hit_at_k": 0.0,
"mrr_at_k": 0.0,
"avg_recall_at_k": 0.0,
"avg_latency_ms": 14900.017599999905,
"p50_latency_ms": 14900.017599999905,
"p95_latency_ms": 14900.017599999905,
"avg_generated_artifact_count": 1.0,
"avg_test_file_count": 0.0,
"runs_with_generated_artifacts": 1,
"runs_with_test_files": 0,
"effective_methods": {
"staged": 1
},
"errors": 0,
"strategy": "staged",
"stage2_mode": "precomputed"
},
"realtime": {
"query_count": 1,
"hit_at_k": 0.0,
"mrr_at_k": 0.0,
"avg_recall_at_k": 0.0,
"avg_latency_ms": 14104.314599990845,
"p50_latency_ms": 14104.314599990845,
"p95_latency_ms": 14104.314599990845,
"avg_generated_artifact_count": 1.0,
"avg_test_file_count": 0.0,
"runs_with_generated_artifacts": 1,
"runs_with_test_files": 0,
"effective_methods": {
"staged": 1
},
"errors": 0,
"strategy": "staged",
"stage2_mode": "realtime"
},
"static_global_graph": {
"query_count": 1,
"hit_at_k": 0.0,
"mrr_at_k": 0.0,
"avg_recall_at_k": 0.0,
"avg_latency_ms": 11906.852500021458,
"p50_latency_ms": 11906.852500021458,
"p95_latency_ms": 11906.852500021458,
"avg_generated_artifact_count": 1.0,
"avg_test_file_count": 0.0,
"runs_with_generated_artifacts": 1,
"runs_with_test_files": 0,
"effective_methods": {
"staged": 1
},
"errors": 0,
"strategy": "staged",
"stage2_mode": "static_global_graph"
}
},
"pairwise_stage2_deltas": [
{
"mode_a": "precomputed",
"mode_b": "realtime",
"hit_at_k_delta": 0.0,
"mrr_at_k_delta": 0.0,
"avg_recall_at_k_delta": 0.0,
"avg_latency_ms_delta": 795.7030000090599
},
{
"mode_a": "precomputed",
"mode_b": "static_global_graph",
"hit_at_k_delta": 0.0,
"mrr_at_k_delta": 0.0,
"avg_recall_at_k_delta": 0.0,
"avg_latency_ms_delta": 2993.165099978447
},
{
"mode_a": "realtime",
"mode_b": "static_global_graph",
"hit_at_k_delta": 0.0,
"mrr_at_k_delta": 0.0,
"avg_recall_at_k_delta": 0.0,
"avg_latency_ms_delta": 2197.462099969387
}
],
"config": {
"embedding_backend": "fastembed",
"embedding_model": "code",
"embedding_use_gpu": false,
"reranker_backend": "onnx",
"reranker_model": "cross-encoder/ms-marco-MiniLM-L-6-v2",
"reranker_use_gpu": false,
"enable_staged_rerank": true,
"enable_cross_encoder_rerank": true
},
"evaluations": [
{
"query": "executeHybridMode dense_rerank semantic smart_search",
"intent": "ccw-semantic-routing",
"notes": "CCW semantic mode delegates to CodexLens dense_rerank.",
"relevant_paths": [
"D:\\Claude_dms3\\ccw\\src\\tools\\smart-search.ts"
],
"runs": {
"auto": {
"strategy_key": "auto",
"strategy": "auto",
"stage2_mode": null,
"effective_method": "fts",
"execution_method": "fts",
"latency_ms": 1377.3565999865532,
"topk_paths": [
"d:\\claude_dms3\\ccw\\src\\tools\\smart-search.ts"
],
"first_hit_rank": 1,
"hit_at_k": true,
"recall_at_k": 1.0,
"generated_artifact_count": 0,
"test_file_count": 0,
"error": null
},
"fts": {
"strategy_key": "fts",
"strategy": "fts",
"stage2_mode": null,
"effective_method": "fts",
"execution_method": "fts",
"latency_ms": 1460.0819000601768,
"topk_paths": [
"d:\\claude_dms3\\ccw\\src\\tools\\smart-search.ts"
],
"first_hit_rank": 1,
"hit_at_k": true,
"recall_at_k": 1.0,
"generated_artifact_count": 0,
"test_file_count": 0,
"error": null
},
"hybrid": {
"strategy_key": "hybrid",
"strategy": "hybrid",
"stage2_mode": null,
"effective_method": "hybrid",
"execution_method": "hybrid",
"latency_ms": 45991.74140000343,
"topk_paths": [
"d:\\claude_dms3\\ccw\\src\\config\\litellm-api-config-manager.ts",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\api_reranker.py",
"d:\\claude_dms3\\ccw\\src\\commands\\core-memory.ts",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\scripts\\generate_embeddings.py",
"d:\\claude_dms3\\ccw\\src\\core\\routes\\notification-routes.ts",
"d:\\claude_dms3\\ccw\\src\\tools\\team-msg.ts",
"d:\\claude_dms3\\ccw\\src\\types\\remote-notification.ts",
"d:\\claude_dms3\\ccw\\src\\core\\memory-store.ts",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
],
"first_hit_rank": null,
"hit_at_k": false,
"recall_at_k": 0.0,
"generated_artifact_count": 0,
"test_file_count": 0,
"error": null
},
"dense_rerank": {
"strategy_key": "dense_rerank",
"strategy": "dense_rerank",
"stage2_mode": null,
"effective_method": "dense_rerank",
"execution_method": "cascade",
"latency_ms": 22739.62610000372,
"topk_paths": [
"d:\\claude_dms3\\ccw\\src\\core\\routes\\issue-routes.ts",
"d:\\claude_dms3\\ccw\\src\\tools\\session-manager.ts",
"d:\\claude_dms3\\ccw\\src\\types\\queue-types.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\nativesessionpanel.tsx",
"d:\\claude_dms3\\ccw\\src\\core\\history-importer.ts",
"d:\\claude_dms3\\ccw\\src\\core\\memory-extraction-pipeline.ts",
"d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\skills-page.spec.ts",
"d:\\claude_dms3\\ccw\\dist\\tools\\discover-design-files.js",
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\api-settings\\clisettingsmodal.tsx",
"d:\\claude_dms3\\ccw\\frontend\\tests\\e2e\\api-settings.spec.ts"
],
"first_hit_rank": null,
"hit_at_k": false,
"recall_at_k": 0.0,
"generated_artifact_count": 1,
"test_file_count": 2,
"error": null
},
"staged:precomputed": {
"strategy_key": "staged:precomputed",
"strategy": "staged",
"stage2_mode": "precomputed",
"effective_method": "staged",
"execution_method": "cascade",
"latency_ms": 14900.017599999905,
"topk_paths": [
"d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts",
"d:\\claude_dms3\\ccw\\src\\commands\\memory.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\useissues.test.tsx",
"d:\\claude_dms3\\ccw\\src\\core\\routes\\cli-sessions-routes.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\filepreview.tsx",
"d:\\claude_dms3\\ccw\\src\\core\\hooks\\hook-templates.ts",
"d:\\claude_dms3\\ccw\\src\\utils\\file-reader.ts",
"d:\\claude_dms3\\ccw\\dist\\core\\routes\\cli-sessions-routes.js",
"d:\\claude_dms3\\ccw\\src\\core\\history-importer.ts"
],
"first_hit_rank": null,
"hit_at_k": false,
"recall_at_k": 0.0,
"generated_artifact_count": 1,
"test_file_count": 0,
"error": null
},
"staged:realtime": {
"strategy_key": "staged:realtime",
"strategy": "staged",
"stage2_mode": "realtime",
"effective_method": "staged",
"execution_method": "cascade",
"latency_ms": 14104.314599990845,
"topk_paths": [
"d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts",
"d:\\claude_dms3\\ccw\\src\\commands\\memory.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\useissues.test.tsx",
"d:\\claude_dms3\\ccw\\src\\core\\routes\\cli-sessions-routes.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\filepreview.tsx",
"d:\\claude_dms3\\ccw\\src\\core\\hooks\\hook-templates.ts",
"d:\\claude_dms3\\ccw\\src\\utils\\file-reader.ts",
"d:\\claude_dms3\\ccw\\dist\\core\\routes\\cli-sessions-routes.js",
"d:\\claude_dms3\\ccw\\src\\core\\history-importer.ts"
],
"first_hit_rank": null,
"hit_at_k": false,
"recall_at_k": 0.0,
"generated_artifact_count": 1,
"test_file_count": 0,
"error": null
},
"staged:static_global_graph": {
"strategy_key": "staged:static_global_graph",
"strategy": "staged",
"stage2_mode": "static_global_graph",
"effective_method": "staged",
"execution_method": "cascade",
"latency_ms": 11906.852500021458,
"topk_paths": [
"d:\\claude_dms3\\ccw\\src\\tools\\native-session-discovery.ts",
"d:\\claude_dms3\\ccw\\src\\commands\\memory.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\hooks\\useissues.test.tsx",
"d:\\claude_dms3\\ccw\\src\\core\\routes\\cli-sessions-routes.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\lib\\api.ts",
"d:\\claude_dms3\\ccw\\frontend\\src\\components\\shared\\filepreview.tsx",
"d:\\claude_dms3\\ccw\\src\\core\\hooks\\hook-templates.ts",
"d:\\claude_dms3\\ccw\\src\\utils\\file-reader.ts",
"d:\\claude_dms3\\ccw\\dist\\core\\routes\\cli-sessions-routes.js",
"d:\\claude_dms3\\ccw\\src\\core\\history-importer.ts"
],
"first_hit_rank": null,
"hit_at_k": false,
"recall_at_k": 0.0,
"generated_artifact_count": 1,
"test_file_count": 0,
"error": null
}
}
}
]
}

View File

@@ -1,453 +0,0 @@
{
"summary": {
"timestamp": "2026-02-09 11:08:47",
"source": "src",
"k": 10,
"coarse_k": 100,
"query_count": 7,
"avg_jaccard_topk": 0.41421235160730957,
"avg_rbo_topk": 0.22899068093857142,
"staged": {
"success": 7,
"avg_latency_ms": 32009.68328570468
},
"dense_rerank": {
"success": 7,
"avg_latency_ms": 2783.3305999977247
}
},
"comparisons": [
{
"query": "class Config",
"staged": {
"strategy": "staged",
"query": "class Config",
"latency_ms": 40875.45489999652,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\file_watcher.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\__init__.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 10633.91399383545,
"stage2_expand_ms": 12487.980365753174,
"stage3_cluster_ms": 10781.587362289429,
"stage4_rerank_ms": 6914.837837219238
},
"stage_counts": {
"stage1_candidates": 100,
"stage2_expanded": 149,
"stage3_clustered": 20,
"stage4_reranked": 20
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "class Config",
"latency_ms": 3111.874899983406,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.1111111111111111,
"rbo_topk": 0.06741929885142856,
"staged_unique_files_topk": 10,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 8,
"dense_unique_dirs_topk": 4
},
{
"query": "def search",
"staged": {
"strategy": "staged",
"query": "def search",
"latency_ms": 38541.18510001898,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 548.8920211791992,
"stage2_expand_ms": 27176.724433898926,
"stage3_cluster_ms": 8352.917671203613,
"stage4_rerank_ms": 2392.6541805267334
},
"stage_counts": {
"stage1_candidates": 100,
"stage2_expanded": 101,
"stage3_clustered": 20,
"stage4_reranked": 20
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "def search",
"latency_ms": 2652.75,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.26666666666666666,
"rbo_topk": 0.2983708721671428,
"staged_unique_files_topk": 9,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 4,
"dense_unique_dirs_topk": 4
},
{
"query": "LspBridge",
"staged": {
"strategy": "staged",
"query": "LspBridge",
"latency_ms": 26319.983999997377,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\merkle_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 514.4834518432617,
"stage2_expand_ms": 14329.241514205933,
"stage3_cluster_ms": 9249.040842056274,
"stage4_rerank_ms": 2159.9059104919434
},
"stage_counts": {
"stage1_candidates": 100,
"stage2_expanded": 100,
"stage3_clustered": 20,
"stage4_reranked": 20
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "LspBridge",
"latency_ms": 2666.9745999872684,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.6666666666666666,
"rbo_topk": 0.3571430355128571,
"staged_unique_files_topk": 10,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 4,
"dense_unique_dirs_topk": 4
},
{
"query": "graph expansion",
"staged": {
"strategy": "staged",
"query": "graph expansion",
"latency_ms": 25696.087299972773,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 560.4684352874756,
"stage2_expand_ms": 13951.441526412964,
"stage3_cluster_ms": 8879.387140274048,
"stage4_rerank_ms": 2229.4514179229736
},
"stage_counts": {
"stage1_candidates": 100,
"stage2_expanded": 100,
"stage3_clustered": 20,
"stage4_reranked": 20
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "graph expansion",
"latency_ms": 2544.8630999922752,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.42857142857142855,
"rbo_topk": 0.13728894791142857,
"staged_unique_files_topk": 10,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 4,
"dense_unique_dirs_topk": 4
},
{
"query": "clustering strategy",
"staged": {
"strategy": "staged",
"query": "clustering strategy",
"latency_ms": 27387.41929998994,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 625.0262260437012,
"stage2_expand_ms": 14211.347103118896,
"stage3_cluster_ms": 10269.58680152893,
"stage4_rerank_ms": 2208.007335662842
},
"stage_counts": {
"stage1_candidates": 100,
"stage2_expanded": 100,
"stage3_clustered": 20,
"stage4_reranked": 20
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "clustering strategy",
"latency_ms": 2928.22389999032,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.17647058823529413,
"rbo_topk": 0.07116480920571429,
"staged_unique_files_topk": 10,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 4,
"dense_unique_dirs_topk": 4
},
{
"query": "error handling",
"staged": {
"strategy": "staged",
"query": "error handling",
"latency_ms": 23732.33979997039,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 504.0884017944336,
"stage2_expand_ms": 12899.415016174316,
"stage3_cluster_ms": 7881.027936935425,
"stage4_rerank_ms": 2372.1535205841064
},
"stage_counts": {
"stage1_candidates": 100,
"stage2_expanded": 100,
"stage3_clustered": 20,
"stage4_reranked": 20
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "error handling",
"latency_ms": 2946.439900010824,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.6666666666666666,
"rbo_topk": 0.19158624676285715,
"staged_unique_files_topk": 10,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 4,
"dense_unique_dirs_topk": 4
},
{
"query": "how to parse json",
"staged": {
"strategy": "staged",
"query": "how to parse json",
"latency_ms": 41515.31259998679,
"num_results": 9,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 601.7005443572998,
"stage2_expand_ms": 30052.319765090942,
"stage3_cluster_ms": 8409.791231155396,
"stage4_rerank_ms": 2371.1729049682617
},
"stage_counts": {
"stage1_candidates": 100,
"stage2_expanded": 100,
"stage3_clustered": 20,
"stage4_reranked": 20
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "how to parse json",
"latency_ms": 2632.1878000199795,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.5833333333333334,
"rbo_topk": 0.4799615561585714,
"staged_unique_files_topk": 9,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 4,
"dense_unique_dirs_topk": 4
}
]
}

View File

@@ -1,356 +0,0 @@
{
"summary": {
"timestamp": "2026-02-09 20:37:28",
"source": "src",
"k": 10,
"coarse_k": 100,
"query_count": 7,
"avg_jaccard_topk": 0.12095811211246858,
"avg_rbo_topk": 0.09594444061244897,
"staged": {
"success": 7,
"avg_latency_ms": 2471.239057132176
},
"dense_rerank": {
"success": 7,
"avg_latency_ms": 3087.217985710927
}
},
"comparisons": [
{
"query": "class Config",
"staged": {
"strategy": "staged",
"query": "class Config",
"latency_ms": 312.2674999535084,
"num_results": 37,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\litellm_reranker.py"
],
"stage_stats": null,
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "class Config",
"latency_ms": 2672.6916999816895,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.05263157894736842,
"rbo_topk": 0.045191399425714276,
"staged_unique_files_topk": 10,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 5,
"dense_unique_dirs_topk": 4
},
{
"query": "def search",
"staged": {
"strategy": "staged",
"query": "def search",
"latency_ms": 15344.861499994993,
"num_results": 3,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 81.70747756958008,
"stage2_expand_ms": 12762.907266616821,
"stage3_cluster_ms": 0.0021457672119140625,
"stage4_rerank_ms": 2422.7287769317627
},
"stage_counts": {
"stage1_candidates": 3,
"stage2_expanded": 4,
"stage2_unique_paths": 3,
"stage2_duplicate_paths": 1,
"stage3_clustered": 4,
"stage3_strategy": "dir_rr",
"stage4_reranked": 4
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "def search",
"latency_ms": 2908.5530000030994,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.09090909090909091,
"rbo_topk": 0.23541639942571424,
"staged_unique_files_topk": 2,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 2,
"dense_unique_dirs_topk": 4
},
{
"query": "LspBridge",
"staged": {
"strategy": "staged",
"query": "LspBridge",
"latency_ms": 328.4989999830723,
"num_results": 5,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py"
],
"stage_stats": null,
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "LspBridge",
"latency_ms": 3426.8526000082493,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.07142857142857142,
"rbo_topk": 0.045191399425714276,
"staged_unique_files_topk": 5,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 2,
"dense_unique_dirs_topk": 4
},
{
"query": "graph expansion",
"staged": {
"strategy": "staged",
"query": "graph expansion",
"latency_ms": 359.32230001688004,
"num_results": 11,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\hybrid_search\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py"
],
"stage_stats": null,
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "graph expansion",
"latency_ms": 3472.025099992752,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.17647058823529413,
"rbo_topk": 0.06801300374142856,
"staged_unique_files_topk": 10,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 7,
"dense_unique_dirs_topk": 4
},
{
"query": "clustering strategy",
"staged": {
"strategy": "staged",
"query": "clustering strategy",
"latency_ms": 289.3139999806881,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py"
],
"stage_stats": null,
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "clustering strategy",
"latency_ms": 2859.5299999713898,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.1111111111111111,
"rbo_topk": 0.04670528456571428,
"staged_unique_files_topk": 10,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 3,
"dense_unique_dirs_topk": 4
},
{
"query": "error handling",
"staged": {
"strategy": "staged",
"query": "error handling",
"latency_ms": 305.66699999570847,
"num_results": 5,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py"
],
"stage_stats": null,
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "error handling",
"latency_ms": 3101.3711999952793,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.07142857142857142,
"rbo_topk": 0.045191399425714276,
"staged_unique_files_topk": 5,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 4,
"dense_unique_dirs_topk": 4
},
{
"query": "how to parse json",
"staged": {
"strategy": "staged",
"query": "how to parse json",
"latency_ms": 358.74210000038147,
"num_results": 4,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py"
],
"stage_stats": null,
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "how to parse json",
"latency_ms": 3169.5023000240326,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.2727272727272727,
"rbo_topk": 0.18590219827714285,
"staged_unique_files_topk": 4,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 3,
"dense_unique_dirs_topk": 4
}
]
}

View File

@@ -1,466 +0,0 @@
{
"summary": {
"timestamp": "2026-02-09 20:48:55",
"source": "src",
"k": 10,
"coarse_k": 100,
"query_count": 7,
"avg_jaccard_topk": 0.11418494830148965,
"avg_rbo_topk": 0.08910725003591835,
"staged": {
"success": 7,
"avg_latency_ms": 16443.109000005894
},
"dense_rerank": {
"success": 7,
"avg_latency_ms": 2919.481471432107
}
},
"comparisons": [
{
"query": "class Config",
"staged": {
"strategy": "staged",
"query": "class Config",
"latency_ms": 6056.956700026989,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\file_watcher.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 113.12270164489746,
"stage1_fallback_search_ms": 262.55249977111816,
"stage2_expand_ms": 3022.8426456451416,
"stage3_cluster_ms": 1.155853271484375,
"stage4_rerank_ms": 2554.953098297119
},
"stage_counts": {
"stage1_candidates": 37,
"stage1_fallback_used": 1,
"stage2_expanded": 86,
"stage2_unique_paths": 53,
"stage2_duplicate_paths": 33,
"stage3_clustered": 20,
"stage3_strategy": "dir_rr",
"stage4_reranked": 20
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "class Config",
"latency_ms": 2788.0383999943733,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.05263157894736842,
"rbo_topk": 0.014635885139999999,
"staged_unique_files_topk": 10,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 8,
"dense_unique_dirs_topk": 4
},
{
"query": "def search",
"staged": {
"strategy": "staged",
"query": "def search",
"latency_ms": 12229.477500021458,
"num_results": 3,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 108.82282257080078,
"stage2_expand_ms": 9422.304153442383,
"stage3_cluster_ms": 0.001430511474609375,
"stage4_rerank_ms": 2611.234664916992
},
"stage_counts": {
"stage1_candidates": 3,
"stage2_expanded": 4,
"stage2_unique_paths": 3,
"stage2_duplicate_paths": 1,
"stage3_clustered": 4,
"stage3_strategy": "dir_rr",
"stage4_reranked": 4
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "def search",
"latency_ms": 2823.377499997616,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.09090909090909091,
"rbo_topk": 0.23541639942571424,
"staged_unique_files_topk": 2,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 2,
"dense_unique_dirs_topk": 4
},
{
"query": "LspBridge",
"staged": {
"strategy": "staged",
"query": "LspBridge",
"latency_ms": 33805.434699982405,
"num_results": 5,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 100.5556583404541,
"stage1_fallback_search_ms": 176.71489715576172,
"stage2_expand_ms": 31017.661809921265,
"stage3_cluster_ms": 0.001430511474609375,
"stage4_rerank_ms": 2403.3148288726807
},
"stage_counts": {
"stage1_candidates": 5,
"stage1_fallback_used": 1,
"stage2_expanded": 5,
"stage2_unique_paths": 5,
"stage2_duplicate_paths": 0,
"stage3_clustered": 5,
"stage3_strategy": "dir_rr",
"stage4_reranked": 5
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "LspBridge",
"latency_ms": 2906.127400010824,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.07142857142857142,
"rbo_topk": 0.045191399425714276,
"staged_unique_files_topk": 5,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 2,
"dense_unique_dirs_topk": 4
},
{
"query": "graph expansion",
"staged": {
"strategy": "staged",
"query": "graph expansion",
"latency_ms": 16790.213800013065,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\hybrid_search\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 110.00967025756836,
"stage1_fallback_search_ms": 176.9556999206543,
"stage2_expand_ms": 13929.782629013062,
"stage3_cluster_ms": 0.45800209045410156,
"stage4_rerank_ms": 2486.6883754730225
},
"stage_counts": {
"stage1_candidates": 11,
"stage1_fallback_used": 1,
"stage2_expanded": 29,
"stage2_unique_paths": 14,
"stage2_duplicate_paths": 15,
"stage3_clustered": 20,
"stage3_strategy": "dir_rr",
"stage4_reranked": 20
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "graph expansion",
"latency_ms": 2866.819000005722,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.1875,
"rbo_topk": 0.06893318399142857,
"staged_unique_files_topk": 9,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 8,
"dense_unique_dirs_topk": 4
},
{
"query": "clustering strategy",
"staged": {
"strategy": "staged",
"query": "clustering strategy",
"latency_ms": 9090.759900003672,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\__init__.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 85.28780937194824,
"stage1_fallback_search_ms": 183.7012767791748,
"stage2_expand_ms": 5557.527780532837,
"stage3_cluster_ms": 0.001430511474609375,
"stage4_rerank_ms": 3164.6268367767334
},
"stage_counts": {
"stage1_candidates": 10,
"stage1_fallback_used": 1,
"stage2_expanded": 10,
"stage2_unique_paths": 10,
"stage2_duplicate_paths": 0,
"stage3_clustered": 10,
"stage3_strategy": "dir_rr",
"stage4_reranked": 10
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "clustering strategy",
"latency_ms": 3062.4616000056267,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.1111111111111111,
"rbo_topk": 0.04670528456571428,
"staged_unique_files_topk": 10,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 3,
"dense_unique_dirs_topk": 4
},
{
"query": "error handling",
"staged": {
"strategy": "staged",
"query": "error handling",
"latency_ms": 19777.87659996748,
"num_results": 6,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 65.9482479095459,
"stage1_fallback_search_ms": 181.9770336151123,
"stage2_expand_ms": 16960.813760757446,
"stage3_cluster_ms": 0.00095367431640625,
"stage4_rerank_ms": 2472.1477031707764
},
"stage_counts": {
"stage1_candidates": 5,
"stage1_fallback_used": 1,
"stage2_expanded": 13,
"stage2_unique_paths": 6,
"stage2_duplicate_paths": 7,
"stage3_clustered": 13,
"stage3_strategy": "dir_rr",
"stage4_reranked": 13
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "error handling",
"latency_ms": 2854.169200003147,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.07142857142857142,
"rbo_topk": 0.045191399425714276,
"staged_unique_files_topk": 5,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 4,
"dense_unique_dirs_topk": 4
},
{
"query": "how to parse json",
"staged": {
"strategy": "staged",
"query": "how to parse json",
"latency_ms": 17351.04380002618,
"num_results": 7,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 119.1408634185791,
"stage1_fallback_search_ms": 246.2625503540039,
"stage2_expand_ms": 14137.234449386597,
"stage3_cluster_ms": 0.0011920928955078125,
"stage4_rerank_ms": 2750.417470932007
},
"stage_counts": {
"stage1_candidates": 4,
"stage1_fallback_used": 1,
"stage2_expanded": 11,
"stage2_unique_paths": 7,
"stage2_duplicate_paths": 4,
"stage3_clustered": 11,
"stage3_strategy": "dir_rr",
"stage4_reranked": 11
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "how to parse json",
"latency_ms": 3135.3772000074387,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.21428571428571427,
"rbo_topk": 0.16767719827714284,
"staged_unique_files_topk": 7,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 5,
"dense_unique_dirs_topk": 4
}
]
}

View File

@@ -1,467 +0,0 @@
{
"summary": {
"timestamp": "2026-02-09 20:56:02",
"source": "src",
"k": 10,
"coarse_k": 100,
"query_count": 7,
"avg_jaccard_topk": 0.11350467619264612,
"avg_rbo_topk": 0.09062624799510204,
"staged": {
"success": 7,
"avg_latency_ms": 8679.35167142323
},
"dense_rerank": {
"success": 7,
"avg_latency_ms": 3097.294714289052
}
},
"comparisons": [
{
"query": "class Config",
"staged": {
"strategy": "staged",
"query": "class Config",
"latency_ms": 6814.465099990368,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\file_watcher.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 85.55030822753906,
"stage1_fallback_search_ms": 197.95989990234375,
"stage2_expand_ms": 3032.4549674987793,
"stage3_cluster_ms": 1.1937618255615234,
"stage4_rerank_ms": 3402.9476642608643
},
"stage_counts": {
"stage1_candidates": 37,
"stage1_fallback_used": 1,
"stage2_expanded": 86,
"stage2_unique_paths": 53,
"stage2_duplicate_paths": 33,
"stage3_clustered": 20,
"stage3_strategy": "dir_rr",
"stage4_reranked": 20
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "class Config",
"latency_ms": 3175.0339000225067,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.05263157894736842,
"rbo_topk": 0.014635885139999999,
"staged_unique_files_topk": 10,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 8,
"dense_unique_dirs_topk": 4
},
{
"query": "def search",
"staged": {
"strategy": "staged",
"query": "def search",
"latency_ms": 8990.238099992275,
"num_results": 3,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 90.6367301940918,
"stage2_expand_ms": 6272.260665893555,
"stage3_cluster_ms": 0.00095367431640625,
"stage4_rerank_ms": 2531.4290523529053
},
"stage_counts": {
"stage1_candidates": 3,
"stage2_expanded": 4,
"stage2_unique_paths": 3,
"stage2_duplicate_paths": 1,
"stage3_clustered": 4,
"stage3_strategy": "dir_rr",
"stage4_reranked": 4
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "def search",
"latency_ms": 3434.4095999896526,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.09090909090909091,
"rbo_topk": 0.23541639942571424,
"staged_unique_files_topk": 2,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 2,
"dense_unique_dirs_topk": 4
},
{
"query": "LspBridge",
"staged": {
"strategy": "staged",
"query": "LspBridge",
"latency_ms": 9296.205000013113,
"num_results": 7,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\keepalive_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 86.64774894714355,
"stage1_fallback_search_ms": 163.8650894165039,
"stage2_expand_ms": 6144.1497802734375,
"stage3_cluster_ms": 0.4100799560546875,
"stage4_rerank_ms": 2807.274580001831
},
"stage_counts": {
"stage1_candidates": 5,
"stage1_fallback_used": 1,
"stage2_expanded": 31,
"stage2_unique_paths": 11,
"stage2_duplicate_paths": 20,
"stage3_clustered": 20,
"stage3_strategy": "dir_rr",
"stage4_reranked": 20
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "LspBridge",
"latency_ms": 3043.4417999982834,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.06666666666666667,
"rbo_topk": 0.045191399425714276,
"staged_unique_files_topk": 6,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 2,
"dense_unique_dirs_topk": 4
},
{
"query": "graph expansion",
"staged": {
"strategy": "staged",
"query": "graph expansion",
"latency_ms": 9086.15110000968,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 72.22437858581543,
"stage1_fallback_search_ms": 166.3804054260254,
"stage2_expand_ms": 6179.303169250488,
"stage3_cluster_ms": 0.00095367431640625,
"stage4_rerank_ms": 2575.9027004241943
},
"stage_counts": {
"stage1_candidates": 11,
"stage1_fallback_used": 1,
"stage2_expanded": 16,
"stage2_unique_paths": 13,
"stage2_duplicate_paths": 3,
"stage3_clustered": 16,
"stage3_strategy": "dir_rr",
"stage4_reranked": 16
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "graph expansion",
"latency_ms": 2793.8257000148296,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.1875,
"rbo_topk": 0.06134116970571428,
"staged_unique_files_topk": 9,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 7,
"dense_unique_dirs_topk": 4
},
{
"query": "clustering strategy",
"staged": {
"strategy": "staged",
"query": "clustering strategy",
"latency_ms": 8401.927499979734,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\__init__.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 72.67880439758301,
"stage1_fallback_search_ms": 166.71442985534668,
"stage2_expand_ms": 5561.89489364624,
"stage3_cluster_ms": 0.0007152557373046875,
"stage4_rerank_ms": 2517.7178382873535
},
"stage_counts": {
"stage1_candidates": 10,
"stage1_fallback_used": 1,
"stage2_expanded": 10,
"stage2_unique_paths": 10,
"stage2_duplicate_paths": 0,
"stage3_clustered": 10,
"stage3_strategy": "dir_rr",
"stage4_reranked": 10
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "clustering strategy",
"latency_ms": 3192.0045999884605,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.1111111111111111,
"rbo_topk": 0.04670528456571428,
"staged_unique_files_topk": 10,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 3,
"dense_unique_dirs_topk": 4
},
{
"query": "error handling",
"staged": {
"strategy": "staged",
"query": "error handling",
"latency_ms": 9032.269400000572,
"num_results": 6,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 78.59635353088379,
"stage1_fallback_search_ms": 180.96280097961426,
"stage2_expand_ms": 6175.840377807617,
"stage3_cluster_ms": 0.001430511474609375,
"stage4_rerank_ms": 2503.4260749816895
},
"stage_counts": {
"stage1_candidates": 5,
"stage1_fallback_used": 1,
"stage2_expanded": 13,
"stage2_unique_paths": 6,
"stage2_duplicate_paths": 7,
"stage3_clustered": 13,
"stage3_strategy": "dir_rr",
"stage4_reranked": 13
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "error handling",
"latency_ms": 3076.744800001383,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.07142857142857142,
"rbo_topk": 0.045191399425714276,
"staged_unique_files_topk": 5,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 4,
"dense_unique_dirs_topk": 4
},
{
"query": "how to parse json",
"staged": {
"strategy": "staged",
"query": "how to parse json",
"latency_ms": 9134.205499976873,
"num_results": 7,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 117.79379844665527,
"stage1_fallback_search_ms": 187.53886222839355,
"stage2_expand_ms": 6218.849658966064,
"stage3_cluster_ms": 0.00095367431640625,
"stage4_rerank_ms": 2515.6633853912354
},
"stage_counts": {
"stage1_candidates": 4,
"stage1_fallback_used": 1,
"stage2_expanded": 9,
"stage2_unique_paths": 7,
"stage2_duplicate_paths": 2,
"stage3_clustered": 9,
"stage3_strategy": "dir_rr",
"stage4_reranked": 9
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "how to parse json",
"latency_ms": 2965.6026000082493,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.21428571428571427,
"rbo_topk": 0.18590219827714285,
"staged_unique_files_topk": 7,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 5,
"dense_unique_dirs_topk": 4
}
]
}

View File

@@ -1,171 +0,0 @@
{
"summary": {
"timestamp": "2026-02-09 19:16:45",
"source": "src",
"k": 10,
"coarse_k": 100,
"query_count": 3,
"avg_jaccard_topk": 0.07165641376167692,
"avg_rbo_topk": 0.10859973275904759,
"staged": {
"success": 3,
"avg_latency_ms": 7919.317766676347
},
"dense_rerank": {
"success": 3,
"avg_latency_ms": 2812.574933330218
}
},
"comparisons": [
{
"query": "class Config",
"staged": {
"strategy": "staged",
"query": "class Config",
"latency_ms": 6351.961700022221,
"num_results": 37,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\litellm_reranker.py"
],
"stage_stats": null,
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "class Config",
"latency_ms": 4424.698300004005,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.05263157894736842,
"rbo_topk": 0.045191399425714276,
"staged_unique_files_topk": 10,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 5,
"dense_unique_dirs_topk": 4
},
{
"query": "def search",
"staged": {
"strategy": "staged",
"query": "def search",
"latency_ms": 17239.81479999423,
"num_results": 3,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 18.40996742248535,
"stage2_expand_ms": 16024.681329727173,
"stage3_cluster_ms": 0.00095367431640625,
"stage4_rerank_ms": 1160.1319313049316
},
"stage_counts": {
"stage1_candidates": 3,
"stage2_expanded": 4,
"stage2_unique_paths": 3,
"stage2_duplicate_paths": 1,
"stage3_clustered": 4,
"stage3_strategy": "score",
"stage4_reranked": 4
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "def search",
"latency_ms": 2086.8772999942303,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.09090909090909091,
"rbo_topk": 0.23541639942571424,
"staged_unique_files_topk": 2,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 2,
"dense_unique_dirs_topk": 4
},
{
"query": "LspBridge",
"staged": {
"strategy": "staged",
"query": "LspBridge",
"latency_ms": 166.1768000125885,
"num_results": 5,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py"
],
"stage_stats": null,
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "LspBridge",
"latency_ms": 1926.1491999924183,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.07142857142857142,
"rbo_topk": 0.045191399425714276,
"staged_unique_files_topk": 5,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 2,
"dense_unique_dirs_topk": 4
}
]
}

View File

@@ -1,171 +0,0 @@
{
"summary": {
"timestamp": "2026-02-09 19:19:13",
"source": "src",
"k": 10,
"coarse_k": 100,
"query_count": 3,
"avg_jaccard_topk": 0.07165641376167692,
"avg_rbo_topk": 0.10859973275904759,
"staged": {
"success": 3,
"avg_latency_ms": 8272.264699995518
},
"dense_rerank": {
"success": 3,
"avg_latency_ms": 2753.5123999913535
}
},
"comparisons": [
{
"query": "class Config",
"staged": {
"strategy": "staged",
"query": "class Config",
"latency_ms": 6453.665100008249,
"num_results": 37,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\litellm_reranker.py"
],
"stage_stats": null,
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "class Config",
"latency_ms": 4530.146999955177,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.05263157894736842,
"rbo_topk": 0.045191399425714276,
"staged_unique_files_topk": 10,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 5,
"dense_unique_dirs_topk": 4
},
{
"query": "def search",
"staged": {
"strategy": "staged",
"query": "def search",
"latency_ms": 18202.905599981546,
"num_results": 3,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 15.580177307128906,
"stage2_expand_ms": 16622.225522994995,
"stage3_cluster_ms": 0.00095367431640625,
"stage4_rerank_ms": 1516.9692039489746
},
"stage_counts": {
"stage1_candidates": 3,
"stage2_expanded": 4,
"stage2_unique_paths": 3,
"stage2_duplicate_paths": 1,
"stage3_clustered": 4,
"stage3_strategy": "score",
"stage4_reranked": 4
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "def search",
"latency_ms": 1746.9925000071526,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.09090909090909091,
"rbo_topk": 0.23541639942571424,
"staged_unique_files_topk": 2,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 2,
"dense_unique_dirs_topk": 4
},
{
"query": "LspBridge",
"staged": {
"strategy": "staged",
"query": "LspBridge",
"latency_ms": 160.2233999967575,
"num_results": 5,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py"
],
"stage_stats": null,
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "LspBridge",
"latency_ms": 1983.3977000117302,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.07142857142857142,
"rbo_topk": 0.045191399425714276,
"staged_unique_files_topk": 5,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 2,
"dense_unique_dirs_topk": 4
}
]
}

View File

@@ -1,453 +0,0 @@
{
"summary": {
"timestamp": "2026-02-09 11:26:54",
"source": "src",
"k": 10,
"coarse_k": 100,
"query_count": 7,
"avg_jaccard_topk": 0.39589733329229126,
"avg_rbo_topk": 0.23139636799510202,
"staged": {
"success": 7,
"avg_latency_ms": 32194.107242865222
},
"dense_rerank": {
"success": 7,
"avg_latency_ms": 2643.366857132741
}
},
"comparisons": [
{
"query": "class Config",
"staged": {
"strategy": "staged",
"query": "class Config",
"latency_ms": 43041.41250002384,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\file_watcher.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 9864.638805389404,
"stage2_expand_ms": 13012.29190826416,
"stage3_cluster_ms": 13297.565460205078,
"stage4_rerank_ms": 6821.892261505127
},
"stage_counts": {
"stage1_candidates": 100,
"stage2_expanded": 149,
"stage3_clustered": 20,
"stage4_reranked": 20
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "class Config",
"latency_ms": 3209.129799991846,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.1111111111111111,
"rbo_topk": 0.05429729885142857,
"staged_unique_files_topk": 10,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 8,
"dense_unique_dirs_topk": 4
},
{
"query": "def search",
"staged": {
"strategy": "staged",
"query": "def search",
"latency_ms": 37827.209600031376,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 531.8794250488281,
"stage2_expand_ms": 27009.481191635132,
"stage3_cluster_ms": 7948.509931564331,
"stage4_rerank_ms": 2268.9380645751953
},
"stage_counts": {
"stage1_candidates": 100,
"stage2_expanded": 101,
"stage3_clustered": 20,
"stage4_reranked": 20
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "def search",
"latency_ms": 2540.472400009632,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.26666666666666666,
"rbo_topk": 0.2983708721671428,
"staged_unique_files_topk": 9,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 4,
"dense_unique_dirs_topk": 4
},
{
"query": "LspBridge",
"staged": {
"strategy": "staged",
"query": "LspBridge",
"latency_ms": 24744.686599999666,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\merkle_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 517.8542137145996,
"stage2_expand_ms": 12839.622735977173,
"stage3_cluster_ms": 9154.959678649902,
"stage4_rerank_ms": 2160.0701808929443
},
"stage_counts": {
"stage1_candidates": 100,
"stage2_expanded": 100,
"stage3_clustered": 20,
"stage4_reranked": 20
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "LspBridge",
"latency_ms": 2482.5908999741077,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.5384615384615384,
"rbo_topk": 0.36639083062285716,
"staged_unique_files_topk": 10,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 4,
"dense_unique_dirs_topk": 4
},
{
"query": "graph expansion",
"staged": {
"strategy": "staged",
"query": "graph expansion",
"latency_ms": 25239.59050002694,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 631.9081783294678,
"stage2_expand_ms": 12570.756196975708,
"stage3_cluster_ms": 9557.724952697754,
"stage4_rerank_ms": 2409.7683429718018
},
"stage_counts": {
"stage1_candidates": 100,
"stage2_expanded": 100,
"stage3_clustered": 20,
"stage4_reranked": 20
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "graph expansion",
"latency_ms": 2574.1938000023365,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.42857142857142855,
"rbo_topk": 0.13728894791142857,
"staged_unique_files_topk": 10,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 4,
"dense_unique_dirs_topk": 4
},
{
"query": "clustering strategy",
"staged": {
"strategy": "staged",
"query": "clustering strategy",
"latency_ms": 28572.93939998746,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 659.6193313598633,
"stage2_expand_ms": 14207.426309585571,
"stage3_cluster_ms": 11513.370037078857,
"stage4_rerank_ms": 2117.546319961548
},
"stage_counts": {
"stage1_candidates": 100,
"stage2_expanded": 100,
"stage3_clustered": 20,
"stage4_reranked": 20
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "clustering strategy",
"latency_ms": 2536.551799982786,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.17647058823529413,
"rbo_topk": 0.07116480920571429,
"staged_unique_files_topk": 10,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 4,
"dense_unique_dirs_topk": 4
},
{
"query": "error handling",
"staged": {
"strategy": "staged",
"query": "error handling",
"latency_ms": 23812.726000010967,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 475.42428970336914,
"stage2_expand_ms": 12454.935789108276,
"stage3_cluster_ms": 8576.019525527954,
"stage4_rerank_ms": 2265.360116958618
},
"stage_counts": {
"stage1_candidates": 100,
"stage2_expanded": 100,
"stage3_clustered": 20,
"stage4_reranked": 20
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "error handling",
"latency_ms": 2648.7773999869823,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.6666666666666666,
"rbo_topk": 0.21230026104857144,
"staged_unique_files_topk": 10,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 4,
"dense_unique_dirs_topk": 4
},
{
"query": "how to parse json",
"staged": {
"strategy": "staged",
"query": "how to parse json",
"latency_ms": 42120.1860999763,
"num_results": 9,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 570.8920955657959,
"stage2_expand_ms": 30054.06880378723,
"stage3_cluster_ms": 9285.51697731018,
"stage4_rerank_ms": 2142.771005630493
},
"stage_counts": {
"stage1_candidates": 100,
"stage2_expanded": 100,
"stage3_clustered": 20,
"stage4_reranked": 20
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "how to parse json",
"latency_ms": 2511.8518999814987,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.5833333333333334,
"rbo_topk": 0.4799615561585714,
"staged_unique_files_topk": 9,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 4,
"dense_unique_dirs_topk": 4
}
]
}

View File

@@ -1,208 +0,0 @@
{
"summary": {
"timestamp": "2026-02-09 17:27:26",
"source": "src",
"k": 10,
"coarse_k": 100,
"query_count": 3,
"avg_jaccard_topk": 0.5809523809523809,
"avg_rbo_topk": 0.31359567182809517,
"staged": {
"success": 3,
"avg_latency_ms": 22826.711433331173
},
"dense_rerank": {
"success": 3,
"avg_latency_ms": 2239.804533312718
}
},
"comparisons": [
{
"query": "class Config",
"staged": {
"strategy": "staged",
"query": "class Config",
"latency_ms": 26690.878500014544,
"num_results": 6,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 8534.121036529541,
"stage2_expand_ms": 13298.827648162842,
"stage3_cluster_ms": 0.026226043701171875,
"stage4_rerank_ms": 4805.774688720703
},
"stage_counts": {
"stage1_candidates": 100,
"stage2_expanded": 149,
"stage2_unique_paths": 43,
"stage2_duplicate_paths": 106,
"stage3_clustered": 20,
"stage3_strategy": "score",
"stage4_reranked": 20
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "class Config",
"latency_ms": 2416.653799980879,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.14285714285714285,
"rbo_topk": 0.25764429885142853,
"staged_unique_files_topk": 6,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 3,
"dense_unique_dirs_topk": 4
},
{
"query": "def search",
"staged": {
"strategy": "staged",
"query": "def search",
"latency_ms": 26188.838399976492,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 525.7587432861328,
"stage2_expand_ms": 23659.400939941406,
"stage3_cluster_ms": 0.021696090698242188,
"stage4_rerank_ms": 1928.950309753418
},
"stage_counts": {
"stage1_candidates": 100,
"stage2_expanded": 101,
"stage2_unique_paths": 23,
"stage2_duplicate_paths": 78,
"stage3_clustered": 20,
"stage3_strategy": "score",
"stage4_reranked": 20
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "def search",
"latency_ms": 1953.0992999970913,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.9,
"rbo_topk": 0.39374892065285705,
"staged_unique_files_topk": 9,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 4,
"dense_unique_dirs_topk": 4
},
{
"query": "LspBridge",
"staged": {
"strategy": "staged",
"query": "LspBridge",
"latency_ms": 15600.41740000248,
"num_results": 7,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 475.54636001586914,
"stage2_expand_ms": 13318.811893463135,
"stage3_cluster_ms": 0.03218650817871094,
"stage4_rerank_ms": 1755.7547092437744
},
"stage_counts": {
"stage1_candidates": 100,
"stage2_expanded": 100,
"stage2_unique_paths": 21,
"stage2_duplicate_paths": 79,
"stage3_clustered": 20,
"stage3_strategy": "score",
"stage4_reranked": 20
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "LspBridge",
"latency_ms": 2349.660499960184,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.7,
"rbo_topk": 0.28939379598,
"staged_unique_files_topk": 7,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 3,
"dense_unique_dirs_topk": 4
}
]
}

View File

@@ -1,356 +0,0 @@
{
"summary": {
"timestamp": "2026-02-09 20:36:02",
"source": "src",
"k": 10,
"coarse_k": 100,
"query_count": 7,
"avg_jaccard_topk": 0.12095811211246858,
"avg_rbo_topk": 0.09594444061244897,
"staged": {
"success": 7,
"avg_latency_ms": 2436.7641000066483
},
"dense_rerank": {
"success": 7,
"avg_latency_ms": 2593.7630428629263
}
},
"comparisons": [
{
"query": "class Config",
"staged": {
"strategy": "staged",
"query": "class Config",
"latency_ms": 285.091000020504,
"num_results": 37,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\litellm_reranker.py"
],
"stage_stats": null,
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "class Config",
"latency_ms": 2412.1290000081062,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.05263157894736842,
"rbo_topk": 0.045191399425714276,
"staged_unique_files_topk": 10,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 5,
"dense_unique_dirs_topk": 4
},
{
"query": "def search",
"staged": {
"strategy": "staged",
"query": "def search",
"latency_ms": 15029.73520001769,
"num_results": 3,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 101.95636749267578,
"stage2_expand_ms": 12690.008640289307,
"stage3_cluster_ms": 0.001430511474609375,
"stage4_rerank_ms": 2155.757427215576
},
"stage_counts": {
"stage1_candidates": 3,
"stage2_expanded": 4,
"stage2_unique_paths": 3,
"stage2_duplicate_paths": 1,
"stage3_clustered": 4,
"stage3_strategy": "score",
"stage4_reranked": 4
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "def search",
"latency_ms": 2424.7003000080585,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.09090909090909091,
"rbo_topk": 0.23541639942571424,
"staged_unique_files_topk": 2,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 2,
"dense_unique_dirs_topk": 4
},
{
"query": "LspBridge",
"staged": {
"strategy": "staged",
"query": "LspBridge",
"latency_ms": 324.4240999817848,
"num_results": 5,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py"
],
"stage_stats": null,
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "LspBridge",
"latency_ms": 2497.174100011587,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.07142857142857142,
"rbo_topk": 0.045191399425714276,
"staged_unique_files_topk": 5,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 2,
"dense_unique_dirs_topk": 4
},
{
"query": "graph expansion",
"staged": {
"strategy": "staged",
"query": "graph expansion",
"latency_ms": 359.32159999012947,
"num_results": 11,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\hybrid_search\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py"
],
"stage_stats": null,
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "graph expansion",
"latency_ms": 2553.8585999906063,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.17647058823529413,
"rbo_topk": 0.06801300374142856,
"staged_unique_files_topk": 10,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 7,
"dense_unique_dirs_topk": 4
},
{
"query": "clustering strategy",
"staged": {
"strategy": "staged",
"query": "clustering strategy",
"latency_ms": 286.38240000605583,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py"
],
"stage_stats": null,
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "clustering strategy",
"latency_ms": 2570.379099994898,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.1111111111111111,
"rbo_topk": 0.04670528456571428,
"staged_unique_files_topk": 10,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 3,
"dense_unique_dirs_topk": 4
},
{
"query": "error handling",
"staged": {
"strategy": "staged",
"query": "error handling",
"latency_ms": 412.58780002593994,
"num_results": 5,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py"
],
"stage_stats": null,
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "error handling",
"latency_ms": 2894.3279000222683,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.07142857142857142,
"rbo_topk": 0.045191399425714276,
"staged_unique_files_topk": 5,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 4,
"dense_unique_dirs_topk": 4
},
{
"query": "how to parse json",
"staged": {
"strategy": "staged",
"query": "how to parse json",
"latency_ms": 359.8066000044346,
"num_results": 4,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py"
],
"stage_stats": null,
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "how to parse json",
"latency_ms": 2803.772300004959,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.2727272727272727,
"rbo_topk": 0.18590219827714285,
"staged_unique_files_topk": 4,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 3,
"dense_unique_dirs_topk": 4
}
]
}

View File

@@ -1,462 +0,0 @@
{
"summary": {
"timestamp": "2026-02-09 20:45:10",
"source": "src",
"k": 10,
"coarse_k": 100,
"query_count": 7,
"avg_jaccard_topk": 0.1283498247783962,
"avg_rbo_topk": 0.09664773770897958,
"staged": {
"success": 7,
"avg_latency_ms": 16394.152085712976
},
"dense_rerank": {
"success": 7,
"avg_latency_ms": 2839.464457145759
}
},
"comparisons": [
{
"query": "class Config",
"staged": {
"strategy": "staged",
"query": "class Config",
"latency_ms": 6233.342700004578,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 125.80323219299316,
"stage1_fallback_search_ms": 277.1914005279541,
"stage2_expand_ms": 3032.3121547698975,
"stage3_cluster_ms": 0.02765655517578125,
"stage4_rerank_ms": 2699.3532180786133
},
"stage_counts": {
"stage1_candidates": 37,
"stage1_fallback_used": 1,
"stage2_expanded": 86,
"stage2_unique_paths": 53,
"stage2_duplicate_paths": 33,
"stage3_clustered": 20,
"stage3_strategy": "score",
"stage4_reranked": 20
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "class Config",
"latency_ms": 3036.3474999964237,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.125,
"rbo_topk": 0.06741929885142856,
"staged_unique_files_topk": 8,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 5,
"dense_unique_dirs_topk": 4
},
{
"query": "def search",
"staged": {
"strategy": "staged",
"query": "def search",
"latency_ms": 12703.503900021315,
"num_results": 3,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 83.4202766418457,
"stage2_expand_ms": 9856.60433769226,
"stage3_cluster_ms": 0.0011920928955078125,
"stage4_rerank_ms": 2664.630174636841
},
"stage_counts": {
"stage1_candidates": 3,
"stage2_expanded": 4,
"stage2_unique_paths": 3,
"stage2_duplicate_paths": 1,
"stage3_clustered": 4,
"stage3_strategy": "score",
"stage4_reranked": 4
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "def search",
"latency_ms": 2888.501700013876,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.09090909090909091,
"rbo_topk": 0.23541639942571424,
"staged_unique_files_topk": 2,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 2,
"dense_unique_dirs_topk": 4
},
{
"query": "LspBridge",
"staged": {
"strategy": "staged",
"query": "LspBridge",
"latency_ms": 33684.76710000634,
"num_results": 5,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 78.8118839263916,
"stage1_fallback_search_ms": 174.6652126312256,
"stage2_expand_ms": 31018.909692764282,
"stage3_cluster_ms": 0.0016689300537109375,
"stage4_rerank_ms": 2316.9021606445312
},
"stage_counts": {
"stage1_candidates": 5,
"stage1_fallback_used": 1,
"stage2_expanded": 5,
"stage2_unique_paths": 5,
"stage2_duplicate_paths": 0,
"stage3_clustered": 5,
"stage3_strategy": "score",
"stage4_reranked": 5
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "LspBridge",
"latency_ms": 2824.729699999094,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.07142857142857142,
"rbo_topk": 0.045191399425714276,
"staged_unique_files_topk": 5,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 2,
"dense_unique_dirs_topk": 4
},
{
"query": "graph expansion",
"staged": {
"strategy": "staged",
"query": "graph expansion",
"latency_ms": 16910.090099990368,
"num_results": 8,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 99.6243953704834,
"stage1_fallback_search_ms": 207.89742469787598,
"stage2_expand_ms": 13929.257154464722,
"stage3_cluster_ms": 0.016927719116210938,
"stage4_rerank_ms": 2586.843729019165
},
"stage_counts": {
"stage1_candidates": 11,
"stage1_fallback_used": 1,
"stage2_expanded": 29,
"stage2_unique_paths": 14,
"stage2_duplicate_paths": 15,
"stage3_clustered": 20,
"stage3_strategy": "score",
"stage4_reranked": 20
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "graph expansion",
"latency_ms": 2765.958099991083,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.21428571428571427,
"rbo_topk": 0.06893318399142857,
"staged_unique_files_topk": 7,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 6,
"dense_unique_dirs_topk": 4
},
{
"query": "clustering strategy",
"staged": {
"strategy": "staged",
"query": "clustering strategy",
"latency_ms": 8380.20839998126,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\__init__.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 95.42632102966309,
"stage1_fallback_search_ms": 187.4692440032959,
"stage2_expand_ms": 5561.658143997192,
"stage3_cluster_ms": 0.0007152557373046875,
"stage4_rerank_ms": 2441.287040710449
},
"stage_counts": {
"stage1_candidates": 10,
"stage1_fallback_used": 1,
"stage2_expanded": 10,
"stage2_unique_paths": 10,
"stage2_duplicate_paths": 0,
"stage3_clustered": 10,
"stage3_strategy": "score",
"stage4_reranked": 10
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "clustering strategy",
"latency_ms": 2788.0665000081062,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.1111111111111111,
"rbo_topk": 0.04670528456571428,
"staged_unique_files_topk": 10,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 3,
"dense_unique_dirs_topk": 4
},
{
"query": "error handling",
"staged": {
"strategy": "staged",
"query": "error handling",
"latency_ms": 19897.71709999442,
"num_results": 6,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 114.1653060913086,
"stage1_fallback_search_ms": 235.73827743530273,
"stage2_expand_ms": 16702.077865600586,
"stage3_cluster_ms": 0.00095367431640625,
"stage4_rerank_ms": 2757.4093341827393
},
"stage_counts": {
"stage1_candidates": 5,
"stage1_fallback_used": 1,
"stage2_expanded": 13,
"stage2_unique_paths": 6,
"stage2_duplicate_paths": 7,
"stage3_clustered": 13,
"stage3_strategy": "score",
"stage4_reranked": 13
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "error handling",
"latency_ms": 2874.178600013256,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.07142857142857142,
"rbo_topk": 0.045191399425714276,
"staged_unique_files_topk": 5,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 4,
"dense_unique_dirs_topk": 4
},
{
"query": "how to parse json",
"staged": {
"strategy": "staged",
"query": "how to parse json",
"latency_ms": 16949.43529999256,
"num_results": 7,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 104.50935363769531,
"stage1_fallback_search_ms": 190.6723976135254,
"stage2_expand_ms": 14165.841102600098,
"stage3_cluster_ms": 0.0011920928955078125,
"stage4_rerank_ms": 2399.226188659668
},
"stage_counts": {
"stage1_candidates": 4,
"stage1_fallback_used": 1,
"stage2_expanded": 11,
"stage2_unique_paths": 7,
"stage2_duplicate_paths": 4,
"stage3_clustered": 11,
"stage3_strategy": "score",
"stage4_reranked": 11
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "how to parse json",
"latency_ms": 2698.469099998474,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.21428571428571427,
"rbo_topk": 0.16767719827714284,
"staged_unique_files_topk": 7,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 5,
"dense_unique_dirs_topk": 4
}
]
}

View File

@@ -1,465 +0,0 @@
{
"summary": {
"timestamp": "2026-02-09 20:53:01",
"source": "src",
"k": 10,
"coarse_k": 100,
"query_count": 7,
"avg_jaccard_topk": 0.12384302205730777,
"avg_rbo_topk": 0.09816673566816325,
"staged": {
"success": 7,
"avg_latency_ms": 8696.564499999795
},
"dense_rerank": {
"success": 7,
"avg_latency_ms": 2936.2583857136115
}
},
"comparisons": [
{
"query": "class Config",
"staged": {
"strategy": "staged",
"query": "class Config",
"latency_ms": 6108.304299980402,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 90.47985076904297,
"stage1_fallback_search_ms": 224.38788414001465,
"stage2_expand_ms": 3031.7258834838867,
"stage3_cluster_ms": 0.02956390380859375,
"stage4_rerank_ms": 2655.31849861145
},
"stage_counts": {
"stage1_candidates": 37,
"stage1_fallback_used": 1,
"stage2_expanded": 86,
"stage2_unique_paths": 53,
"stage2_duplicate_paths": 33,
"stage3_clustered": 20,
"stage3_strategy": "score",
"stage4_reranked": 20
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "class Config",
"latency_ms": 2873.6466999948025,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.125,
"rbo_topk": 0.06741929885142856,
"staged_unique_files_topk": 8,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 5,
"dense_unique_dirs_topk": 4
},
{
"query": "def search",
"staged": {
"strategy": "staged",
"query": "def search",
"latency_ms": 9321.754200011492,
"num_results": 3,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 140.43283462524414,
"stage2_expand_ms": 6410.467863082886,
"stage3_cluster_ms": 0.00095367431640625,
"stage4_rerank_ms": 2675.7972240448
},
"stage_counts": {
"stage1_candidates": 3,
"stage2_expanded": 4,
"stage2_unique_paths": 3,
"stage2_duplicate_paths": 1,
"stage3_clustered": 4,
"stage3_strategy": "score",
"stage4_reranked": 4
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "def search",
"latency_ms": 3104.7773999869823,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.09090909090909091,
"rbo_topk": 0.23541639942571424,
"staged_unique_files_topk": 2,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 2,
"dense_unique_dirs_topk": 4
},
{
"query": "LspBridge",
"staged": {
"strategy": "staged",
"query": "LspBridge",
"latency_ms": 9527.073799997568,
"num_results": 6,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\keepalive_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 98.59919548034668,
"stage1_fallback_search_ms": 172.26457595825195,
"stage2_expand_ms": 6125.282049179077,
"stage3_cluster_ms": 0.017404556274414062,
"stage4_rerank_ms": 3023.9248275756836
},
"stage_counts": {
"stage1_candidates": 5,
"stage1_fallback_used": 1,
"stage2_expanded": 31,
"stage2_unique_paths": 11,
"stage2_duplicate_paths": 20,
"stage3_clustered": 20,
"stage3_strategy": "score",
"stage4_reranked": 20
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "LspBridge",
"latency_ms": 2901.0302999913692,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.06666666666666667,
"rbo_topk": 0.045191399425714276,
"staged_unique_files_topk": 6,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 2,
"dense_unique_dirs_topk": 4
},
{
"query": "graph expansion",
"staged": {
"strategy": "staged",
"query": "graph expansion",
"latency_ms": 9120.886200010777,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 91.48454666137695,
"stage1_fallback_search_ms": 172.12390899658203,
"stage2_expand_ms": 6166.24903678894,
"stage3_cluster_ms": 0.00095367431640625,
"stage4_rerank_ms": 2601.947546005249
},
"stage_counts": {
"stage1_candidates": 11,
"stage1_fallback_used": 1,
"stage2_expanded": 16,
"stage2_unique_paths": 13,
"stage2_duplicate_paths": 3,
"stage3_clustered": 16,
"stage3_strategy": "score",
"stage4_reranked": 16
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "graph expansion",
"latency_ms": 2847.6964999735355,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.1875,
"rbo_topk": 0.06134116970571428,
"staged_unique_files_topk": 9,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 7,
"dense_unique_dirs_topk": 4
},
{
"query": "clustering strategy",
"staged": {
"strategy": "staged",
"query": "clustering strategy",
"latency_ms": 8424.535699993372,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\__init__.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 92.8945541381836,
"stage1_fallback_search_ms": 192.06547737121582,
"stage2_expand_ms": 5568.126440048218,
"stage3_cluster_ms": 0.0011920928955078125,
"stage4_rerank_ms": 2480.673313140869
},
"stage_counts": {
"stage1_candidates": 10,
"stage1_fallback_used": 1,
"stage2_expanded": 10,
"stage2_unique_paths": 10,
"stage2_duplicate_paths": 0,
"stage3_clustered": 10,
"stage3_strategy": "score",
"stage4_reranked": 10
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "clustering strategy",
"latency_ms": 2974.9999000132084,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.1111111111111111,
"rbo_topk": 0.04670528456571428,
"staged_unique_files_topk": 10,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 3,
"dense_unique_dirs_topk": 4
},
{
"query": "error handling",
"staged": {
"strategy": "staged",
"query": "error handling",
"latency_ms": 9253.624700009823,
"num_results": 6,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 102.18691825866699,
"stage1_fallback_search_ms": 176.97691917419434,
"stage2_expand_ms": 6113.626480102539,
"stage3_cluster_ms": 0.00095367431640625,
"stage4_rerank_ms": 2774.4452953338623
},
"stage_counts": {
"stage1_candidates": 5,
"stage1_fallback_used": 1,
"stage2_expanded": 13,
"stage2_unique_paths": 6,
"stage2_duplicate_paths": 7,
"stage3_clustered": 13,
"stage3_strategy": "score",
"stage4_reranked": 13
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "error handling",
"latency_ms": 2860.619900047779,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.07142857142857142,
"rbo_topk": 0.045191399425714276,
"staged_unique_files_topk": 5,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 4,
"dense_unique_dirs_topk": 4
},
{
"query": "how to parse json",
"staged": {
"strategy": "staged",
"query": "how to parse json",
"latency_ms": 9119.772599995136,
"num_results": 7,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 90.18850326538086,
"stage1_fallback_search_ms": 157.95397758483887,
"stage2_expand_ms": 6293.469429016113,
"stage3_cluster_ms": 0.0011920928955078125,
"stage4_rerank_ms": 2486.8383407592773
},
"stage_counts": {
"stage1_candidates": 4,
"stage1_fallback_used": 1,
"stage2_expanded": 9,
"stage2_unique_paths": 7,
"stage2_duplicate_paths": 2,
"stage3_clustered": 9,
"stage3_strategy": "score",
"stage4_reranked": 9
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "how to parse json",
"latency_ms": 2991.0379999876022,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.21428571428571427,
"rbo_topk": 0.18590219827714285,
"staged_unique_files_topk": 7,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 5,
"dense_unique_dirs_topk": 4
}
]
}

View File

@@ -1,465 +0,0 @@
{
"summary": {
"timestamp": "2026-02-10 12:23:36",
"source": "src",
"k": 10,
"coarse_k": 100,
"query_count": 7,
"avg_jaccard_topk": 0.12384302205730777,
"avg_rbo_topk": 0.09816673566816325,
"staged": {
"success": 7,
"avg_latency_ms": 3996.4113285754406
},
"dense_rerank": {
"success": 7,
"avg_latency_ms": 2780.485200004918
}
},
"comparisons": [
{
"query": "class Config",
"staged": {
"strategy": "staged",
"query": "class Config",
"latency_ms": 2365.3048999905586,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 25.228023529052734,
"stage1_fallback_search_ms": 206.0999870300293,
"stage2_expand_ms": 16.644954681396484,
"stage3_cluster_ms": 0.025987625122070312,
"stage4_rerank_ms": 2064.2504692077637
},
"stage_counts": {
"stage1_candidates": 37,
"stage1_fallback_used": 1,
"stage2_expanded": 86,
"stage2_unique_paths": 53,
"stage2_duplicate_paths": 33,
"stage3_clustered": 20,
"stage3_strategy": "score",
"stage4_reranked": 20
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "class Config",
"latency_ms": 2610.047899991274,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.125,
"rbo_topk": 0.06741929885142856,
"staged_unique_files_topk": 8,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 5,
"dense_unique_dirs_topk": 4
},
{
"query": "def search",
"staged": {
"strategy": "staged",
"query": "def search",
"latency_ms": 3723.305599987507,
"num_results": 3,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 31.742334365844727,
"stage2_expand_ms": 2125.1025199890137,
"stage3_cluster_ms": 0.0007152557373046875,
"stage4_rerank_ms": 1511.4071369171143
},
"stage_counts": {
"stage1_candidates": 3,
"stage2_expanded": 4,
"stage2_unique_paths": 3,
"stage2_duplicate_paths": 1,
"stage3_clustered": 4,
"stage3_strategy": "score",
"stage4_reranked": 4
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "def search",
"latency_ms": 2072.4792000055313,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.09090909090909091,
"rbo_topk": 0.23541639942571424,
"staged_unique_files_topk": 2,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 2,
"dense_unique_dirs_topk": 4
},
{
"query": "LspBridge",
"staged": {
"strategy": "staged",
"query": "LspBridge",
"latency_ms": 5251.151299983263,
"num_results": 6,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\keepalive_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 32.721757888793945,
"stage1_fallback_search_ms": 195.51420211791992,
"stage2_expand_ms": 2060.0733757019043,
"stage3_cluster_ms": 0.0095367431640625,
"stage4_rerank_ms": 2900.8395671844482
},
"stage_counts": {
"stage1_candidates": 5,
"stage1_fallback_used": 1,
"stage2_expanded": 31,
"stage2_unique_paths": 11,
"stage2_duplicate_paths": 20,
"stage3_clustered": 20,
"stage3_strategy": "score",
"stage4_reranked": 20
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "LspBridge",
"latency_ms": 1972.8982000350952,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.06666666666666667,
"rbo_topk": 0.045191399425714276,
"staged_unique_files_topk": 6,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 2,
"dense_unique_dirs_topk": 4
},
{
"query": "graph expansion",
"staged": {
"strategy": "staged",
"query": "graph expansion",
"latency_ms": 4101.171400010586,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 29.141902923583984,
"stage1_fallback_search_ms": 234.2982292175293,
"stage2_expand_ms": 2082.4878215789795,
"stage3_cluster_ms": 0.0011920928955078125,
"stage4_rerank_ms": 1698.7183094024658
},
"stage_counts": {
"stage1_candidates": 11,
"stage1_fallback_used": 1,
"stage2_expanded": 16,
"stage2_unique_paths": 13,
"stage2_duplicate_paths": 3,
"stage3_clustered": 16,
"stage3_strategy": "score",
"stage4_reranked": 16
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "graph expansion",
"latency_ms": 2331.9747000038624,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.1875,
"rbo_topk": 0.06134116970571428,
"staged_unique_files_topk": 9,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 7,
"dense_unique_dirs_topk": 4
},
{
"query": "clustering strategy",
"staged": {
"strategy": "staged",
"query": "clustering strategy",
"latency_ms": 4032.0041000247,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\__init__.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 42.098283767700195,
"stage1_fallback_search_ms": 209.6574306488037,
"stage2_expand_ms": 2053.9097785949707,
"stage3_cluster_ms": 0.00095367431640625,
"stage4_rerank_ms": 1665.3883457183838
},
"stage_counts": {
"stage1_candidates": 10,
"stage1_fallback_used": 1,
"stage2_expanded": 10,
"stage2_unique_paths": 10,
"stage2_duplicate_paths": 0,
"stage3_clustered": 10,
"stage3_strategy": "score",
"stage4_reranked": 10
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "clustering strategy",
"latency_ms": 2026.5661999881268,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.1111111111111111,
"rbo_topk": 0.04670528456571428,
"staged_unique_files_topk": 10,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 3,
"dense_unique_dirs_topk": 4
},
{
"query": "error handling",
"staged": {
"strategy": "staged",
"query": "error handling",
"latency_ms": 4237.893900036812,
"num_results": 6,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 64.01538848876953,
"stage1_fallback_search_ms": 225.14033317565918,
"stage2_expand_ms": 2116.3012981414795,
"stage3_cluster_ms": 0.0007152557373046875,
"stage4_rerank_ms": 1776.0803699493408
},
"stage_counts": {
"stage1_candidates": 5,
"stage1_fallback_used": 1,
"stage2_expanded": 13,
"stage2_unique_paths": 6,
"stage2_duplicate_paths": 7,
"stage3_clustered": 13,
"stage3_strategy": "score",
"stage4_reranked": 13
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "error handling",
"latency_ms": 2125.935900002718,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.07142857142857142,
"rbo_topk": 0.045191399425714276,
"staged_unique_files_topk": 5,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 4,
"dense_unique_dirs_topk": 4
},
{
"query": "how to parse json",
"staged": {
"strategy": "staged",
"query": "how to parse json",
"latency_ms": 4264.048099994659,
"num_results": 7,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 31.972646713256836,
"stage1_fallback_search_ms": 235.47840118408203,
"stage2_expand_ms": 2161.5889072418213,
"stage3_cluster_ms": 0.0007152557373046875,
"stage4_rerank_ms": 1768.0847644805908
},
"stage_counts": {
"stage1_candidates": 4,
"stage1_fallback_used": 1,
"stage2_expanded": 9,
"stage2_unique_paths": 7,
"stage2_duplicate_paths": 2,
"stage3_clustered": 9,
"stage3_strategy": "score",
"stage4_reranked": 9
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "how to parse json",
"latency_ms": 6323.49430000782,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.21428571428571427,
"rbo_topk": 0.18590219827714285,
"staged_unique_files_topk": 7,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 5,
"dense_unique_dirs_topk": 4
}
]
}

View File

@@ -1,467 +0,0 @@
{
"summary": {
"timestamp": "2026-02-10 12:46:47",
"source": "src",
"k": 10,
"coarse_k": 100,
"query_count": 7,
"avg_jaccard_topk": 0.11350467619264612,
"avg_rbo_topk": 0.09062624799510204,
"staged": {
"success": 7,
"avg_latency_ms": 5670.9065000244545
},
"dense_rerank": {
"success": 7,
"avg_latency_ms": 3047.475757143327
}
},
"comparisons": [
{
"query": "class Config",
"staged": {
"strategy": "staged",
"query": "class Config",
"latency_ms": 2971.5892000496387,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\file_watcher.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 108.11758041381836,
"stage1_fallback_search_ms": 230.96132278442383,
"stage2_expand_ms": 18.60976219177246,
"stage3_cluster_ms": 1.100301742553711,
"stage4_rerank_ms": 2528.761625289917
},
"stage_counts": {
"stage1_candidates": 37,
"stage1_fallback_used": 1,
"stage2_expanded": 86,
"stage2_unique_paths": 53,
"stage2_duplicate_paths": 33,
"stage3_clustered": 20,
"stage3_strategy": "dir_rr",
"stage4_reranked": 20
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "class Config",
"latency_ms": 2937.113800019026,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.05263157894736842,
"rbo_topk": 0.014635885139999999,
"staged_unique_files_topk": 10,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 8,
"dense_unique_dirs_topk": 4
},
{
"query": "def search",
"staged": {
"strategy": "staged",
"query": "def search",
"latency_ms": 10065.153400033712,
"num_results": 3,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 127.17461585998535,
"stage2_expand_ms": 7361.833810806274,
"stage3_cluster_ms": 0.001430511474609375,
"stage4_rerank_ms": 2472.7542400360107
},
"stage_counts": {
"stage1_candidates": 3,
"stage2_expanded": 4,
"stage2_unique_paths": 3,
"stage2_duplicate_paths": 1,
"stage3_clustered": 4,
"stage3_strategy": "dir_rr",
"stage4_reranked": 4
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "def search",
"latency_ms": 3059.5018000006676,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.09090909090909091,
"rbo_topk": 0.23541639942571424,
"staged_unique_files_topk": 2,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 2,
"dense_unique_dirs_topk": 4
},
{
"query": "LspBridge",
"staged": {
"strategy": "staged",
"query": "LspBridge",
"latency_ms": 5557.314100056887,
"num_results": 7,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\keepalive_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 133.9263916015625,
"stage1_fallback_search_ms": 242.1243190765381,
"stage2_expand_ms": 2106.602430343628,
"stage3_cluster_ms": 0.47016143798828125,
"stage4_rerank_ms": 2967.3829078674316
},
"stage_counts": {
"stage1_candidates": 5,
"stage1_fallback_used": 1,
"stage2_expanded": 31,
"stage2_unique_paths": 11,
"stage2_duplicate_paths": 20,
"stage3_clustered": 20,
"stage3_strategy": "dir_rr",
"stage4_reranked": 20
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "LspBridge",
"latency_ms": 3157.7918999791145,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.06666666666666667,
"rbo_topk": 0.045191399425714276,
"staged_unique_files_topk": 6,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 2,
"dense_unique_dirs_topk": 4
},
{
"query": "graph expansion",
"staged": {
"strategy": "staged",
"query": "graph expansion",
"latency_ms": 5458.670999974012,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 113.62957954406738,
"stage1_fallback_search_ms": 204.56886291503906,
"stage2_expand_ms": 2166.4509773254395,
"stage3_cluster_ms": 0.0011920928955078125,
"stage4_rerank_ms": 2872.969627380371
},
"stage_counts": {
"stage1_candidates": 11,
"stage1_fallback_used": 1,
"stage2_expanded": 16,
"stage2_unique_paths": 13,
"stage2_duplicate_paths": 3,
"stage3_clustered": 16,
"stage3_strategy": "dir_rr",
"stage4_reranked": 16
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "graph expansion",
"latency_ms": 2896.5341999828815,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.1875,
"rbo_topk": 0.06134116970571428,
"staged_unique_files_topk": 9,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 7,
"dense_unique_dirs_topk": 4
},
{
"query": "clustering strategy",
"staged": {
"strategy": "staged",
"query": "clustering strategy",
"latency_ms": 5028.861099988222,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\__init__.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 111.71293258666992,
"stage1_fallback_search_ms": 192.02208518981934,
"stage2_expand_ms": 2054.065465927124,
"stage3_cluster_ms": 0.0007152557373046875,
"stage4_rerank_ms": 2579.0507793426514
},
"stage_counts": {
"stage1_candidates": 10,
"stage1_fallback_used": 1,
"stage2_expanded": 10,
"stage2_unique_paths": 10,
"stage2_duplicate_paths": 0,
"stage3_clustered": 10,
"stage3_strategy": "dir_rr",
"stage4_reranked": 10
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "clustering strategy",
"latency_ms": 3627.1755999922752,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.1111111111111111,
"rbo_topk": 0.04670528456571428,
"staged_unique_files_topk": 10,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 3,
"dense_unique_dirs_topk": 4
},
{
"query": "error handling",
"staged": {
"strategy": "staged",
"query": "error handling",
"latency_ms": 5114.356300055981,
"num_results": 6,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 135.76626777648926,
"stage1_fallback_search_ms": 211.12942695617676,
"stage2_expand_ms": 2151.059150695801,
"stage3_cluster_ms": 0.00095367431640625,
"stage4_rerank_ms": 2519.892692565918
},
"stage_counts": {
"stage1_candidates": 5,
"stage1_fallback_used": 1,
"stage2_expanded": 13,
"stage2_unique_paths": 6,
"stage2_duplicate_paths": 7,
"stage3_clustered": 13,
"stage3_strategy": "dir_rr",
"stage4_reranked": 13
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "error handling",
"latency_ms": 2853.594000041485,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.07142857142857142,
"rbo_topk": 0.045191399425714276,
"staged_unique_files_topk": 5,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 4,
"dense_unique_dirs_topk": 4
},
{
"query": "how to parse json",
"staged": {
"strategy": "staged",
"query": "how to parse json",
"latency_ms": 5500.400400012732,
"num_results": 7,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 96.66872024536133,
"stage1_fallback_search_ms": 176.37205123901367,
"stage2_expand_ms": 2137.751340866089,
"stage3_cluster_ms": 0.00095367431640625,
"stage4_rerank_ms": 2991.840124130249
},
"stage_counts": {
"stage1_candidates": 4,
"stage1_fallback_used": 1,
"stage2_expanded": 9,
"stage2_unique_paths": 7,
"stage2_duplicate_paths": 2,
"stage3_clustered": 9,
"stage3_strategy": "dir_rr",
"stage4_reranked": 9
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "how to parse json",
"latency_ms": 2800.6189999878407,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.21428571428571427,
"rbo_topk": 0.18590219827714285,
"staged_unique_files_topk": 7,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 5,
"dense_unique_dirs_topk": 4
}
]
}

View File

@@ -1,465 +0,0 @@
{
"summary": {
"timestamp": "2026-02-10 12:52:44",
"source": "src",
"k": 10,
"coarse_k": 100,
"query_count": 7,
"avg_jaccard_topk": 0.13455730777159347,
"avg_rbo_topk": 0.10274807844326529,
"staged": {
"success": 7,
"avg_latency_ms": 4445.262371412346
},
"dense_rerank": {
"success": 7,
"avg_latency_ms": 3327.1750857276575
}
},
"comparisons": [
{
"query": "class Config",
"staged": {
"strategy": "staged",
"query": "class Config",
"latency_ms": 2719.7998999655247,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 33.12373161315918,
"stage1_fallback_search_ms": 230.31878471374512,
"stage2_expand_ms": 22.444486618041992,
"stage3_cluster_ms": 0.06079673767089844,
"stage4_rerank_ms": 2338.5443687438965
},
"stage_counts": {
"stage1_candidates": 37,
"stage1_fallback_used": 1,
"stage2_expanded": 86,
"stage2_unique_paths": 53,
"stage2_duplicate_paths": 33,
"stage3_clustered": 20,
"stage3_strategy": "path",
"stage4_reranked": 20
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "class Config",
"latency_ms": 2334.8668000102043,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.2,
"rbo_topk": 0.09948869827714285,
"staged_unique_files_topk": 8,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 4,
"dense_unique_dirs_topk": 4
},
{
"query": "def search",
"staged": {
"strategy": "staged",
"query": "def search",
"latency_ms": 4470.056899994612,
"num_results": 3,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 28.5646915435791,
"stage2_expand_ms": 2216.57133102417,
"stage3_cluster_ms": 0.001430511474609375,
"stage4_rerank_ms": 2131.246566772461
},
"stage_counts": {
"stage1_candidates": 3,
"stage2_expanded": 4,
"stage2_unique_paths": 3,
"stage2_duplicate_paths": 1,
"stage3_clustered": 4,
"stage3_strategy": "path",
"stage4_reranked": 4
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "def search",
"latency_ms": 2447.341199964285,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.09090909090909091,
"rbo_topk": 0.23541639942571424,
"staged_unique_files_topk": 2,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 2,
"dense_unique_dirs_topk": 4
},
{
"query": "LspBridge",
"staged": {
"strategy": "staged",
"query": "LspBridge",
"latency_ms": 6126.65680000186,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\keepalive_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 25.135278701782227,
"stage1_fallback_search_ms": 171.53453826904297,
"stage2_expand_ms": 2094.9013233184814,
"stage3_cluster_ms": 0.024318695068359375,
"stage4_rerank_ms": 3743.204355239868
},
"stage_counts": {
"stage1_candidates": 5,
"stage1_fallback_used": 1,
"stage2_expanded": 31,
"stage2_unique_paths": 11,
"stage2_duplicate_paths": 20,
"stage3_clustered": 11,
"stage3_strategy": "path",
"stage4_reranked": 11
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "LspBridge",
"latency_ms": 9015.508300036192,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.06666666666666667,
"rbo_topk": 0.045191399425714276,
"staged_unique_files_topk": 6,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 2,
"dense_unique_dirs_topk": 4
},
{
"query": "graph expansion",
"staged": {
"strategy": "staged",
"query": "graph expansion",
"latency_ms": 4319.597599953413,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 18.799781799316406,
"stage1_fallback_search_ms": 167.36602783203125,
"stage2_expand_ms": 2101.4957427978516,
"stage3_cluster_ms": 0.00095367431640625,
"stage4_rerank_ms": 1976.8805503845215
},
"stage_counts": {
"stage1_candidates": 11,
"stage1_fallback_used": 1,
"stage2_expanded": 16,
"stage2_unique_paths": 13,
"stage2_duplicate_paths": 3,
"stage3_clustered": 16,
"stage3_strategy": "path",
"stage4_reranked": 16
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "graph expansion",
"latency_ms": 2356.994699984789,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.1875,
"rbo_topk": 0.06134116970571428,
"staged_unique_files_topk": 9,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 7,
"dense_unique_dirs_topk": 4
},
{
"query": "clustering strategy",
"staged": {
"strategy": "staged",
"query": "clustering strategy",
"latency_ms": 4574.691199988127,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\__init__.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 45.72629928588867,
"stage1_fallback_search_ms": 233.0036163330078,
"stage2_expand_ms": 2068.8536167144775,
"stage3_cluster_ms": 0.0007152557373046875,
"stage4_rerank_ms": 2152.9064178466797
},
"stage_counts": {
"stage1_candidates": 10,
"stage1_fallback_used": 1,
"stage2_expanded": 10,
"stage2_unique_paths": 10,
"stage2_duplicate_paths": 0,
"stage3_clustered": 10,
"stage3_strategy": "path",
"stage4_reranked": 10
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "clustering strategy",
"latency_ms": 2311.4787000119686,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.1111111111111111,
"rbo_topk": 0.04670528456571428,
"staged_unique_files_topk": 10,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 3,
"dense_unique_dirs_topk": 4
},
{
"query": "error handling",
"staged": {
"strategy": "staged",
"query": "error handling",
"latency_ms": 4616.5374999940395,
"num_results": 6,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 38.83004188537598,
"stage1_fallback_search_ms": 263.0441188812256,
"stage2_expand_ms": 2070.7976818084717,
"stage3_cluster_ms": 0.00095367431640625,
"stage4_rerank_ms": 2133.629083633423
},
"stage_counts": {
"stage1_candidates": 5,
"stage1_fallback_used": 1,
"stage2_expanded": 13,
"stage2_unique_paths": 6,
"stage2_duplicate_paths": 7,
"stage3_clustered": 13,
"stage3_strategy": "path",
"stage4_reranked": 13
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "error handling",
"latency_ms": 2337.4413000643253,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.07142857142857142,
"rbo_topk": 0.045191399425714276,
"staged_unique_files_topk": 5,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 4,
"dense_unique_dirs_topk": 4
},
{
"query": "how to parse json",
"staged": {
"strategy": "staged",
"query": "how to parse json",
"latency_ms": 4289.496699988842,
"num_results": 7,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 34.40546989440918,
"stage1_fallback_search_ms": 231.8587303161621,
"stage2_expand_ms": 2068.8445568084717,
"stage3_cluster_ms": 0.0007152557373046875,
"stage4_rerank_ms": 1850.6083488464355
},
"stage_counts": {
"stage1_candidates": 4,
"stage1_fallback_used": 1,
"stage2_expanded": 9,
"stage2_unique_paths": 7,
"stage2_duplicate_paths": 2,
"stage3_clustered": 9,
"stage3_strategy": "path",
"stage4_reranked": 9
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "how to parse json",
"latency_ms": 2486.594600021839,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.21428571428571427,
"rbo_topk": 0.18590219827714285,
"staged_unique_files_topk": 7,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 5,
"dense_unique_dirs_topk": 4
}
]
}

View File

@@ -1,465 +0,0 @@
{
"summary": {
"timestamp": "2026-02-10 12:44:24",
"source": "src",
"k": 10,
"coarse_k": 100,
"query_count": 7,
"avg_jaccard_topk": 0.12384302205730777,
"avg_rbo_topk": 0.09816673566816325,
"staged": {
"success": 7,
"avg_latency_ms": 4603.035771421024
},
"dense_rerank": {
"success": 7,
"avg_latency_ms": 2776.139728575945
}
},
"comparisons": [
{
"query": "class Config",
"staged": {
"strategy": "staged",
"query": "class Config",
"latency_ms": 3544.4309000074863,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 34.082651138305664,
"stage1_fallback_search_ms": 217.52095222473145,
"stage2_expand_ms": 18.847942352294922,
"stage3_cluster_ms": 0.031948089599609375,
"stage4_rerank_ms": 3176.4564514160156
},
"stage_counts": {
"stage1_candidates": 37,
"stage1_fallback_used": 1,
"stage2_expanded": 86,
"stage2_unique_paths": 53,
"stage2_duplicate_paths": 33,
"stage3_clustered": 20,
"stage3_strategy": "score",
"stage4_reranked": 20
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "class Config",
"latency_ms": 3075.5329999923706,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.125,
"rbo_topk": 0.06741929885142856,
"staged_unique_files_topk": 8,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 5,
"dense_unique_dirs_topk": 4
},
{
"query": "def search",
"staged": {
"strategy": "staged",
"query": "def search",
"latency_ms": 4371.493600010872,
"num_results": 3,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 29.517173767089844,
"stage2_expand_ms": 2236.224412918091,
"stage3_cluster_ms": 0.0007152557373046875,
"stage4_rerank_ms": 1998.866319656372
},
"stage_counts": {
"stage1_candidates": 3,
"stage2_expanded": 4,
"stage2_unique_paths": 3,
"stage2_duplicate_paths": 1,
"stage3_clustered": 4,
"stage3_strategy": "score",
"stage4_reranked": 4
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "def search",
"latency_ms": 2334.758200019598,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.09090909090909091,
"rbo_topk": 0.23541639942571424,
"staged_unique_files_topk": 2,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 2,
"dense_unique_dirs_topk": 4
},
{
"query": "LspBridge",
"staged": {
"strategy": "staged",
"query": "LspBridge",
"latency_ms": 4143.470999985933,
"num_results": 6,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\keepalive_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 20.66636085510254,
"stage1_fallback_search_ms": 150.6054401397705,
"stage2_expand_ms": 2064.2361640930176,
"stage3_cluster_ms": 0.012159347534179688,
"stage4_rerank_ms": 1838.1483554840088
},
"stage_counts": {
"stage1_candidates": 5,
"stage1_fallback_used": 1,
"stage2_expanded": 31,
"stage2_unique_paths": 11,
"stage2_duplicate_paths": 20,
"stage3_clustered": 20,
"stage3_strategy": "score",
"stage4_reranked": 20
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "LspBridge",
"latency_ms": 2207.86700001359,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.06666666666666667,
"rbo_topk": 0.045191399425714276,
"staged_unique_files_topk": 6,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 2,
"dense_unique_dirs_topk": 4
},
{
"query": "graph expansion",
"staged": {
"strategy": "staged",
"query": "graph expansion",
"latency_ms": 4234.638899981976,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 21.48127555847168,
"stage1_fallback_search_ms": 153.59735488891602,
"stage2_expand_ms": 2092.521905899048,
"stage3_cluster_ms": 0.00095367431640625,
"stage4_rerank_ms": 1876.7595291137695
},
"stage_counts": {
"stage1_candidates": 11,
"stage1_fallback_used": 1,
"stage2_expanded": 16,
"stage2_unique_paths": 13,
"stage2_duplicate_paths": 3,
"stage3_clustered": 16,
"stage3_strategy": "score",
"stage4_reranked": 16
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "graph expansion",
"latency_ms": 2646.9266000390053,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.1875,
"rbo_topk": 0.06134116970571428,
"staged_unique_files_topk": 9,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 7,
"dense_unique_dirs_topk": 4
},
{
"query": "clustering strategy",
"staged": {
"strategy": "staged",
"query": "clustering strategy",
"latency_ms": 4778.165899991989,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\__init__.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 18.590688705444336,
"stage1_fallback_search_ms": 195.90282440185547,
"stage2_expand_ms": 2053.685426712036,
"stage3_cluster_ms": 0.00095367431640625,
"stage4_rerank_ms": 2431.095838546753
},
"stage_counts": {
"stage1_candidates": 10,
"stage1_fallback_used": 1,
"stage2_expanded": 10,
"stage2_unique_paths": 10,
"stage2_duplicate_paths": 0,
"stage3_clustered": 10,
"stage3_strategy": "score",
"stage4_reranked": 10
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "clustering strategy",
"latency_ms": 2887.1304000020027,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.1111111111111111,
"rbo_topk": 0.04670528456571428,
"staged_unique_files_topk": 10,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 3,
"dense_unique_dirs_topk": 4
},
{
"query": "error handling",
"staged": {
"strategy": "staged",
"query": "error handling",
"latency_ms": 5823.889799982309,
"num_results": 6,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 109.02619361877441,
"stage1_fallback_search_ms": 196.54059410095215,
"stage2_expand_ms": 2088.4640216827393,
"stage3_cluster_ms": 0.00095367431640625,
"stage4_rerank_ms": 3328.0465602874756
},
"stage_counts": {
"stage1_candidates": 5,
"stage1_fallback_used": 1,
"stage2_expanded": 13,
"stage2_unique_paths": 6,
"stage2_duplicate_paths": 7,
"stage3_clustered": 13,
"stage3_strategy": "score",
"stage4_reranked": 13
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "error handling",
"latency_ms": 3351.872999995947,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.07142857142857142,
"rbo_topk": 0.045191399425714276,
"staged_unique_files_topk": 5,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 4,
"dense_unique_dirs_topk": 4
},
{
"query": "how to parse json",
"staged": {
"strategy": "staged",
"query": "how to parse json",
"latency_ms": 5325.160299986601,
"num_results": 7,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 216.71128273010254,
"stage1_fallback_search_ms": 295.27878761291504,
"stage2_expand_ms": 2091.4883613586426,
"stage3_cluster_ms": 0.001430511474609375,
"stage4_rerank_ms": 2606.9161891937256
},
"stage_counts": {
"stage1_candidates": 4,
"stage1_fallback_used": 1,
"stage2_expanded": 9,
"stage2_unique_paths": 7,
"stage2_duplicate_paths": 2,
"stage3_clustered": 9,
"stage3_strategy": "score",
"stage4_reranked": 9
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "how to parse json",
"latency_ms": 2928.889899969101,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.21428571428571427,
"rbo_topk": 0.18590219827714285,
"staged_unique_files_topk": 7,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 5,
"dense_unique_dirs_topk": 4
}
]
}

View File

@@ -1,467 +0,0 @@
{
"summary": {
"timestamp": "2026-02-11 15:16:08",
"source": "codex-lens\\src",
"k": 10,
"coarse_k": 100,
"query_count": 7,
"avg_jaccard_topk": 0.11350467619264612,
"avg_rbo_topk": 0.09062624799510204,
"staged": {
"success": 7,
"avg_latency_ms": 4507.475014303412
},
"dense_rerank": {
"success": 7,
"avg_latency_ms": 2537.8563000304357
}
},
"comparisons": [
{
"query": "class Config",
"staged": {
"strategy": "staged",
"query": "class Config",
"latency_ms": 2474.800100028515,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\file_watcher.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 91.76826477050781,
"stage1_fallback_search_ms": 162.45269775390625,
"stage2_expand_ms": 14.957904815673828,
"stage3_cluster_ms": 0.8461475372314453,
"stage4_rerank_ms": 2129.7342777252197
},
"stage_counts": {
"stage1_candidates": 37,
"stage1_fallback_used": 1,
"stage2_expanded": 86,
"stage2_unique_paths": 53,
"stage2_duplicate_paths": 33,
"stage3_clustered": 20,
"stage3_strategy": "dir_rr",
"stage4_reranked": 20
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "class Config",
"latency_ms": 2425.3046000003815,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.05263157894736842,
"rbo_topk": 0.014635885139999999,
"staged_unique_files_topk": 10,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 8,
"dense_unique_dirs_topk": 4
},
{
"query": "def search",
"staged": {
"strategy": "staged",
"query": "def search",
"latency_ms": 5389.070900022984,
"num_results": 3,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 63.6446475982666,
"stage2_expand_ms": 3202.108144760132,
"stage3_cluster_ms": 0.0007152557373046875,
"stage4_rerank_ms": 2011.8708610534668
},
"stage_counts": {
"stage1_candidates": 3,
"stage2_expanded": 4,
"stage2_unique_paths": 3,
"stage2_duplicate_paths": 1,
"stage3_clustered": 4,
"stage3_strategy": "dir_rr",
"stage4_reranked": 4
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "def search",
"latency_ms": 2465.9148000478745,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.09090909090909091,
"rbo_topk": 0.23541639942571424,
"staged_unique_files_topk": 2,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 2,
"dense_unique_dirs_topk": 4
},
{
"query": "LspBridge",
"staged": {
"strategy": "staged",
"query": "LspBridge",
"latency_ms": 4989.407700002193,
"num_results": 7,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\keepalive_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 88.54341506958008,
"stage1_fallback_search_ms": 125.9164810180664,
"stage2_expand_ms": 2063.6398792266846,
"stage3_cluster_ms": 0.3476142883300781,
"stage4_rerank_ms": 2633.7506771087646
},
"stage_counts": {
"stage1_candidates": 5,
"stage1_fallback_used": 1,
"stage2_expanded": 31,
"stage2_unique_paths": 11,
"stage2_duplicate_paths": 20,
"stage3_clustered": 20,
"stage3_strategy": "dir_rr",
"stage4_reranked": 20
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "LspBridge",
"latency_ms": 2424.8579000234604,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.06666666666666667,
"rbo_topk": 0.045191399425714276,
"staged_unique_files_topk": 6,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 2,
"dense_unique_dirs_topk": 4
},
{
"query": "graph expansion",
"staged": {
"strategy": "staged",
"query": "graph expansion",
"latency_ms": 4771.1614000201225,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 61.426401138305664,
"stage1_fallback_search_ms": 152.01711654663086,
"stage2_expand_ms": 2078.4833431243896,
"stage3_cluster_ms": 0.0007152557373046875,
"stage4_rerank_ms": 2376.2998580932617
},
"stage_counts": {
"stage1_candidates": 11,
"stage1_fallback_used": 1,
"stage2_expanded": 16,
"stage2_unique_paths": 13,
"stage2_duplicate_paths": 3,
"stage3_clustered": 16,
"stage3_strategy": "dir_rr",
"stage4_reranked": 16
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "graph expansion",
"latency_ms": 2418.981700003147,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.1875,
"rbo_topk": 0.06134116970571428,
"staged_unique_files_topk": 9,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 7,
"dense_unique_dirs_topk": 4
},
{
"query": "clustering strategy",
"staged": {
"strategy": "staged",
"query": "clustering strategy",
"latency_ms": 4559.269900023937,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\__init__.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 60.93573570251465,
"stage1_fallback_search_ms": 141.4163112640381,
"stage2_expand_ms": 2032.2721004486084,
"stage3_cluster_ms": 0.0007152557373046875,
"stage4_rerank_ms": 2217.2317504882812
},
"stage_counts": {
"stage1_candidates": 10,
"stage1_fallback_used": 1,
"stage2_expanded": 10,
"stage2_unique_paths": 10,
"stage2_duplicate_paths": 0,
"stage3_clustered": 10,
"stage3_strategy": "dir_rr",
"stage4_reranked": 10
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "clustering strategy",
"latency_ms": 2443.3700000047684,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.1111111111111111,
"rbo_topk": 0.04670528456571428,
"staged_unique_files_topk": 10,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 3,
"dense_unique_dirs_topk": 4
},
{
"query": "error handling",
"staged": {
"strategy": "staged",
"query": "error handling",
"latency_ms": 4757.269500017166,
"num_results": 6,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 89.56503868103027,
"stage1_fallback_search_ms": 143.58854293823242,
"stage2_expand_ms": 2119.623899459839,
"stage3_cluster_ms": 0.0007152557373046875,
"stage4_rerank_ms": 2303.9650917053223
},
"stage_counts": {
"stage1_candidates": 5,
"stage1_fallback_used": 1,
"stage2_expanded": 13,
"stage2_unique_paths": 6,
"stage2_duplicate_paths": 7,
"stage3_clustered": 13,
"stage3_strategy": "dir_rr",
"stage4_reranked": 13
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "error handling",
"latency_ms": 2431.0521000623703,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.07142857142857142,
"rbo_topk": 0.045191399425714276,
"staged_unique_files_topk": 5,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 4,
"dense_unique_dirs_topk": 4
},
{
"query": "how to parse json",
"staged": {
"strategy": "staged",
"query": "how to parse json",
"latency_ms": 4611.3456000089645,
"num_results": 7,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 74.86128807067871,
"stage1_fallback_search_ms": 137.465238571167,
"stage2_expand_ms": 2086.426019668579,
"stage3_cluster_ms": 0.00095367431640625,
"stage4_rerank_ms": 2218.2157039642334
},
"stage_counts": {
"stage1_candidates": 4,
"stage1_fallback_used": 1,
"stage2_expanded": 9,
"stage2_unique_paths": 7,
"stage2_duplicate_paths": 2,
"stage3_clustered": 9,
"stage3_strategy": "dir_rr",
"stage4_reranked": 9
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "how to parse json",
"latency_ms": 3155.5130000710487,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.21428571428571427,
"rbo_topk": 0.18590219827714285,
"staged_unique_files_topk": 7,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 5,
"dense_unique_dirs_topk": 4
}
]
}

View File

@@ -1,465 +0,0 @@
{
"summary": {
"timestamp": "2026-02-11 15:12:41",
"source": "codex-lens\\src",
"k": 10,
"coarse_k": 100,
"query_count": 7,
"avg_jaccard_topk": 0.13455730777159347,
"avg_rbo_topk": 0.10274807844326529,
"staged": {
"success": 7,
"avg_latency_ms": 4532.43382857527
},
"dense_rerank": {
"success": 7,
"avg_latency_ms": 2712.3431142909185
}
},
"comparisons": [
{
"query": "class Config",
"staged": {
"strategy": "staged",
"query": "class Config",
"latency_ms": 2704.6869000196457,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 56.32758140563965,
"stage1_fallback_search_ms": 156.8472385406494,
"stage2_expand_ms": 15.436887741088867,
"stage3_cluster_ms": 0.04291534423828125,
"stage4_rerank_ms": 2388.756513595581
},
"stage_counts": {
"stage1_candidates": 37,
"stage1_fallback_used": 1,
"stage2_expanded": 86,
"stage2_unique_paths": 53,
"stage2_duplicate_paths": 33,
"stage3_clustered": 20,
"stage3_strategy": "path",
"stage4_reranked": 20
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "class Config",
"latency_ms": 3257.856599986553,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.2,
"rbo_topk": 0.09948869827714285,
"staged_unique_files_topk": 8,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 4,
"dense_unique_dirs_topk": 4
},
{
"query": "def search",
"staged": {
"strategy": "staged",
"query": "def search",
"latency_ms": 4347.2081000208855,
"num_results": 3,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 65.37723541259766,
"stage2_expand_ms": 2145.587682723999,
"stage3_cluster_ms": 0.00095367431640625,
"stage4_rerank_ms": 2052.9236793518066
},
"stage_counts": {
"stage1_candidates": 3,
"stage2_expanded": 4,
"stage2_unique_paths": 3,
"stage2_duplicate_paths": 1,
"stage3_clustered": 4,
"stage3_strategy": "path",
"stage4_reranked": 4
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "def search",
"latency_ms": 2642.404200077057,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.09090909090909091,
"rbo_topk": 0.23541639942571424,
"staged_unique_files_topk": 2,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 2,
"dense_unique_dirs_topk": 4
},
{
"query": "LspBridge",
"staged": {
"strategy": "staged",
"query": "LspBridge",
"latency_ms": 4627.254400074482,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\keepalive_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 96.67634963989258,
"stage1_fallback_search_ms": 162.25123405456543,
"stage2_expand_ms": 2071.5224742889404,
"stage3_cluster_ms": 0.018835067749023438,
"stage4_rerank_ms": 2211.8191719055176
},
"stage_counts": {
"stage1_candidates": 5,
"stage1_fallback_used": 1,
"stage2_expanded": 31,
"stage2_unique_paths": 11,
"stage2_duplicate_paths": 20,
"stage3_clustered": 11,
"stage3_strategy": "path",
"stage4_reranked": 11
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "LspBridge",
"latency_ms": 2479.5284999608994,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.06666666666666667,
"rbo_topk": 0.045191399425714276,
"staged_unique_files_topk": 6,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 2,
"dense_unique_dirs_topk": 4
},
{
"query": "graph expansion",
"staged": {
"strategy": "staged",
"query": "graph expansion",
"latency_ms": 4663.639899969101,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 82.36384391784668,
"stage1_fallback_search_ms": 158.2353115081787,
"stage2_expand_ms": 2087.8846645355225,
"stage3_cluster_ms": 0.00095367431640625,
"stage4_rerank_ms": 2249.4378089904785
},
"stage_counts": {
"stage1_candidates": 11,
"stage1_fallback_used": 1,
"stage2_expanded": 16,
"stage2_unique_paths": 13,
"stage2_duplicate_paths": 3,
"stage3_clustered": 16,
"stage3_strategy": "path",
"stage4_reranked": 16
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "graph expansion",
"latency_ms": 2455.024599969387,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.1875,
"rbo_topk": 0.06134116970571428,
"staged_unique_files_topk": 9,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 7,
"dense_unique_dirs_topk": 4
},
{
"query": "clustering strategy",
"staged": {
"strategy": "staged",
"query": "clustering strategy",
"latency_ms": 6402.90189999342,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\__init__.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 44.295310974121094,
"stage1_fallback_search_ms": 127.30145454406738,
"stage2_expand_ms": 2030.930995941162,
"stage3_cluster_ms": 0.00095367431640625,
"stage4_rerank_ms": 4132.822036743164
},
"stage_counts": {
"stage1_candidates": 10,
"stage1_fallback_used": 1,
"stage2_expanded": 10,
"stage2_unique_paths": 10,
"stage2_duplicate_paths": 0,
"stage3_clustered": 10,
"stage3_strategy": "path",
"stage4_reranked": 10
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "clustering strategy",
"latency_ms": 3286.4142000079155,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.1111111111111111,
"rbo_topk": 0.04670528456571428,
"staged_unique_files_topk": 10,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 3,
"dense_unique_dirs_topk": 4
},
{
"query": "error handling",
"staged": {
"strategy": "staged",
"query": "error handling",
"latency_ms": 4532.2757999897,
"num_results": 6,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 85.02960205078125,
"stage1_fallback_search_ms": 146.46339416503906,
"stage2_expand_ms": 2071.5532302856445,
"stage3_cluster_ms": 0.00095367431640625,
"stage4_rerank_ms": 2140.7644748687744
},
"stage_counts": {
"stage1_candidates": 5,
"stage1_fallback_used": 1,
"stage2_expanded": 13,
"stage2_unique_paths": 6,
"stage2_duplicate_paths": 7,
"stage3_clustered": 13,
"stage3_strategy": "path",
"stage4_reranked": 13
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "error handling",
"latency_ms": 2349.7827999591827,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.07142857142857142,
"rbo_topk": 0.045191399425714276,
"staged_unique_files_topk": 5,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 4,
"dense_unique_dirs_topk": 4
},
{
"query": "how to parse json",
"staged": {
"strategy": "staged",
"query": "how to parse json",
"latency_ms": 4449.06979995966,
"num_results": 7,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 67.15631484985352,
"stage1_fallback_search_ms": 148.30541610717773,
"stage2_expand_ms": 2069.3678855895996,
"stage3_cluster_ms": 0.0007152557373046875,
"stage4_rerank_ms": 2097.882032394409
},
"stage_counts": {
"stage1_candidates": 4,
"stage1_fallback_used": 1,
"stage2_expanded": 9,
"stage2_unique_paths": 7,
"stage2_duplicate_paths": 2,
"stage3_clustered": 9,
"stage3_strategy": "path",
"stage4_reranked": 9
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "how to parse json",
"latency_ms": 2515.3909000754356,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.21428571428571427,
"rbo_topk": 0.18590219827714285,
"staged_unique_files_topk": 7,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 5,
"dense_unique_dirs_topk": 4
}
]
}

View File

@@ -1,465 +0,0 @@
{
"summary": {
"timestamp": "2026-02-11 15:14:25",
"source": "codex-lens\\src",
"k": 10,
"coarse_k": 100,
"query_count": 7,
"avg_jaccard_topk": 0.12384302205730777,
"avg_rbo_topk": 0.09816673566816325,
"staged": {
"success": 7,
"avg_latency_ms": 4538.7477714674815
},
"dense_rerank": {
"success": 7,
"avg_latency_ms": 2568.1517999768257
}
},
"comparisons": [
{
"query": "class Config",
"staged": {
"strategy": "staged",
"query": "class Config",
"latency_ms": 2546.395000040531,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 70.5413818359375,
"stage1_fallback_search_ms": 165.39907455444336,
"stage2_expand_ms": 15.58542251586914,
"stage3_cluster_ms": 0.020265579223632812,
"stage4_rerank_ms": 2209.89727973938
},
"stage_counts": {
"stage1_candidates": 37,
"stage1_fallback_used": 1,
"stage2_expanded": 86,
"stage2_unique_paths": 53,
"stage2_duplicate_paths": 33,
"stage3_clustered": 20,
"stage3_strategy": "score",
"stage4_reranked": 20
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "class Config",
"latency_ms": 2610.328099966049,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.125,
"rbo_topk": 0.06741929885142856,
"staged_unique_files_topk": 8,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 5,
"dense_unique_dirs_topk": 4
},
{
"query": "def search",
"staged": {
"strategy": "staged",
"query": "def search",
"latency_ms": 4569.872200012207,
"num_results": 3,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\entities.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 96.31776809692383,
"stage2_expand_ms": 2299.86310005188,
"stage3_cluster_ms": 0.0007152557373046875,
"stage4_rerank_ms": 2094.2182540893555
},
"stage_counts": {
"stage1_candidates": 3,
"stage2_expanded": 4,
"stage2_unique_paths": 3,
"stage2_duplicate_paths": 1,
"stage3_clustered": 4,
"stage3_strategy": "score",
"stage4_reranked": 4
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "def search",
"latency_ms": 2509.9732999801636,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.09090909090909091,
"rbo_topk": 0.23541639942571424,
"staged_unique_files_topk": 2,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 2,
"dense_unique_dirs_topk": 4
},
{
"query": "LspBridge",
"staged": {
"strategy": "staged",
"query": "LspBridge",
"latency_ms": 5064.990800082684,
"num_results": 6,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\keepalive_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\__init__.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 86.1806869506836,
"stage1_fallback_search_ms": 150.21824836730957,
"stage2_expand_ms": 2080.6803703308105,
"stage3_cluster_ms": 0.011682510375976562,
"stage4_rerank_ms": 2663.7954711914062
},
"stage_counts": {
"stage1_candidates": 5,
"stage1_fallback_used": 1,
"stage2_expanded": 31,
"stage2_unique_paths": 11,
"stage2_duplicate_paths": 20,
"stage3_clustered": 20,
"stage3_strategy": "score",
"stage4_reranked": 20
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "LspBridge",
"latency_ms": 2778.6906000375748,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.06666666666666667,
"rbo_topk": 0.045191399425714276,
"staged_unique_files_topk": 6,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 2,
"dense_unique_dirs_topk": 4
},
{
"query": "graph expansion",
"staged": {
"strategy": "staged",
"query": "graph expansion",
"latency_ms": 4816.586899995804,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_007_add_graph_neighbors.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_graph_builder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 79.48184013366699,
"stage1_fallback_search_ms": 158.03027153015137,
"stage2_expand_ms": 2087.271213531494,
"stage3_cluster_ms": 0.0007152557373046875,
"stage4_rerank_ms": 2410.567283630371
},
"stage_counts": {
"stage1_candidates": 11,
"stage1_fallback_used": 1,
"stage2_expanded": 16,
"stage2_unique_paths": 13,
"stage2_duplicate_paths": 3,
"stage3_clustered": 16,
"stage3_strategy": "score",
"stage4_reranked": 16
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "graph expansion",
"latency_ms": 2692.1504999399185,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.1875,
"rbo_topk": 0.06134116970571428,
"staged_unique_files_topk": 9,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 7,
"dense_unique_dirs_topk": 4
},
{
"query": "clustering strategy",
"staged": {
"strategy": "staged",
"query": "clustering strategy",
"latency_ms": 4494.9805000424385,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\config.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\__init__.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 40.569305419921875,
"stage1_fallback_search_ms": 141.06035232543945,
"stage2_expand_ms": 2043.9364910125732,
"stage3_cluster_ms": 0.00095367431640625,
"stage4_rerank_ms": 2198.4200477600098
},
"stage_counts": {
"stage1_candidates": 10,
"stage1_fallback_used": 1,
"stage2_expanded": 10,
"stage2_unique_paths": 10,
"stage2_duplicate_paths": 0,
"stage3_clustered": 10,
"stage3_strategy": "score",
"stage4_reranked": 10
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "clustering strategy",
"latency_ms": 2474.2726999521255,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.1111111111111111,
"rbo_topk": 0.04670528456571428,
"staged_unique_files_topk": 10,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 3,
"dense_unique_dirs_topk": 4
},
{
"query": "error handling",
"staged": {
"strategy": "staged",
"query": "error handling",
"latency_ms": 5652.523400068283,
"num_results": 6,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\gpu_support.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 87.34393119812012,
"stage1_fallback_search_ms": 149.7325897216797,
"stage2_expand_ms": 2072.728157043457,
"stage3_cluster_ms": 0.00095367431640625,
"stage4_rerank_ms": 3190.687894821167
},
"stage_counts": {
"stage1_candidates": 5,
"stage1_fallback_used": 1,
"stage2_expanded": 13,
"stage2_unique_paths": 6,
"stage2_duplicate_paths": 7,
"stage3_clustered": 13,
"stage3_strategy": "score",
"stage4_reranked": 13
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "error handling",
"latency_ms": 2481.709800004959,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.07142857142857142,
"rbo_topk": 0.045191399425714276,
"staged_unique_files_topk": 5,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 4,
"dense_unique_dirs_topk": 4
},
{
"query": "how to parse json",
"staged": {
"strategy": "staged",
"query": "how to parse json",
"latency_ms": 4625.885600030422,
"num_results": 7,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 92.83590316772461,
"stage1_fallback_search_ms": 147.12858200073242,
"stage2_expand_ms": 2061.2568855285645,
"stage3_cluster_ms": 0.0011920928955078125,
"stage4_rerank_ms": 2246.800184249878
},
"stage_counts": {
"stage1_candidates": 4,
"stage1_fallback_used": 1,
"stage2_expanded": 9,
"stage2_unique_paths": 7,
"stage2_duplicate_paths": 2,
"stage3_clustered": 9,
"stage3_strategy": "score",
"stage4_reranked": 9
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "how to parse json",
"latency_ms": 2429.9375999569893,
"num_results": 10,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.21428571428571427,
"rbo_topk": 0.18590219827714285,
"staged_unique_files_topk": 7,
"dense_unique_files_topk": 10,
"staged_unique_dirs_topk": 5,
"dense_unique_dirs_topk": 4
}
]
}

View File

@@ -1,406 +0,0 @@
{
"storage_analysis": {
"tables": {
"code_relationships": {
"row_count": 0,
"columns": [
"id",
"source_symbol_id",
"target_qualified_name",
"relationship_type",
"source_line",
"target_file"
]
},
"embeddings_config": {
"row_count": 1,
"columns": [
"id",
"model_profile",
"model_name",
"embedding_dim",
"backend",
"created_at",
"updated_at"
]
},
"file_keywords": {
"row_count": 0,
"columns": [
"file_id",
"keyword_id"
]
},
"files": {
"row_count": 0,
"columns": [
"id",
"name",
"full_path",
"language",
"content",
"mtime",
"line_count"
]
},
"files_fts_exact": {
"row_count": 0,
"columns": [
"name",
"full_path",
"content"
]
},
"files_fts_exact_config": {
"row_count": 1,
"columns": [
"k",
"v"
]
},
"files_fts_exact_data": {
"row_count": 2,
"columns": [
"id",
"block"
]
},
"files_fts_exact_docsize": {
"row_count": 0,
"columns": [
"id",
"sz"
]
},
"files_fts_exact_idx": {
"row_count": 0,
"columns": [
"segid",
"term",
"pgno"
]
},
"files_fts_fuzzy": {
"row_count": 0,
"columns": [
"name",
"full_path",
"content"
]
},
"files_fts_fuzzy_config": {
"row_count": 1,
"columns": [
"k",
"v"
]
},
"files_fts_fuzzy_data": {
"row_count": 2,
"columns": [
"id",
"block"
]
},
"files_fts_fuzzy_docsize": {
"row_count": 0,
"columns": [
"id",
"sz"
]
},
"files_fts_fuzzy_idx": {
"row_count": 0,
"columns": [
"segid",
"term",
"pgno"
]
},
"graph_neighbors": {
"row_count": 0,
"columns": [
"source_symbol_id",
"neighbor_symbol_id",
"relationship_depth"
]
},
"keywords": {
"row_count": 0,
"columns": [
"id",
"keyword"
]
},
"merkle_hashes": {
"row_count": 0,
"columns": [
"file_id",
"sha256",
"updated_at"
]
},
"merkle_state": {
"row_count": 1,
"columns": [
"id",
"root_hash",
"updated_at"
]
},
"semantic_chunks": {
"row_count": 0,
"columns": [
"id",
"file_path",
"content",
"embedding",
"metadata",
"created_at",
"embedding_binary",
"embedding_dense"
]
},
"semantic_metadata": {
"row_count": 0,
"columns": [
"id",
"file_id",
"summary",
"purpose",
"llm_tool",
"generated_at"
]
},
"sqlite_sequence": {
"row_count": 0,
"columns": [
"name",
"seq"
]
},
"subdirs": {
"row_count": 2,
"columns": [
"id",
"name",
"index_path",
"files_count",
"last_updated"
]
},
"symbols": {
"row_count": 0,
"columns": [
"id",
"file_id",
"name",
"kind",
"start_line",
"end_line"
]
}
},
"conflicts": [],
"recommendations": [
"Found 10 FTS tables: ['files_fts_exact', 'files_fts_exact_config', 'files_fts_exact_data', 'files_fts_exact_docsize', 'files_fts_exact_idx', 'files_fts_fuzzy', 'files_fts_fuzzy_config', 'files_fts_fuzzy_data', 'files_fts_fuzzy_docsize', 'files_fts_fuzzy_idx']. Dual FTS (exact + fuzzy) is properly configured."
]
},
"contribution_analysis": {
"per_query": [
{
"query": "binary quantization",
"methods": {
"fts_exact": {
"error": "'obj' object has no attribute 'symbol_boost_factor'",
"count": 0
},
"fts_fuzzy": {
"error": "'obj' object has no attribute 'symbol_boost_factor'",
"count": 0
},
"vector": {
"error": "'obj' object has no attribute 'symbol_boost_factor'",
"count": 0
},
"splade": {
"error": "'obj' object has no attribute 'symbol_boost_factor'",
"count": 0
}
},
"fusion_analysis": {},
"overlaps": {}
},
{
"query": "hamming distance search",
"methods": {
"fts_exact": {
"error": "'obj' object has no attribute 'symbol_boost_factor'",
"count": 0
},
"fts_fuzzy": {
"error": "'obj' object has no attribute 'symbol_boost_factor'",
"count": 0
},
"vector": {
"error": "'obj' object has no attribute 'symbol_boost_factor'",
"count": 0
},
"splade": {
"error": "'obj' object has no attribute 'symbol_boost_factor'",
"count": 0
}
},
"fusion_analysis": {},
"overlaps": {}
},
{
"query": "embeddings generation",
"methods": {
"fts_exact": {
"error": "'obj' object has no attribute 'symbol_boost_factor'",
"count": 0
},
"fts_fuzzy": {
"error": "'obj' object has no attribute 'symbol_boost_factor'",
"count": 0
},
"vector": {
"error": "'obj' object has no attribute 'symbol_boost_factor'",
"count": 0
},
"splade": {
"error": "'obj' object has no attribute 'symbol_boost_factor'",
"count": 0
}
},
"fusion_analysis": {},
"overlaps": {}
},
{
"query": "reranking algorithm",
"methods": {
"fts_exact": {
"error": "'obj' object has no attribute 'symbol_boost_factor'",
"count": 0
},
"fts_fuzzy": {
"error": "'obj' object has no attribute 'symbol_boost_factor'",
"count": 0
},
"vector": {
"error": "'obj' object has no attribute 'symbol_boost_factor'",
"count": 0
},
"splade": {
"error": "'obj' object has no attribute 'symbol_boost_factor'",
"count": 0
}
},
"fusion_analysis": {},
"overlaps": {}
},
{
"query": "database connection handling",
"methods": {
"fts_exact": {
"error": "'obj' object has no attribute 'symbol_boost_factor'",
"count": 0
},
"fts_fuzzy": {
"error": "'obj' object has no attribute 'symbol_boost_factor'",
"count": 0
},
"vector": {
"error": "'obj' object has no attribute 'symbol_boost_factor'",
"count": 0
},
"splade": {
"error": "'obj' object has no attribute 'symbol_boost_factor'",
"count": 0
}
},
"fusion_analysis": {},
"overlaps": {}
}
],
"summary": {
"fts_exact": {
"avg_count": 0.0,
"avg_latency_ms": 0
},
"fts_fuzzy": {
"avg_count": 0.0,
"avg_latency_ms": 0
},
"vector": {
"avg_count": 0.0,
"avg_latency_ms": 0
},
"splade": {
"avg_count": 0.0,
"avg_latency_ms": 0
}
}
},
"fusion_experiment": {
"per_query": [
{
"query": "binary quantization",
"strategies": {
"standard_hybrid": {
"error": "'obj' object has no attribute 'symbol_boost_factor'"
},
"fts_rerank_fusion": {
"error": "'obj' object has no attribute 'symbol_boost_factor'"
}
}
},
{
"query": "hamming distance search",
"strategies": {
"standard_hybrid": {
"error": "'obj' object has no attribute 'symbol_boost_factor'"
},
"fts_rerank_fusion": {
"error": "'obj' object has no attribute 'symbol_boost_factor'"
}
}
},
{
"query": "embeddings generation",
"strategies": {
"standard_hybrid": {
"error": "'obj' object has no attribute 'symbol_boost_factor'"
},
"fts_rerank_fusion": {
"error": "'obj' object has no attribute 'symbol_boost_factor'"
}
}
},
{
"query": "reranking algorithm",
"strategies": {
"standard_hybrid": {
"error": "'obj' object has no attribute 'symbol_boost_factor'"
},
"fts_rerank_fusion": {
"error": "'obj' object has no attribute 'symbol_boost_factor'"
}
}
},
{
"query": "database connection handling",
"strategies": {
"standard_hybrid": {
"error": "'obj' object has no attribute 'symbol_boost_factor'"
},
"fts_rerank_fusion": {
"error": "'obj' object has no attribute 'symbol_boost_factor'"
}
}
}
],
"summary": {}
}
}

View File

@@ -1,73 +0,0 @@
{
"summary": {
"timestamp": "2026-02-08 23:48:26",
"source": "src",
"k": 5,
"coarse_k": 50,
"query_count": 1,
"avg_jaccard_topk": 0.0,
"avg_rbo_topk": 0.0,
"staged": {
"success": 1,
"avg_latency_ms": 30093.97499999404
},
"dense_rerank": {
"success": 1,
"avg_latency_ms": 331.4424999952316
}
},
"comparisons": [
{
"query": "class Config",
"staged": {
"strategy": "staged",
"query": "class Config",
"latency_ms": 30093.97499999404,
"num_results": 5,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\__init__.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 6421.706914901733,
"stage2_expand_ms": 17591.988563537598,
"stage3_cluster_ms": 3700.4549503326416,
"stage4_rerank_ms": 2340.064525604248
},
"stage_counts": {
"stage1_candidates": 50,
"stage2_expanded": 99,
"stage3_clustered": 10,
"stage4_reranked": 10
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "class Config",
"latency_ms": 331.4424999952316,
"num_results": 5,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\splade_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.0,
"rbo_topk": 0.0,
"staged_unique_files_topk": 5,
"dense_unique_files_topk": 5,
"staged_unique_dirs_topk": 4,
"dense_unique_dirs_topk": 1
}
]
}

View File

@@ -1,177 +0,0 @@
{
"summary": {
"timestamp": "2026-02-08 23:58:56",
"source": "src",
"k": 5,
"coarse_k": 50,
"query_count": 3,
"avg_jaccard_topk": 0.11574074074074074,
"avg_rbo_topk": 0.14601366666666662,
"staged": {
"success": 3,
"avg_latency_ms": 27868.044033328693
},
"dense_rerank": {
"success": 3,
"avg_latency_ms": 1339.25289999942
}
},
"comparisons": [
{
"query": "class Config",
"staged": {
"strategy": "staged",
"query": "class Config",
"latency_ms": 33643.06179998815,
"num_results": 5,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\incremental_indexer.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 6201.4524936676025,
"stage2_expand_ms": 17306.61702156067,
"stage3_cluster_ms": 6829.557418823242,
"stage4_rerank_ms": 3267.071485519409
},
"stage_counts": {
"stage1_candidates": 50,
"stage2_expanded": 99,
"stage3_clustered": 10,
"stage4_reranked": 10
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "class Config",
"latency_ms": 1520.9955999851227,
"num_results": 5,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\splade_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.1111111111111111,
"rbo_topk": 0.031347,
"staged_unique_files_topk": 5,
"dense_unique_files_topk": 5,
"staged_unique_dirs_topk": 5,
"dense_unique_dirs_topk": 1
},
{
"query": "def search",
"staged": {
"strategy": "staged",
"query": "def search",
"latency_ms": 26400.58900000155,
"num_results": 5,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 404.60920333862305,
"stage2_expand_ms": 20036.258697509766,
"stage3_cluster_ms": 4919.439315795898,
"stage4_rerank_ms": 1001.8632411956787
},
"stage_counts": {
"stage1_candidates": 50,
"stage2_expanded": 51,
"stage3_clustered": 10,
"stage4_reranked": 10
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "def search",
"latency_ms": 1264.3862999975681,
"num_results": 5,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.125,
"rbo_topk": 0.20334699999999994,
"staged_unique_files_topk": 4,
"dense_unique_files_topk": 5,
"staged_unique_dirs_topk": 3,
"dense_unique_dirs_topk": 2
},
{
"query": "LspBridge",
"staged": {
"strategy": "staged",
"query": "LspBridge",
"latency_ms": 23560.481299996376,
"num_results": 5,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 385.28990745544434,
"stage2_expand_ms": 17787.648677825928,
"stage3_cluster_ms": 4374.642372131348,
"stage4_rerank_ms": 974.8115539550781
},
"stage_counts": {
"stage1_candidates": 50,
"stage2_expanded": 50,
"stage3_clustered": 10,
"stage4_reranked": 10
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "LspBridge",
"latency_ms": 1232.3768000155687,
"num_results": 5,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.1111111111111111,
"rbo_topk": 0.20334699999999994,
"staged_unique_files_topk": 5,
"dense_unique_files_topk": 5,
"staged_unique_dirs_topk": 4,
"dense_unique_dirs_topk": 1
}
]
}

View File

@@ -1,176 +0,0 @@
{
"summary": {
"timestamp": "2026-02-09 00:08:47",
"source": "src",
"k": 5,
"coarse_k": 50,
"query_count": 3,
"avg_jaccard_topk": 0.11574074074074074,
"avg_rbo_topk": 0.14601366666666662,
"staged": {
"success": 3,
"avg_latency_ms": 31720.555866663653
},
"dense_rerank": {
"success": 3,
"avg_latency_ms": 1401.2113333245118
}
},
"comparisons": [
{
"query": "class Config",
"staged": {
"strategy": "staged",
"query": "class Config",
"latency_ms": 40162.88519999385,
"num_results": 5,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\watcher\\incremental_indexer.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 6091.366767883301,
"stage2_expand_ms": 17540.942907333374,
"stage3_cluster_ms": 13169.558048248291,
"stage4_rerank_ms": 3317.5392150878906
},
"stage_counts": {
"stage1_candidates": 50,
"stage2_expanded": 99,
"stage3_clustered": 10,
"stage4_reranked": 10
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "class Config",
"latency_ms": 1571.1398999989033,
"num_results": 5,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\migration_manager.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\splade_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\sqlite_store.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.1111111111111111,
"rbo_topk": 0.031347,
"staged_unique_files_topk": 5,
"dense_unique_files_topk": 5,
"staged_unique_dirs_topk": 5,
"dense_unique_dirs_topk": 1
},
{
"query": "def search",
"staged": {
"strategy": "staged",
"query": "def search",
"latency_ms": 31623.380899995565,
"num_results": 5,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 400.84290504455566,
"stage2_expand_ms": 20529.58631515503,
"stage3_cluster_ms": 9625.348806381226,
"stage4_rerank_ms": 1027.686357498169
},
"stage_counts": {
"stage1_candidates": 50,
"stage2_expanded": 51,
"stage3_clustered": 10,
"stage4_reranked": 10
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "def search",
"latency_ms": 1376.3304999768734,
"num_results": 5,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.125,
"rbo_topk": 0.20334699999999994,
"staged_unique_files_topk": 4,
"dense_unique_files_topk": 5,
"staged_unique_dirs_topk": 3,
"dense_unique_dirs_topk": 2
},
{
"query": "LspBridge",
"staged": {
"strategy": "staged",
"query": "LspBridge",
"latency_ms": 23375.40150000155,
"num_results": 5,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\vector_meta_store.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\semantic\\code_extractor.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py"
],
"stage_stats": {
"stage_times": {
"stage1_binary_ms": 392.41671562194824,
"stage2_expand_ms": 17760.897397994995,
"stage3_cluster_ms": 4194.235563278198,
"stage4_rerank_ms": 990.307092666626
},
"stage_counts": {
"stage1_candidates": 50,
"stage2_expanded": 50,
"stage3_clustered": 10,
"stage4_reranked": 10
}
},
"error": null
},
"dense_rerank": {
"strategy": "dense_rerank",
"query": "LspBridge",
"latency_ms": 1256.1635999977589,
"num_results": 5,
"topk_paths": [
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\global_index.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py",
"d:\\claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py"
],
"stage_stats": null,
"error": null
},
"jaccard_topk": 0.1111111111111111,
"rbo_topk": 0.20334699999999994,
"staged_unique_files_topk": 5,
"dense_unique_files_topk": 5,
"staged_unique_dirs_topk": 4,
"dense_unique_dirs_topk": 1
}
]
}

View File

@@ -1,465 +0,0 @@
"""
CoIR Benchmark Evaluation Report Generator
Compares SPLADE with mainstream code retrieval models on CoIR benchmark tasks.
Generates comprehensive performance analysis report.
"""
import sys
import time
import json
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Tuple
import numpy as np
sys.path.insert(0, 'src')
# =============================================================================
# REFERENCE: Published CoIR Benchmark Scores (NDCG@10)
# Source: CoIR Paper (ACL 2025) - https://arxiv.org/abs/2407.02883
# =============================================================================
COIR_REFERENCE_SCORES = {
# Model: {dataset: NDCG@10 score}
"Voyage-Code-002": {
"APPS": 26.52, "CosQA": 29.79, "Text2SQL": 69.26, "CodeSearchNet": 81.79,
"CCR": 73.45, "Contest-DL": 72.77, "StackOverflow": 27.28,
"FB-ST": 87.68, "FB-MT": 65.35, "Average": 56.26
},
"E5-Mistral-7B": {
"APPS": 21.33, "CosQA": 31.27, "Text2SQL": 65.98, "CodeSearchNet": 54.25,
"CCR": 65.27, "Contest-DL": 82.55, "StackOverflow": 33.24,
"FB-ST": 91.54, "FB-MT": 72.71, "Average": 55.18
},
"E5-Base": {
"APPS": 11.52, "CosQA": 32.59, "Text2SQL": 52.31, "CodeSearchNet": 67.99,
"CCR": 56.87, "Contest-DL": 62.50, "StackOverflow": 21.87,
"FB-ST": 86.86, "FB-MT": 74.52, "Average": 50.90
},
"OpenAI-Ada-002": {
"APPS": 8.70, "CosQA": 28.88, "Text2SQL": 58.32, "CodeSearchNet": 74.21,
"CCR": 69.13, "Contest-DL": 53.34, "StackOverflow": 26.04,
"FB-ST": 72.40, "FB-MT": 47.12, "Average": 45.59
},
"BGE-Base": {
"APPS": 4.05, "CosQA": 32.76, "Text2SQL": 45.59, "CodeSearchNet": 69.60,
"CCR": 45.56, "Contest-DL": 38.50, "StackOverflow": 21.71,
"FB-ST": 73.55, "FB-MT": 64.99, "Average": 42.77
},
"BGE-M3": {
"APPS": 7.37, "CosQA": 22.73, "Text2SQL": 48.76, "CodeSearchNet": 43.23,
"CCR": 47.55, "Contest-DL": 47.86, "StackOverflow": 31.16,
"FB-ST": 61.04, "FB-MT": 49.94, "Average": 39.31
},
"UniXcoder": {
"APPS": 1.36, "CosQA": 25.14, "Text2SQL": 50.45, "CodeSearchNet": 60.20,
"CCR": 58.36, "Contest-DL": 41.82, "StackOverflow": 31.03,
"FB-ST": 44.67, "FB-MT": 36.02, "Average": 37.33
},
"GTE-Base": {
"APPS": 3.24, "CosQA": 30.24, "Text2SQL": 46.19, "CodeSearchNet": 43.35,
"CCR": 35.50, "Contest-DL": 33.81, "StackOverflow": 28.80,
"FB-ST": 62.71, "FB-MT": 55.19, "Average": 36.75
},
"Contriever": {
"APPS": 5.14, "CosQA": 14.21, "Text2SQL": 45.46, "CodeSearchNet": 34.72,
"CCR": 35.74, "Contest-DL": 44.16, "StackOverflow": 24.21,
"FB-ST": 66.05, "FB-MT": 55.11, "Average": 36.40
},
}
# Recent models (2025)
RECENT_MODELS = {
"Voyage-Code-3": {"Average": 62.5, "note": "13.8% better than OpenAI-v3-large"},
"SFR-Embedding-Code-7B": {"Average": 67.4, "note": "#1 on CoIR (Feb 2025)"},
"Jina-Code-v2": {"CosQA": 41.0, "note": "Strong on CosQA"},
"CodeSage-Large": {"Average": 53.5, "note": "Specialized code model"},
}
# =============================================================================
# TEST DATA: Synthetic CoIR-like datasets for local evaluation
# =============================================================================
def create_test_datasets():
"""Create synthetic test datasets mimicking CoIR task types."""
# Text-to-Code (like CosQA, CodeSearchNet)
text_to_code = {
"name": "Text-to-Code",
"description": "Natural language queries to code snippets",
"corpus": [
{"id": "c1", "text": "def authenticate_user(username: str, password: str) -> bool:\n user = db.get_user(username)\n if user and verify_hash(password, user.password_hash):\n return True\n return False"},
{"id": "c2", "text": "async function fetchUserData(userId) {\n const response = await fetch(`/api/users/${userId}`);\n if (!response.ok) throw new Error('User not found');\n return response.json();\n}"},
{"id": "c3", "text": "def calculate_statistics(data: List[float]) -> Dict[str, float]:\n return {\n 'mean': np.mean(data),\n 'std': np.std(data),\n 'median': np.median(data)\n }"},
{"id": "c4", "text": "SELECT u.id, u.name, u.email, COUNT(o.id) as order_count\nFROM users u LEFT JOIN orders o ON u.id = o.user_id\nWHERE u.status = 'active'\nGROUP BY u.id, u.name, u.email"},
{"id": "c5", "text": "def merge_sort(arr: List[int]) -> List[int]:\n if len(arr) <= 1:\n return arr\n mid = len(arr) // 2\n left = merge_sort(arr[:mid])\n right = merge_sort(arr[mid:])\n return merge(left, right)"},
{"id": "c6", "text": "app.post('/api/auth/login', async (req, res) => {\n const { email, password } = req.body;\n const user = await User.findByEmail(email);\n if (!user || !await bcrypt.compare(password, user.password)) {\n return res.status(401).json({ error: 'Invalid credentials' });\n }\n const token = jwt.sign({ userId: user.id }, process.env.JWT_SECRET);\n res.json({ token });\n});"},
{"id": "c7", "text": "CREATE TABLE products (\n id SERIAL PRIMARY KEY,\n name VARCHAR(255) NOT NULL,\n price DECIMAL(10, 2) NOT NULL,\n category_id INTEGER REFERENCES categories(id),\n created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP\n);"},
{"id": "c8", "text": "def read_json_file(filepath: str) -> Dict:\n with open(filepath, 'r', encoding='utf-8') as f:\n return json.load(f)"},
{"id": "c9", "text": "class UserRepository:\n def __init__(self, session):\n self.session = session\n \n def find_by_email(self, email: str) -> Optional[User]:\n return self.session.query(User).filter(User.email == email).first()"},
{"id": "c10", "text": "try:\n result = await process_data(input_data)\nexcept ValidationError as e:\n logger.error(f'Validation failed: {e}')\n raise HTTPException(status_code=400, detail=str(e))\nexcept DatabaseError as e:\n logger.critical(f'Database error: {e}')\n raise HTTPException(status_code=500, detail='Internal server error')"},
],
"queries": [
{"id": "q1", "text": "function to verify user password and authenticate", "relevant": ["c1", "c6"]},
{"id": "q2", "text": "async http request to fetch user data", "relevant": ["c2"]},
{"id": "q3", "text": "calculate mean median standard deviation statistics", "relevant": ["c3"]},
{"id": "q4", "text": "SQL query join users and orders count", "relevant": ["c4", "c7"]},
{"id": "q5", "text": "recursive sorting algorithm implementation", "relevant": ["c5"]},
{"id": "q6", "text": "REST API login endpoint with JWT token", "relevant": ["c6", "c1"]},
{"id": "q7", "text": "create database table with foreign key", "relevant": ["c7"]},
{"id": "q8", "text": "read and parse JSON file python", "relevant": ["c8"]},
{"id": "q9", "text": "repository pattern find user by email", "relevant": ["c9", "c1"]},
{"id": "q10", "text": "exception handling with logging", "relevant": ["c10"]},
]
}
# Code-to-Code (like CCR)
code_to_code = {
"name": "Code-to-Code",
"description": "Find similar code implementations",
"corpus": [
{"id": "c1", "text": "def add(a, b): return a + b"},
{"id": "c2", "text": "function sum(x, y) { return x + y; }"},
{"id": "c3", "text": "func add(a int, b int) int { return a + b }"},
{"id": "c4", "text": "def subtract(a, b): return a - b"},
{"id": "c5", "text": "def multiply(a, b): return a * b"},
{"id": "c6", "text": "const add = (a, b) => a + b;"},
{"id": "c7", "text": "fn add(a: i32, b: i32) -> i32 { a + b }"},
{"id": "c8", "text": "public int add(int a, int b) { return a + b; }"},
],
"queries": [
{"id": "q1", "text": "def add(a, b): return a + b", "relevant": ["c1", "c2", "c3", "c6", "c7", "c8"]},
{"id": "q2", "text": "def subtract(x, y): return x - y", "relevant": ["c4"]},
{"id": "q3", "text": "def mult(x, y): return x * y", "relevant": ["c5"]},
]
}
# Text2SQL
text2sql = {
"name": "Text2SQL",
"description": "Natural language to SQL queries",
"corpus": [
{"id": "c1", "text": "SELECT * FROM users WHERE active = 1"},
{"id": "c2", "text": "SELECT COUNT(*) FROM orders WHERE status = 'pending'"},
{"id": "c3", "text": "SELECT u.name, SUM(o.total) FROM users u JOIN orders o ON u.id = o.user_id GROUP BY u.name"},
{"id": "c4", "text": "UPDATE products SET price = price * 1.1 WHERE category = 'electronics'"},
{"id": "c5", "text": "DELETE FROM sessions WHERE expires_at < NOW()"},
{"id": "c6", "text": "INSERT INTO users (name, email) VALUES ('John', 'john@example.com')"},
],
"queries": [
{"id": "q1", "text": "get all active users", "relevant": ["c1"]},
{"id": "q2", "text": "count pending orders", "relevant": ["c2"]},
{"id": "q3", "text": "total order amount by user", "relevant": ["c3"]},
{"id": "q4", "text": "increase electronics prices by 10%", "relevant": ["c4"]},
{"id": "q5", "text": "remove expired sessions", "relevant": ["c5"]},
{"id": "q6", "text": "add new user", "relevant": ["c6"]},
]
}
return [text_to_code, code_to_code, text2sql]
# =============================================================================
# EVALUATION FUNCTIONS
# =============================================================================
def ndcg_at_k(ranked_list: List[str], relevant: List[str], k: int = 10) -> float:
    """Normalized Discounted Cumulative Gain at cutoff k (binary relevance).

    Each retrieved id contributes 1/log2(rank+1) to DCG if it is relevant.
    The score is normalized by the ideal DCG (all relevant ids ranked first,
    capped at k).

    Args:
        ranked_list: Document ids in retrieval order (best first).
        relevant: Ids of the relevant documents (duplicates are ignored).
        k: Rank cutoff.

    Returns:
        NDCG@k in [0, 1]; 0.0 when there are no relevant documents.
    """
    # Deduplicate once: list membership was O(n) per lookup, and duplicate
    # relevant ids would otherwise inflate the ideal DCG denominator.
    relevant_set = set(relevant)
    dcg = 0.0
    for i, doc_id in enumerate(ranked_list[:k]):
        if doc_id in relevant_set:
            dcg += 1.0 / np.log2(i + 2)
    # Ideal DCG: every relevant doc at the top, at most k of them.
    ideal_k = min(len(relevant_set), k)
    idcg = sum(1.0 / np.log2(i + 2) for i in range(ideal_k))
    return dcg / idcg if idcg > 0 else 0.0
def precision_at_k(ranked_list: List[str], relevant: List[str], k: int = 10) -> float:
    """Fraction of the top-k retrieved ids that are relevant.

    Note: the denominator is always k, even when fewer than k ids were
    retrieved (standard P@k convention).
    """
    hits = set(ranked_list[:k]) & set(relevant)
    return len(hits) / k
def recall_at_k(ranked_list: List[str], relevant: List[str], k: int = 10) -> float:
    """Fraction of the relevant ids that appear in the top-k results.

    Returns 0.0 when the relevant set is empty (avoids division by zero).
    """
    relevant_set = set(relevant)
    if not relevant_set:
        return 0.0
    found = relevant_set.intersection(ranked_list[:k])
    return len(found) / len(relevant_set)
def mrr(ranked_list: List[str], relevant: List[str]) -> float:
    """Reciprocal rank of the first relevant id, or 0.0 if none appears."""
    relevant_set = set(relevant)
    return next(
        (
            1.0 / rank
            for rank, doc_id in enumerate(ranked_list, start=1)
            if doc_id in relevant_set
        ),
        0.0,
    )
def evaluate_model(model_name: str, encode_fn, datasets: List[Dict]) -> Dict:
    """Score one encoder on every dataset with ranking metrics.

    Args:
        model_name: Display name of the model (not used in computation).
        encode_fn: Callable mapping a list of texts to embeddings. May return
            a 2-D array (dense path, cosine similarity) or a sequence of
            vectors (scored with plain dot products).
        datasets: Dataset dicts with "name", "corpus" ([{id, text}]) and
            "queries" ([{id, text, relevant}]).

    Returns:
        Mapping of dataset name -> {metric: mean score in percent}, plus an
        "Average" entry holding the mean NDCG@10 across all datasets.
    """
    results = {}
    for dataset in datasets:
        corpus = dataset["corpus"]
        queries = dataset["queries"]
        corpus_ids = [doc["id"] for doc in corpus]
        corpus_texts = [doc["text"] for doc in corpus]
        # Corpus is embedded once per dataset; queries are embedded one at a time.
        corpus_embs = encode_fn(corpus_texts)
        metrics = {"ndcg@10": [], "precision@10": [], "recall@10": [], "mrr": []}
        for query in queries:
            query_emb = encode_fn([query["text"]])[0]
            # Compute similarity scores
            if hasattr(corpus_embs, 'shape') and len(corpus_embs.shape) == 2:
                # Dense vectors - cosine similarity (1e-8 guards zero-norm vectors)
                q_norm = query_emb / (np.linalg.norm(query_emb) + 1e-8)
                c_norm = corpus_embs / (np.linalg.norm(corpus_embs, axis=1, keepdims=True) + 1e-8)
                scores = np.dot(c_norm, q_norm)
            else:
                # Sparse - dot product
                scores = np.array([np.dot(c, query_emb) for c in corpus_embs])
            # Rank corpus ids by descending similarity.
            ranked_indices = np.argsort(scores)[::-1]
            ranked_ids = [corpus_ids[i] for i in ranked_indices]
            relevant = query["relevant"]
            metrics["ndcg@10"].append(ndcg_at_k(ranked_ids, relevant, 10))
            metrics["precision@10"].append(precision_at_k(ranked_ids, relevant, 10))
            metrics["recall@10"].append(recall_at_k(ranked_ids, relevant, 10))
            metrics["mrr"].append(mrr(ranked_ids, relevant))
        # Per-dataset means, scaled to 0-100 for report readability.
        results[dataset["name"]] = {k: np.mean(v) * 100 for k, v in metrics.items()}
    # Calculate average
    all_ndcg = [results[d["name"]]["ndcg@10"] for d in datasets]
    results["Average"] = {
        "ndcg@10": np.mean(all_ndcg),
        "note": "Average across all datasets"
    }
    return results
# =============================================================================
# MODEL IMPLEMENTATIONS
# =============================================================================
def get_splade_encoder():
    """Build an encode(texts) -> np.ndarray function backed by SPLADE.

    The project encoder returns sparse {token_id: weight} dicts; these are
    scattered into dense (len(texts), vocab_size) float32 rows so the caller
    can score them with the same dot-product path used for dense embeddings.
    """
    from codexlens.semantic.splade_encoder import get_splade_encoder as _get_splade
    encoder = _get_splade()
    def encode(texts):
        # encode_batch is used for multiple texts; a single text goes through
        # encode_text and is wrapped in a one-element list.
        sparse_vecs = encoder.encode_batch(texts) if len(texts) > 1 else [encoder.encode_text(texts[0])]
        # Convert to dense for comparison
        vocab_size = encoder.vocab_size
        dense = np.zeros((len(sparse_vecs), vocab_size), dtype=np.float32)
        for i, sv in enumerate(sparse_vecs):
            for tid, w in sv.items():
                dense[i, tid] = w
        return dense
    return encode
def get_dense_encoder(model_name: str = "all-MiniLM-L6-v2"):
    """Build an encode(texts) function using a sentence-transformers model.

    Loads the model once at call time; the returned closure reuses it for
    every batch. Returns embeddings as produced by model.encode (a 2-D array
    for a list input).
    """
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer(model_name)
    def encode(texts):
        return model.encode(texts, show_progress_bar=False)
    return encode
# =============================================================================
# REPORT GENERATION
# =============================================================================
def generate_report(local_results: Dict, output_path: str = None):
"""Generate comprehensive benchmark report."""
report = []
report.append("=" * 80)
report.append("CODE RETRIEVAL BENCHMARK REPORT")
report.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
report.append("=" * 80)
# Section 1: Reference Benchmark Scores
report.append("\n## 1. CoIR Benchmark Reference Scores (Published)")
report.append("\nSource: CoIR Paper (ACL 2025) - https://arxiv.org/abs/2407.02883")
report.append("\n### NDCG@10 Scores by Model and Dataset\n")
# Header
datasets = ["APPS", "CosQA", "Text2SQL", "CodeSearchNet", "CCR", "Contest-DL", "StackOverflow", "FB-ST", "FB-MT", "Average"]
header = "| Model | " + " | ".join(datasets) + " |"
separator = "|" + "|".join(["---"] * (len(datasets) + 1)) + "|"
report.append(header)
report.append(separator)
# Data rows
for model, scores in COIR_REFERENCE_SCORES.items():
row = f"| {model} | " + " | ".join([f"{scores.get(d, '-'):.2f}" if isinstance(scores.get(d), (int, float)) else str(scores.get(d, '-')) for d in datasets]) + " |"
report.append(row)
# Section 2: Recent Models
report.append("\n### Recent Top Performers (2025)\n")
report.append("| Model | Average NDCG@10 | Notes |")
report.append("|-------|-----------------|-------|")
for model, info in RECENT_MODELS.items():
avg = info.get("Average", "-")
note = info.get("note", "")
report.append(f"| {model} | {avg} | {note} |")
# Section 3: Local Evaluation Results
report.append("\n## 2. Local Evaluation Results\n")
report.append("Evaluated on synthetic CoIR-like datasets\n")
for model_name, results in local_results.items():
report.append(f"\n### {model_name}\n")
report.append("| Dataset | NDCG@10 | Precision@10 | Recall@10 | MRR |")
report.append("|---------|---------|--------------|-----------|-----|")
for dataset_name, metrics in results.items():
if dataset_name == "Average":
continue
ndcg = metrics.get("ndcg@10", 0)
prec = metrics.get("precision@10", 0)
rec = metrics.get("recall@10", 0)
m = metrics.get("mrr", 0)
report.append(f"| {dataset_name} | {ndcg:.2f} | {prec:.2f} | {rec:.2f} | {m:.2f} |")
if "Average" in results:
avg = results["Average"]["ndcg@10"]
report.append(f"| **Average** | **{avg:.2f}** | - | - | - |")
# Section 4: Comparison Analysis
report.append("\n## 3. Comparison Analysis\n")
if "SPLADE" in local_results and "Dense (MiniLM)" in local_results:
splade_avg = local_results["SPLADE"]["Average"]["ndcg@10"]
dense_avg = local_results["Dense (MiniLM)"]["Average"]["ndcg@10"]
report.append("### SPLADE vs Dense Embedding\n")
report.append(f"- SPLADE Average NDCG@10: {splade_avg:.2f}")
report.append(f"- Dense (MiniLM) Average NDCG@10: {dense_avg:.2f}")
if splade_avg > dense_avg:
diff = ((splade_avg - dense_avg) / dense_avg) * 100
report.append(f"- SPLADE outperforms by {diff:.1f}%")
else:
diff = ((dense_avg - splade_avg) / splade_avg) * 100
report.append(f"- Dense outperforms by {diff:.1f}%")
# Section 5: Key Insights
report.append("\n## 4. Key Insights\n")
report.append("""
1. **Voyage-Code-002** achieved highest mean score (56.26) on original CoIR benchmark
2. **SFR-Embedding-Code-7B** (Salesforce) reached #1 in Feb 2025 with 67.4 average
3. **SPLADE** provides good balance of:
- Interpretability (visible token activations)
- Query expansion (learned synonyms)
- Efficient sparse retrieval
4. **Task-specific performance varies significantly**:
- E5-Mistral excels at Contest-DL (82.55) but median on APPS
- Voyage-Code-002 excels at CodeSearchNet (81.79)
- No single model dominates all tasks
5. **Hybrid approaches recommended**:
- Combine sparse (SPLADE/BM25) with dense for best results
- Use RRF (Reciprocal Rank Fusion) for score combination
""")
# Section 6: Recommendations
report.append("\n## 5. Recommendations for Codex-lens\n")
report.append("""
| Use Case | Recommended Approach |
|----------|---------------------|
| General code search | SPLADE + Dense hybrid |
| Exact keyword match | FTS (BM25) |
| Semantic understanding | Dense embedding |
| Interpretable results | SPLADE only |
| Maximum accuracy | SFR-Embedding-Code + SPLADE fusion |
""")
report_text = "\n".join(report)
if output_path:
with open(output_path, 'w', encoding='utf-8') as f:
f.write(report_text)
print(f"Report saved to: {output_path}")
return report_text
# =============================================================================
# MAIN
# =============================================================================
def main():
print("=" * 80)
print("CODE RETRIEVAL BENCHMARK EVALUATION")
print("=" * 80)
# Create test datasets
print("\nCreating test datasets...")
datasets = create_test_datasets()
print(f" Created {len(datasets)} datasets")
local_results = {}
# Evaluate SPLADE
print("\nEvaluating SPLADE...")
try:
from codexlens.semantic.splade_encoder import check_splade_available
ok, err = check_splade_available()
if ok:
start = time.perf_counter()
splade_encode = get_splade_encoder()
splade_results = evaluate_model("SPLADE", splade_encode, datasets)
elapsed = time.perf_counter() - start
local_results["SPLADE"] = splade_results
print(f" SPLADE evaluated in {elapsed:.2f}s")
print(f" Average NDCG@10: {splade_results['Average']['ndcg@10']:.2f}")
else:
print(f" SPLADE not available: {err}")
except Exception as e:
print(f" SPLADE evaluation failed: {e}")
# Evaluate Dense (MiniLM)
print("\nEvaluating Dense (all-MiniLM-L6-v2)...")
try:
start = time.perf_counter()
dense_encode = get_dense_encoder("all-MiniLM-L6-v2")
dense_results = evaluate_model("Dense (MiniLM)", dense_encode, datasets)
elapsed = time.perf_counter() - start
local_results["Dense (MiniLM)"] = dense_results
print(f" Dense evaluated in {elapsed:.2f}s")
print(f" Average NDCG@10: {dense_results['Average']['ndcg@10']:.2f}")
except Exception as e:
print(f" Dense evaluation failed: {e}")
# Generate report
print("\nGenerating report...")
report = generate_report(local_results, "benchmark_report.md")
print("\n" + "=" * 80)
print("BENCHMARK COMPLETE")
print("=" * 80)
print("\nReport preview:\n")
print(report[:3000] + "\n...[truncated]...")
return local_results
if __name__ == "__main__":
main()

View File

@@ -1,318 +0,0 @@
#!/usr/bin/env python
"""Debug script to trace semantic search (dense_rerank) flow step by step."""
import json
import logging
import sqlite3
import sys
from pathlib import Path
from typing import Any, Dict, List, Tuple
# Add src to path
sys.path.insert(0, str(Path(__file__).parent / "src"))
# Configure detailed logging
logging.basicConfig(
level=logging.DEBUG,
format="%(asctime)s | %(levelname)-5s | %(name)s | %(message)s",
datefmt="%H:%M:%S",
)
# Enable debug for specific modules
for name in ["codexlens.search", "codexlens.semantic", "codexlens.indexing"]:
logging.getLogger(name).setLevel(logging.DEBUG)
logger = logging.getLogger("debug_semantic")
def load_config() -> Dict[str, Any]:
    """Return the user's global codexlens settings, or {} if no config file."""
    settings_file = Path.home() / ".codexlens" / "config.json"
    if not settings_file.exists():
        return {}
    return json.loads(settings_file.read_text())
def inspect_hnsw_index(index_root: Path) -> Dict[str, Any]:
    """Summarize the centralized HNSW index files under index_root.

    Reads the "_vectors.hnsw" binary (size only) and, when present, the
    "_vectors_meta.db" SQLite metadata: total chunk count, a sample of
    distinct file paths, and a test-vs-src distribution based on path
    substrings.

    Returns:
        Dict with hnsw_exists/meta_exists/hnsw_size_mb and, if the metadata
        DB exists, total_chunks, sample_files, category_distribution.
    """
    hnsw_path = index_root / "_vectors.hnsw"
    meta_path = index_root / "_vectors_meta.db"
    result = {
        "hnsw_exists": hnsw_path.exists(),
        "meta_exists": meta_path.exists(),
        # Size in MB, rounded to 2 decimals; 0 when the file is missing.
        "hnsw_size_mb": round(hnsw_path.stat().st_size / (1024*1024), 2) if hnsw_path.exists() else 0,
    }
    if meta_path.exists():
        conn = sqlite3.connect(str(meta_path))
        cursor = conn.execute("SELECT COUNT(*) FROM chunk_metadata")
        result["total_chunks"] = cursor.fetchone()[0]
        # Sample file paths
        cursor = conn.execute("""
            SELECT DISTINCT file_path FROM chunk_metadata
            ORDER BY file_path LIMIT 20
        """)
        result["sample_files"] = [row[0] for row in cursor.fetchall()]
        # Check if tests vs src
        cursor = conn.execute("""
            SELECT
                CASE
                    WHEN file_path LIKE '%tests%' OR file_path LIKE '%test_%' THEN 'test'
                    ELSE 'src'
                END as category,
                COUNT(*) as count
            FROM chunk_metadata
            GROUP BY category
        """)
        result["category_distribution"] = {row[0]: row[1] for row in cursor.fetchall()}
        conn.close()
    return result
def run_dense_search(query: str, index_root: Path, top_k: int = 50) -> List[Tuple[int, float, str]]:
"""Execute dense vector search and return candidates with details."""
from codexlens.semantic.ann_index import ANNIndex
from codexlens.semantic.factory import get_embedder
from codexlens.semantic.vector_store import VectorStore
logger.info("=" * 60)
logger.info("STAGE 1: Dense Embedding Generation")
logger.info("=" * 60)
# Read model config from index
index_db = index_root / "_index.db"
embedding_model = "qwen3-embedding-sf"
embedding_backend = "litellm"
if index_db.exists():
try:
with VectorStore(index_db) as vs:
model_config = vs.get_model_config()
if model_config:
embedding_backend = model_config.get("backend", embedding_backend)
embedding_model = model_config.get("model_name", embedding_model)
logger.info(f"Model config from index: {embedding_backend}/{embedding_model}")
except Exception as e:
logger.warning(f"Failed to read model config: {e}")
# Generate query embedding
embedder = get_embedder(backend=embedding_backend, model=embedding_model)
query_embedding = embedder.embed_to_numpy([query])[0]
logger.info(f"Query: {query!r}")
logger.info(f"Query embedding dim: {query_embedding.shape[0]}")
logger.info(f"Query embedding norm: {(query_embedding**2).sum()**0.5:.4f}")
# Load HNSW index
logger.info("=" * 60)
logger.info("STAGE 2: HNSW Vector Search (Coarse)")
logger.info("=" * 60)
ann_index = ANNIndex.create_central(
index_root=index_root,
dim=query_embedding.shape[0],
)
if not ann_index.load():
logger.error("Failed to load HNSW index")
return []
logger.info(f"HNSW index count: {ann_index.count()}")
# Execute search
ids, distances = ann_index.search(query_embedding, top_k=top_k)
logger.info(f"Found {len(ids)} candidates")
# Get chunk details
candidates = []
meta_path = index_root / "_vectors_meta.db"
if meta_path.exists():
conn = sqlite3.connect(str(meta_path))
conn.row_factory = sqlite3.Row
for chunk_id, distance in zip(ids, distances):
cursor = conn.execute("""
SELECT file_path, content, start_line, end_line
FROM chunk_metadata WHERE chunk_id = ?
""", (int(chunk_id),))
row = cursor.fetchone()
if row:
candidates.append((
int(chunk_id),
float(distance),
row["file_path"],
row["content"][:200] if row["content"] else "",
row["start_line"],
row["end_line"],
))
conn.close()
# Print top candidates
logger.info("\nTop 20 Dense Search Candidates:")
logger.info("-" * 80)
for i, (cid, dist, path, content, start, end) in enumerate(candidates[:20]):
score = max(0, 1 - dist)
is_test = "tests/" in path or "test_" in Path(path).name
marker = "[TEST]" if is_test else "[SRC]"
logger.info(f"{i+1:2d}. {marker} dist={dist:.4f} score={score:.4f}")
logger.info(f" {path}:{start}-{end}")
logger.info(f" {content[:100]}...")
logger.info("")
return candidates
def run_reranking(query: str, candidates: List[Tuple], top_k: int = 10) -> List[Tuple[str, float, float]]:
    """Rerank dense-search candidates with a cross-encoder and log a table.

    Args:
        query: The search query text.
        candidates: Tuples (chunk_id, distance, path, content, start, end)
            as produced by run_dense_search; only the top 50 are reranked.
        top_k: Number of reranked results to return.

    Returns:
        Tuples (path, dense_score, rerank_score, combined, is_test,
        content_preview) sorted by combined score, or [] when the reranker
        backend is unavailable.
    """
    from codexlens.semantic.reranker import get_reranker, check_reranker_available
    logger.info("=" * 60)
    logger.info("STAGE 3: Cross-Encoder Reranking")
    logger.info("=" * 60)
    # Check reranker availability
    config = load_config()
    backend = config.get("reranker_backend", "api")
    model = config.get("reranker_model", "Qwen/Qwen3-Reranker-8B")
    logger.info(f"Reranker backend: {backend}")
    logger.info(f"Reranker model: {model}")
    ok, err = check_reranker_available(backend)
    if not ok:
        logger.error(f"Reranker not available: {err}")
        return []
    reranker = get_reranker(backend=backend, model_name=model)
    # Prepare pairs for reranking
    pairs = []
    for cid, dist, path, content, start, end in candidates[:50]:  # Top 50 for reranking
        # Fall back to the file path when a chunk has no content preview.
        doc_text = content if content else path
        pairs.append((query, doc_text))
    logger.info(f"Reranking {len(pairs)} candidates...")
    # Execute reranking
    scores = reranker.score_pairs(pairs, batch_size=32)
    # Combine scores
    results = []
    for i, (cid, dist, path, content, start, end) in enumerate(candidates[:len(scores)]):
        # Dense score: 1 - distance, clamped at 0; blended 50/50 with the
        # cross-encoder score.
        dense_score = max(0, 1 - dist)
        rerank_score = scores[i]
        combined = 0.5 * dense_score + 0.5 * rerank_score
        is_test = "tests/" in path or "test_" in Path(path).name
        results.append((path, dense_score, rerank_score, combined, is_test, content[:100]))
    # Sort by combined score
    results.sort(key=lambda x: x[3], reverse=True)
    logger.info("\nTop 20 Reranked Results:")
    logger.info("-" * 100)
    logger.info(f"{'Rank':>4} {'Type':^6} {'Dense':^8} {'Rerank':^8} {'Combined':^8} Path")
    logger.info("-" * 100)
    for i, (path, dense, rerank, combined, is_test, content) in enumerate(results[:20]):
        marker = "TEST" if is_test else "SRC"
        logger.info(f"{i+1:4d} [{marker:^4}] {dense:8.4f} {rerank:8.4f} {combined:8.4f} {path}")
    return results[:top_k]
def analyze_problem(candidates: List[Tuple], results: List[Tuple]):
    """Log an analysis of why test files may outrank src files.

    Args:
        candidates: Dense-stage tuples (chunk_id, distance, path, content,
            start, end); only the first 50 are analyzed.
        results: Reranked tuples (path, dense, rerank, combined, is_test,
            content) from run_reranking.
    """
    logger.info("=" * 60)
    logger.info("ANALYSIS: Why Tests Rank Higher?")
    logger.info("=" * 60)
    # Count test vs src in dense candidates
    test_in_dense = sum(1 for c in candidates[:50] if "tests/" in c[2] or "test_" in Path(c[2]).name)
    src_in_dense = 50 - test_in_dense
    logger.info(f"\nDense Search (top 50):")
    # NOTE(review): the *2 percentage assumes exactly 50 candidates — confirm.
    logger.info(f" - Test files: {test_in_dense} ({test_in_dense*2}%)")
    logger.info(f" - Src files: {src_in_dense} ({src_in_dense*2}%)")
    # Average scores by category
    test_dense_scores = [max(0, 1-c[1]) for c in candidates[:50] if "tests/" in c[2] or "test_" in Path(c[2]).name]
    src_dense_scores = [max(0, 1-c[1]) for c in candidates[:50] if not ("tests/" in c[2] or "test_" in Path(c[2]).name)]
    if test_dense_scores:
        logger.info(f"\nDense Score Averages:")
        logger.info(f" - Test files: {sum(test_dense_scores)/len(test_dense_scores):.4f}")
    # Src average is printed independently of the test-file branch above.
    if src_dense_scores:
        logger.info(f" - Src files: {sum(src_dense_scores)/len(src_dense_scores):.4f}")
    # Check rerank score distribution
    test_results = [r for r in results if r[4]]
    src_results = [r for r in results if not r[4]]
    if test_results and src_results:
        logger.info(f"\nRerank Score Averages:")
        logger.info(f" - Test files: {sum(r[2] for r in test_results)/len(test_results):.4f}")
        logger.info(f" - Src files: {sum(r[2] for r in src_results)/len(src_results):.4f}")
    logger.info("\n" + "=" * 60)
    logger.info("HYPOTHESIS:")
    logger.info("=" * 60)
    # Heuristic: if tests dominate the coarse candidates, the bias originates
    # before reranking.
    if test_in_dense > src_in_dense:
        logger.info("→ Problem is at DENSE SEARCH stage")
        logger.info("  Test files have embeddings closer to query")
        logger.info("  Possible causes:")
        logger.info("  1. Test files mention implementation concepts in comments/docstrings")
        logger.info("  2. Embedding model doesn't distinguish between tests and implementation")
        logger.info("  3. Test file chunks are more frequent in the index")
    else:
        logger.info("→ Problem may be at RERANKING stage")
        logger.info("  Reranker gives higher scores to test content")
def main():
query = "文件索引和嵌入向量生成的实现逻辑"
index_root = Path(r"C:\Users\dyw\.codexlens\indexes\D\Claude_dms3")
logger.info("=" * 60)
logger.info("DEBUG: Semantic Search Analysis")
logger.info("=" * 60)
logger.info(f"Query: {query}")
logger.info(f"Index root: {index_root}")
logger.info("")
# Step 1: Inspect index
logger.info("STEP 0: Index Inspection")
logger.info("-" * 60)
index_info = inspect_hnsw_index(index_root)
for k, v in index_info.items():
if k == "sample_files":
logger.info(f" {k}:")
for f in v[:10]:
logger.info(f" - {f}")
elif k == "category_distribution":
logger.info(f" {k}:")
for cat, count in v.items():
logger.info(f" - {cat}: {count}")
else:
logger.info(f" {k}: {v}")
logger.info("")
# Step 2: Dense search
candidates = run_dense_search(query, index_root, top_k=100)
if not candidates:
logger.error("No candidates from dense search")
return
# Step 3: Reranking
results = run_reranking(query, candidates, top_k=20)
# Step 4: Analyze
analyze_problem(candidates, results)
if __name__ == "__main__":
main()

View File

@@ -1,276 +0,0 @@
#!/usr/bin/env python
"""Debug script v2: Trace the full semantic search flow with detailed logging."""
import json
import logging
import sqlite3
import sys
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, List, Tuple
# Add src to path
sys.path.insert(0, str(Path(__file__).parent / "src"))
# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s | %(levelname)-5s | %(message)s",
datefmt="%H:%M:%S",
)
logger = logging.getLogger("debug")
def count_chunks_by_category(index_root: Path) -> Dict[str, int]:
    """Count semantic chunks grouped as 'test' vs 'src' across all indexes.

    Walks every "_index.db" under index_root and classifies each row of
    semantic_chunks by its file_path: paths containing "tests" or whose
    file name starts with "test_" count as tests.

    Args:
        index_root: Directory tree containing per-directory index databases.

    Returns:
        Plain dict like {"test": n, "src": m}; missing categories are omitted.
    """
    counts: Dict[str, int] = defaultdict(int)
    for db_path in index_root.rglob("_index.db"):
        try:
            conn = sqlite3.connect(str(db_path))
            try:
                cursor = conn.execute("SELECT file_path FROM semantic_chunks")
                for (path,) in cursor:
                    if "tests" in path or "test_" in Path(path).name:
                        counts["test"] += 1
                    else:
                        counts["src"] += 1
            finally:
                # Always release the connection, even if the query fails
                # (the original leaked it on error).
                conn.close()
        except sqlite3.Error:
            # Best-effort scan: skip unreadable or schema-mismatched DBs,
            # but no longer swallow unrelated exceptions with a bare except.
            continue
    return dict(counts)
def run_dense_search_with_trace(query: str, source_path: Path) -> List[Dict]:
"""Run dense search with detailed tracing."""
from codexlens.config import Config
from codexlens.search.chain_search import ChainSearchEngine, SearchOptions
from codexlens.storage.registry import Registry
from codexlens.storage.path_mapper import PathMapper
# Load config
config = Config.load()
registry = Registry(config.data_dir)
mapper = PathMapper(config.data_dir)
# Create search engine with verbose logging
engine = ChainSearchEngine(registry, mapper, config=config)
engine.logger.setLevel(logging.DEBUG)
# Set up handler to capture all log output
handler = logging.StreamHandler()
handler.setLevel(logging.DEBUG)
engine.logger.addHandler(handler)
# Execute cascade search with dense_rerank strategy
options = SearchOptions(depth=-1) # Search all subdirectories
logger.info("=" * 70)
logger.info("Executing dense_rerank cascade search...")
logger.info(f"Query: {query}")
logger.info(f"Source: {source_path}")
logger.info("=" * 70)
result = engine.cascade_search(
query=query,
source_path=source_path,
k=20,
coarse_k=100,
options=options,
strategy="dense_rerank"
)
# Analyze results
logger.info("\n" + "=" * 70)
logger.info("SEARCH RESULTS ANALYSIS")
logger.info("=" * 70)
test_count = 0
src_count = 0
results_detail = []
for i, r in enumerate(result.results):
is_test = "tests" in r.path or "test_" in Path(r.path).name
if is_test:
test_count += 1
category = "TEST"
else:
src_count += 1
category = "SRC"
# Get metadata scores if available
pre_ce_score = r.metadata.get("pre_cross_encoder_score", r.score)
ce_score = r.metadata.get("cross_encoder_score", 0)
ce_prob = r.metadata.get("cross_encoder_prob", 0)
results_detail.append({
"rank": i + 1,
"category": category,
"path": r.path,
"score": r.score,
"pre_ce_score": pre_ce_score,
"ce_score": ce_score,
"ce_prob": ce_prob,
"excerpt": r.excerpt[:100] if r.excerpt else "",
})
logger.info(f"{i+1:2d}. [{category:4s}] score={r.score:.4f} pre_ce={pre_ce_score:.4f} ce={ce_score:.4f}")
logger.info(f" {r.path}")
if r.excerpt:
logger.info(f" {r.excerpt[:80]}...")
logger.info("")
logger.info(f"\nSummary: {src_count} SRC files, {test_count} TEST files in top {len(result.results)}")
logger.info(f"Search time: {result.stats.time_ms:.2f}ms")
return results_detail
def compare_coarse_candidates():
    """Compare coarse ANN candidates before and after reranking.

    Scans every per-directory HNSW index under the hard-coded debug index
    root, runs a top-10 vector query against each, and aggregates all
    candidates sorted by embedding distance.  Logs a test-vs-src
    distribution summary to help locate where test files start to dominate.

    Returns:
        List of candidate dicts (dir, chunk_id, distance, score, is_test,
        file_path, content_preview) sorted by ascending distance.
    """
    from codexlens.config import Config
    from codexlens.semantic.factory import get_embedder
    from codexlens.semantic.ann_index import ANNIndex

    query = "文件索引和嵌入向量生成的实现逻辑"
    # NOTE(review): `config` is not used below, but Config.load() may have
    # side effects (env/config initialization), so the call is kept.
    config = Config.load()

    # Generate the query embedding once; it is reused for every index.
    embedder = get_embedder(backend="litellm", model="qwen3-embedding-sf")
    query_embedding = embedder.embed_to_numpy([query])[0]

    logger.info("=" * 70)
    logger.info("COARSE CANDIDATE ANALYSIS (per directory)")
    logger.info("=" * 70)

    # Scan all HNSW indexes under the (debug-session specific) index root.
    index_root = Path(r"C:\Users\dyw\.codexlens\indexes\D\Claude_dms3\codex-lens")
    all_candidates = []
    for hnsw_path in index_root.rglob("_index_vectors.hnsw"):
        db_path = hnsw_path.parent / "_index.db"
        if not db_path.exists():
            continue
        try:
            ann_index = ANNIndex(db_path, dim=query_embedding.shape[0])
            if not ann_index.load() or ann_index.count() == 0:
                continue
            ids, distances = ann_index.search(query_embedding, top_k=10)
            # Resolve chunk ids to file paths.  Close the connection in a
            # finally block: the original only closed it on the success
            # path, leaking it whenever a query raised into the outer
            # except handler.
            conn = sqlite3.connect(str(db_path))
            try:
                conn.row_factory = sqlite3.Row
                dir_name = hnsw_path.parent.relative_to(index_root)
                for chunk_id, dist in zip(ids, distances):
                    cursor = conn.execute("""
                        SELECT file_path, content FROM semantic_chunks WHERE id = ?
                    """, (int(chunk_id),))
                    row = cursor.fetchone()
                    if row:
                        is_test = "tests" in row["file_path"] or "test_" in Path(row["file_path"]).name
                        all_candidates.append({
                            "dir": str(dir_name),
                            "chunk_id": int(chunk_id),
                            "distance": float(dist),
                            # Cosine-style score clamped at 0; distances > 1 map to 0.
                            "score": max(0, 1 - float(dist)),
                            "is_test": is_test,
                            "file_path": row["file_path"],
                            "content_preview": row["content"][:100] if row["content"] else ""
                        })
            finally:
                conn.close()
        except Exception as e:
            logger.warning(f"Error processing {hnsw_path}: {e}")

    # Sort by distance (closest first).
    all_candidates.sort(key=lambda x: x["distance"])
    logger.info(f"\nTotal coarse candidates across all directories: {len(all_candidates)}")

    # Distribution of test vs src files among the coarse candidates.
    test_candidates = [c for c in all_candidates if c["is_test"]]
    src_candidates = [c for c in all_candidates if not c["is_test"]]
    logger.info(f"Test files: {len(test_candidates)}")
    logger.info(f"Src files: {len(src_candidates)}")
    if test_candidates:
        avg_test_dist = sum(c["distance"] for c in test_candidates) / len(test_candidates)
        logger.info(f"Avg test distance: {avg_test_dist:.4f}")
    if src_candidates:
        avg_src_dist = sum(c["distance"] for c in src_candidates) / len(src_candidates)
        logger.info(f"Avg src distance: {avg_src_dist:.4f}")

    logger.info("\nTop 30 candidates (combined from all directories):")
    logger.info("-" * 90)
    for i, c in enumerate(all_candidates[:30]):
        cat = "TEST" if c["is_test"] else "SRC"
        logger.info(f"{i+1:2d}. [{cat:4s}] dist={c['distance']:.4f} score={c['score']:.4f} dir={c['dir']}")
        logger.info(f"    {Path(c['file_path']).name}")
    return all_candidates
def main():
    """Run the full semantic-search debug session.

    Steps:
      1. Count indexed chunks, split into test vs src categories.
      2. Inspect the coarse (pre-rerank) ANN candidates per directory.
      3. Run the full dense_rerank cascade search with tracing.
      4. Log a root-cause summary of why test files may dominate results.
    """
    logger.info("=" * 70)
    logger.info("SEMANTIC SEARCH DEBUG SESSION")
    logger.info("=" * 70)

    # Step 1: Count chunks distribution (debug-session specific index root).
    index_root = Path(r"C:\Users\dyw\.codexlens\indexes\D\Claude_dms3\codex-lens")
    counts = count_chunks_by_category(index_root)
    logger.info(f"\nChunk distribution in index:")
    logger.info(f"  - Test chunks: {counts.get('test', 0)}")
    logger.info(f"  - Src chunks: {counts.get('src', 0)}")

    # Step 2: Compare coarse candidates.
    logger.info("\n")
    candidates = compare_coarse_candidates()

    # Step 3: Run full search.
    logger.info("\n")
    query = "文件索引和嵌入向量生成的实现逻辑"
    source_path = Path(r"D:\Claude_dms3\codex-lens")
    results = run_dense_search_with_trace(query, source_path)

    # Summary
    logger.info("\n" + "=" * 70)
    logger.info("ROOT CAUSE ANALYSIS")
    logger.info("=" * 70)
    # Compute SRC count from the actual slice length: the original used
    # `10 - test_in_top10`, which over-counted SRC files whenever fewer
    # than 10 results came back.
    top10 = results[:10]
    test_in_top10 = sum(1 for r in top10 if r["category"] == "TEST")
    src_in_top10 = len(top10) - test_in_top10
    logger.info(f"\nTop 10 results: {src_in_top10} SRC, {test_in_top10} TEST")
    if test_in_top10 > src_in_top10:
        logger.info("\nPROBLEM: Test files dominate top results")
        logger.info("\nPossible causes:")
        logger.info("  1. Test files mention implementation concepts explicitly")
        logger.info("     (e.g., docstrings describe what they test)")
        logger.info("  2. Embedding model treats test descriptions as similar to")
        logger.info("     implementation descriptions")
        logger.info("  3. Cross-encoder reranker gives higher scores to")
        logger.info("     descriptive test content over implementation code")
        # Check whether the coarse stage already favors tests, to decide
        # whether the bias comes from dense retrieval or from reranking.
        test_in_coarse_top30 = sum(1 for c in candidates[:30] if c["is_test"])
        if test_in_coarse_top30 > 15:
            logger.info(f"\n  → Dense coarse search already favors tests")
            logger.info(f"    ({test_in_coarse_top30}/30 test files in coarse top-30)")
            logger.info(f"    Problem is at EMBEDDING/DENSE SEARCH stage")
        else:
            logger.info(f"\n  → Coarse search is balanced ({test_in_coarse_top30}/30 tests)")
            logger.info(f"    Problem is at CROSS-ENCODER RERANKING stage")


if __name__ == "__main__":
    main()

Binary file not shown.

Binary file not shown.

View File

@@ -1,171 +0,0 @@
# Chain Search Quick Reference
## Import
```python
from pathlib import Path
from codexlens.search import (
ChainSearchEngine,
SearchOptions,
quick_search
)
from codexlens.storage.registry import RegistryStore
from codexlens.storage.path_mapper import PathMapper
```
## One-Line Search
```python
results = quick_search("query", Path("/path/to/search"), depth=-1)
```
## Full Engine Usage
### 1. Initialize Engine
```python
registry = RegistryStore()
registry.initialize()
mapper = PathMapper()
engine = ChainSearchEngine(registry, mapper)
```
### 2. Configure Search
```python
options = SearchOptions(
depth=-1, # -1 = unlimited, 0 = current dir only
max_workers=8, # Parallel threads
limit_per_dir=10, # Max results per directory
total_limit=100, # Total result limit
include_symbols=False, # Include symbol search
files_only=False # Return only paths
)
```
### 3. Execute Search
```python
result = engine.search("query", Path("/path"), options)
# Access results
for r in result.results:
print(f"{r.path}: score={r.score:.2f}")
print(f" {r.excerpt}")
# Check statistics
print(f"Searched {result.stats.dirs_searched} directories")
print(f"Found {result.stats.files_matched} files")
print(f"Time: {result.stats.time_ms:.2f}ms")
```
### 4. Symbol Search
```python
symbols = engine.search_symbols(
"function_name",
Path("/path"),
kind="function" # Optional: 'function', 'class', 'method', etc.
)
for sym in symbols:
print(f"{sym.name} ({sym.kind}) at lines {sym.range[0]}-{sym.range[1]}")
```
### 5. Files-Only Mode
```python
paths = engine.search_files_only("query", Path("/path"))
for path in paths:
print(path)
```
## SearchOptions Parameters
| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `depth` | int | -1 | Search depth (-1 = unlimited) |
| `max_workers` | int | 8 | Parallel worker threads |
| `limit_per_dir` | int | 10 | Max results per directory |
| `total_limit` | int | 100 | Total result limit |
| `include_symbols` | bool | False | Include symbol search |
| `files_only` | bool | False | Return only file paths |
## SearchResult Fields
| Field | Type | Description |
|-------|------|-------------|
| `path` | str | File path |
| `score` | float | BM25 relevance score |
| `excerpt` | str | Highlighted text snippet |
| `content` | str | Full matched content (optional) |
| `symbol` | Symbol | Matched symbol (optional) |
## SearchStats Fields
| Field | Type | Description |
|-------|------|-------------|
| `dirs_searched` | int | Number of directories searched |
| `files_matched` | int | Number of files with matches |
| `time_ms` | float | Total search time (milliseconds) |
| `errors` | List[str] | Error messages |
## Common Patterns
### Search Current Project
```python
result = engine.search("authentication", Path.cwd())
```
### Limit Depth for Speed
```python
options = SearchOptions(depth=2) # Only 2 levels deep
result = engine.search("TODO", Path("/project"), options)
```
### Find All Implementations
```python
symbols = engine.search_symbols("__init__", Path("/project"), kind="function")
```
### Quick File List
```python
files = engine.search_files_only("config", Path("/project"))
```
### Comprehensive Search
```python
options = SearchOptions(
depth=-1,
total_limit=500,
include_symbols=True
)
result = engine.search("api", Path("/project"), options)
print(f"Files: {len(result.results)}")
print(f"Symbols: {len(result.symbols)}")
```
## Performance Tips
1. **Use depth limits** for faster searches in large codebases
2. **Use files_only** when you don't need excerpts
3. **Reuse ChainSearchEngine** instance for multiple searches
4. **Adjust max_workers** based on CPU cores
5. **Use limit_per_dir** to reduce memory usage
## Error Handling
```python
result = engine.search("query", Path("/path"))
if result.stats.errors:
print("Errors occurred:")
for error in result.stats.errors:
print(f" - {error}")
if not result.results:
print("No results found")
else:
print(f"Found {len(result.results)} results")
```
## Cleanup
```python
registry.close() # Close when done
```

View File

@@ -1,676 +0,0 @@
# Codexlens LSP API 规范
**版本**: 1.1
**状态**: ✅ APPROVED (Gemini Review)
**架构**: codexlens 提供 Python APICCW 实现 MCP 端点
**分析来源**: Gemini (架构评审) + Codex (实现评审)
**最后更新**: 2025-01-17
---
## 一、概述
### 1.1 背景
基于 cclsp MCP 服务器实现的分析,设计 codexlens 的 LSP 搜索方法接口,为 AI 提供代码智能能力。
### 1.2 架构决策
**MCP 端点由 CCW 实现codexlens 只提供 Python API**
```
┌─────────────────────────────────────────────────────────────┐
│ Claude Code │
│ ┌───────────────────────────────────────────────────────┐ │
│ │ MCP Client │ │
│ └───────────────────────────────────────────────────────┘ │
│ │ │
│ ▼ │
│ ┌───────────────────────────────────────────────────────┐ │
│ │ CCW MCP Server │ │
│ │ ┌─────────────────────────────────────────────────┐ │ │
│ │ │ MCP Tool Handlers │ │ │
│ │ │ • codexlens_file_context │ │ │
│ │ │ • codexlens_find_definition │ │ │
│ │ │ • codexlens_find_references │ │ │
│ │ │ • codexlens_semantic_search │ │ │
│ │ └──────────────────────┬──────────────────────────┘ │ │
│ └─────────────────────────┼─────────────────────────────┘ │
└────────────────────────────┼────────────────────────────────┘
│ Python API 调用
┌─────────────────────────────────────────────────────────────┐
│ codexlens │
│ ┌───────────────────────────────────────────────────────┐ │
│ │ Public API Layer │ │
│ │ codexlens.api.file_context() │ │
│ │ codexlens.api.find_definition() │ │
│ │ codexlens.api.find_references() │ │
│ │ codexlens.api.semantic_search() │ │
│ └──────────────────────┬────────────────────────────────┘ │
│ │ │
│ ┌──────────────────────▼────────────────────────────────┐ │
│ │ Core Components │ │
│ │ GlobalSymbolIndex | ChainSearchEngine | HoverProvider │ │
│ └───────────────────────────────────────────────────────┘ │
│ │ │
│ ┌──────────────────────▼────────────────────────────────┐ │
│ │ SQLite Index Databases │ │
│ │ global_symbols.db | *.index.db (per-directory) │ │
│ └───────────────────────────────────────────────────────┘ │
└─────────────────────────────────────────────────────────────┘
```
### 1.3 职责分离
| 组件 | 职责 |
|------|------|
| **codexlens** | Python API、索引查询、搜索算法、结果聚合、降级处理 |
| **CCW** | MCP 协议、参数校验、结果序列化、错误处理、project_root 推断 |
### 1.4 codexlens vs cclsp 对比
| 特性 | cclsp | codexlens |
|------|-------|-----------|
| 数据源 | 实时 LSP 服务器 | 预建 SQLite 索引 |
| 启动时间 | 200-3000ms | <50ms |
| 响应时间 | 50-500ms | <5ms |
| 跨语言 | 每语言需要 LSP 服务器 | 统一 Python/TS/JS/Go 索引 |
| 依赖 | 需要语言服务器 | 无外部依赖 |
| 准确度 | 100% (编译器级) | 95%+ (tree-sitter) |
| 重命名支持 | 是 | 否 (只读索引) |
| 实时诊断 | 是 | 通过 IDE MCP |
**推荐**: codexlens 用于快速搜索cclsp 用于精确重构
---
## 二、cclsp 设计模式 (参考)
### 2.1 MCP 工具接口设计
| 模式 | 说明 | 代码位置 |
|------|------|----------|
| **基于名称** | 接受 `symbol_name` 而非文件坐标 | `index.ts:70` |
| **安全消歧义** | `rename_symbol``rename_symbol_strict` 两步 | `index.ts:133, 172` |
| **复杂性抽象** | 隐藏 LSP 协议细节 | `index.ts:211` |
| **优雅失败** | 返回有用的文本响应 | 全局 |
### 2.2 符号解析算法
```
1. getDocumentSymbols (lsp-client.ts:1406)
└─ 获取文件所有符号
2. 处理两种格式:
├─ DocumentSymbol[] → 扁平化
└─ SymbolInformation[] → 二次定位
3. 过滤: symbol.name === symbolName && symbol.kind
4. 回退: 无结果时移除 kind 约束重试
5. 聚合: 遍历所有匹配,聚合定义位置
```
---
## 三、需求规格
### 需求 1: 文件上下文查询 (`file_context`)
**用途**: 读取代码文件,返回文件中所有方法的调用关系摘要
**输出示例**:
```markdown
## src/auth/login.py (3 methods)
### login_user (line 15-45)
- Calls: validate_password (auth/utils.py:23), create_session (session/manager.py:89)
- Called by: handle_login_request (api/routes.py:156), test_login (tests/test_auth.py:34)
### validate_token (line 47-62)
- Calls: decode_jwt (auth/jwt.py:12)
- Called by: auth_middleware (middleware/auth.py:28)
```
### 需求 2: 通用 LSP 搜索 (cclsp 兼容)
| 端点 | 用途 |
|------|------|
| `find_definition` | 根据符号名查找定义位置 |
| `find_references` | 查找符号的所有引用 |
| `workspace_symbols` | 工作区符号搜索 |
| `get_hover` | 获取符号悬停信息 |
### 需求 3: 向量 + LSP 融合搜索
**用途**: 结合向量语义搜索和结构化 LSP 搜索
**融合策略**:
- **RRF** (首选): 简单、不需要分数归一化、鲁棒
- **Cascade**: 特定场景,先向量后 LSP
- **Adaptive**: 长期目标,按查询类型自动选择
---
## 四、API 规范
### 4.1 模块结构
```
src/codexlens/
├─ api/ [新增] 公开 API 层
│ ├─ __init__.py 导出所有 API
│ ├─ file_context.py 文件上下文
│ ├─ definition.py 定义查找
│ ├─ references.py 引用查找
│ ├─ symbols.py 符号搜索
│ ├─ hover.py 悬停信息
│ └─ semantic.py 语义搜索
├─ storage/
│ ├─ global_index.py [扩展] get_file_symbols()
│ └─ relationship_query.py [新增] 有向调用查询
└─ search/
└─ chain_search.py [修复] schema 兼容
```
### 4.2 `codexlens.api.file_context()`
```python
from dataclasses import dataclass, field
from typing import List, Optional, Dict, Tuple
@dataclass
class CallInfo:
"""调用关系信息"""
symbol_name: str
file_path: Optional[str] # 目标文件 (可能为 None)
line: int
relationship: str # call | import | inheritance
@dataclass
class MethodContext:
"""方法上下文"""
name: str
kind: str # function | method | class
line_range: Tuple[int, int]
signature: Optional[str]
calls: List[CallInfo] # 出向调用
callers: List[CallInfo] # 入向调用
@dataclass
class FileContextResult:
"""文件上下文结果"""
file_path: str
language: str
methods: List[MethodContext]
summary: str # 人类可读摘要
discovery_status: Dict[str, bool] = field(default_factory=lambda: {
"outgoing_resolved": False,
"incoming_resolved": True,
"targets_resolved": False
})
def file_context(
project_root: str,
file_path: str,
include_calls: bool = True,
include_callers: bool = True,
max_depth: int = 1,
format: str = "brief" # brief | detailed | tree
) -> FileContextResult:
"""
获取代码文件的方法调用上下文。
Args:
project_root: 项目根目录 (用于定位索引)
file_path: 代码文件路径
include_calls: 是否包含出向调用
include_callers: 是否包含入向调用
max_depth: 调用链深度 (1=直接调用)
⚠️ V1 限制: 当前版本仅支持 max_depth=1
深度调用链分析将在 V2 实现
format: 输出格式
Returns:
FileContextResult
Raises:
IndexNotFoundError: 项目未索引
FileNotFoundError: 文件不存在
Note:
V1 实现限制:
- max_depth 仅支持 1 (直接调用)
- 出向调用目标文件可能为 None (未解析)
- 深度调用链分析作为 V2 特性规划
"""
```
### 4.3 `codexlens.api.find_definition()`
```python
@dataclass
class DefinitionResult:
"""定义查找结果"""
name: str
kind: str
file_path: str
line: int
end_line: int
signature: Optional[str]
container: Optional[str] # 所属类/模块
score: float
def find_definition(
project_root: str,
symbol_name: str,
symbol_kind: Optional[str] = None,
file_context: Optional[str] = None,
limit: int = 10
) -> List[DefinitionResult]:
"""
根据符号名称查找定义位置。
Fallback 策略:
1. 精确匹配 + kind 过滤
2. 精确匹配 (移除 kind)
3. 前缀匹配
"""
```
### 4.4 `codexlens.api.find_references()`
```python
@dataclass
class ReferenceResult:
"""引用结果"""
file_path: str
line: int
column: int
context_line: str
relationship: str # call | import | type_annotation | inheritance
@dataclass
class GroupedReferences:
"""按定义分组的引用"""
definition: DefinitionResult
references: List[ReferenceResult]
def find_references(
project_root: str,
symbol_name: str,
symbol_kind: Optional[str] = None,
include_definition: bool = True,
group_by_definition: bool = True,
limit: int = 100
) -> List[GroupedReferences]:
"""
查找符号的所有引用位置。
多定义时分组返回,解决引用混淆问题。
"""
```
### 4.5 `codexlens.api.workspace_symbols()`
```python
@dataclass
class SymbolInfo:
"""符号信息"""
name: str
kind: str
file_path: str
line: int
container: Optional[str]
score: float
def workspace_symbols(
project_root: str,
query: str,
kind_filter: Optional[List[str]] = None,
file_pattern: Optional[str] = None,
limit: int = 50
) -> List[SymbolInfo]:
"""在整个工作区搜索符号 (前缀匹配)。"""
```
### 4.6 `codexlens.api.get_hover()`
```python
@dataclass
class HoverInfo:
"""悬停信息"""
name: str
kind: str
signature: str
documentation: Optional[str]
file_path: str
line_range: Tuple[int, int]
type_info: Optional[str]
def get_hover(
project_root: str,
symbol_name: str,
file_path: Optional[str] = None
) -> Optional[HoverInfo]:
"""获取符号的详细悬停信息。"""
```
### 4.7 `codexlens.api.semantic_search()`
```python
@dataclass
class SemanticResult:
"""语义搜索结果"""
symbol_name: str
kind: str
file_path: str
line: int
vector_score: Optional[float]
structural_score: Optional[float]
fusion_score: float
snippet: str
match_reason: Optional[str]
def semantic_search(
project_root: str,
query: str,
mode: str = "fusion", # vector | structural | fusion
vector_weight: float = 0.5,
structural_weight: float = 0.3,
keyword_weight: float = 0.2,
fusion_strategy: str = "rrf", # rrf | staged | binary | hybrid
kind_filter: Optional[List[str]] = None,
limit: int = 20,
include_match_reason: bool = False
) -> List[SemanticResult]:
"""
语义搜索 - 结合向量和结构化搜索。
Args:
project_root: 项目根目录
query: 自然语言查询
mode: 搜索模式
- vector: 仅向量搜索
- structural: 仅结构搜索 (符号 + 关系)
- fusion: 融合搜索 (默认)
vector_weight: 向量搜索权重 [0, 1]
structural_weight: 结构搜索权重 [0, 1]
keyword_weight: 关键词搜索权重 [0, 1]
fusion_strategy: 融合策略 (映射到 chain_search.py)
- rrf: Reciprocal Rank Fusion (推荐,默认)
- staged: 分阶段级联 → staged_cascade_search
- binary: 二分重排级联 → binary_rerank_cascade_search
- hybrid: 混合级联 → hybrid_search
kind_filter: 符号类型过滤
limit: 最大返回数量
include_match_reason: 是否生成匹配原因 (启发式,非 LLM)
Returns:
按 fusion_score 排序的结果列表
降级行为:
- 无向量索引: vector_score=None, 使用 FTS + 结构搜索
- 无关系数据: structural_score=None, 仅向量搜索
"""
```
---
## 五、已知问题与解决方案
### 5.1 P0 阻塞项
| 问题 | 位置 | 解决方案 |
|------|------|----------|
| **索引 Schema 不匹配** | `chain_search.py:313-324` vs `dir_index.py:304-312` | 兼容 `full_path``path` |
| **文件符号查询缺失** | `global_index.py:214-260` | 新增 `get_file_symbols()` |
| **出向调用查询缺失** | `dir_index.py:333-342` | 新增 `RelationshipQuery` |
| **关系类型不一致** | `entities.py:74-79` | 规范化 `calls``call` |
### 5.2 设计缺陷 (Gemini 发现)
| 缺陷 | 影响 | 解决方案 |
|------|------|----------|
| **调用图不完整** | `file_context` 缺少出向调用 | 新增有向调用 API |
| **消歧义未定义** | 多定义时无法区分 | 实现 `rank_by_proximity()` |
| **AI 特性成本过高** | `explanation` 需要 LLM | 设为可选,默认关闭 |
| **融合参数不一致** | 3 分支但只有 2 权重 | 补充 `keyword_weight` |
### 5.3 消歧义算法
**V1 实现** (基于文件路径接近度):
```python
def rank_by_proximity(
results: List[DefinitionResult],
file_context: str
) -> List[DefinitionResult]:
"""按文件接近度排序 (V1: 路径接近度)"""
def proximity_score(result):
# 1. 同目录最高分
if os.path.dirname(result.file_path) == os.path.dirname(file_context):
return 100
# 2. 共同路径前缀长度
common = os.path.commonpath([result.file_path, file_context])
return len(common)
return sorted(results, key=proximity_score, reverse=True)
```
**V2 增强计划** (基于 import graph 距离):
```python
def rank_by_import_distance(
results: List[DefinitionResult],
file_context: str,
import_graph: Dict[str, Set[str]]
) -> List[DefinitionResult]:
"""按 import graph 距离排序 (V2)"""
def import_distance(result):
# BFS 计算最短 import 路径
return bfs_shortest_path(
import_graph,
file_context,
result.file_path
)
# 组合: 0.6 * import_distance + 0.4 * path_proximity
return sorted(results, key=lambda r: (
0.6 * import_distance(r) +
0.4 * (100 - proximity_score(r))
))
```
### 5.4 参考实现: `get_file_symbols()`
**位置**: `src/codexlens/storage/global_index.py`
```python
def get_file_symbols(self, file_path: str | Path) -> List[Symbol]:
"""
获取指定文件中定义的所有符号。
Args:
file_path: 文件路径 (相对或绝对)
Returns:
按行号排序的符号列表
"""
file_path_str = str(Path(file_path).resolve())
with self._lock:
conn = self._get_connection()
rows = conn.execute(
"""
SELECT symbol_name, symbol_kind, file_path, start_line, end_line
FROM global_symbols
WHERE project_id = ? AND file_path = ?
ORDER BY start_line
""",
(self.project_id, file_path_str),
).fetchall()
return [
Symbol(
name=row["symbol_name"],
kind=row["symbol_kind"],
range=(row["start_line"], row["end_line"]),
file=row["file_path"],
)
for row in rows
]
```
---
## 六、实现计划
### Phase 0: 基础设施 (16h)
| 任务 | 工时 | 说明 |
|------|------|------|
| 修复 `search_references` schema | 4h | 兼容两种 schema |
| 新增 `GlobalSymbolIndex.get_file_symbols()` | 4h | 文件符号查询 (见 5.4) |
| 新增 `RelationshipQuery` 类 | 6h | 有向调用查询 |
| 关系类型规范化层 | 2h | `calls``call` |
### Phase 1: API 层 (48h)
| 任务 | 工时 | 复杂度 |
|------|------|--------|
| `find_definition()` | 4h | S |
| `find_references()` | 8h | M |
| `workspace_symbols()` | 4h | S |
| `get_hover()` | 4h | S |
| `file_context()` | 16h | L |
| `semantic_search()` | 12h | M |
### Phase 2: 测试与文档 (16h)
| 任务 | 工时 |
|------|------|
| 单元测试 (≥80%) | 8h |
| API 文档 | 4h |
| 示例代码 | 4h |
### 关键路径
```
Phase 0.1 (schema fix)
Phase 0.2 (file symbols) → Phase 1.5 (file_context)
Phase 1 (其他 API)
Phase 2 (测试)
```
---
## 七、测试策略
### 7.1 单元测试
```python
# test_global_index.py
def test_get_file_symbols():
index = GlobalSymbolIndex(":memory:")
index.update_file_symbols(project_id=1, file_path="test.py", symbols=[...])
results = index.get_file_symbols("test.py")
assert len(results) == 3
# test_relationship_query.py
def test_outgoing_calls():
store = DirIndexStore(":memory:")
calls = store.get_outgoing_calls("src/auth.py", "login")
assert calls[0].relationship == "call" # 已规范化
```
### 7.2 Schema 兼容性测试
```python
def test_search_references_both_schemas():
"""测试两种 schema 的引用搜索"""
# 旧 schema: files(path, ...)
# 新 schema: files(full_path, ...)
```
### 7.3 降级测试
```python
def test_semantic_search_without_vectors():
result = semantic_search(query="auth", mode="fusion")
assert result.vector_score is None
assert result.fusion_score > 0
```
---
## 八、使用示例
```python
from codexlens.api import (
file_context,
find_definition,
find_references,
semantic_search
)
# 1. 获取文件上下文
result = file_context(
project_root="/path/to/project",
file_path="src/auth/login.py",
format="brief"
)
print(result.summary)
# 2. 查找定义
definitions = find_definition(
project_root="/path/to/project",
symbol_name="UserService",
symbol_kind="class"
)
# 3. 语义搜索
results = semantic_search(
project_root="/path/to/project",
query="处理用户登录验证的函数",
mode="fusion"
)
```
---
## 九、CCW 集成
| codexlens API | CCW MCP Tool |
|---------------|--------------|
| `file_context()` | `codexlens_file_context` |
| `find_definition()` | `codexlens_find_definition` |
| `find_references()` | `codexlens_find_references` |
| `workspace_symbols()` | `codexlens_workspace_symbol` |
| `get_hover()` | `codexlens_get_hover` |
| `semantic_search()` | `codexlens_semantic_search` |
---
## 十、分析来源
| 工具 | Session ID | 贡献 |
|------|------------|------|
| Gemini | `1768618654438-gemini` | 架构评审、设计缺陷、融合策略 |
| Codex | `1768618658183-codex` | 组件复用、复杂度估算、任务分解 |
| Gemini | `1768620615744-gemini` | 最终评审、改进建议、APPROVED |
---
## 十一、版本历史
| 版本 | 日期 | 变更 |
|------|------|------|
| 1.0 | 2025-01-17 | 初始版本,合并多文档 |
| 1.1 | 2025-01-17 | 应用 Gemini 评审改进: V1 限制说明、策略映射、消歧义增强、参考实现 |

View File

@@ -1,326 +0,0 @@
# CodexLens Auto Hybrid Mode - Implementation Summary
## 概述
实现了两个主要功能:
1. **自动向量嵌入生成**:`init` 命令在检测到语义搜索依赖后自动生成向量嵌入
2. **默认混合搜索模式**:`search` 命令在检测到嵌入存在时自动使用 hybrid 模式
## 修改文件
### 1. codex-lens CLI (`codex-lens/src/codexlens/cli/commands.py`)
#### 1.1 `init` 命令增强
**新增参数**
- `--no-embeddings`: 跳过自动嵌入生成
- `--embedding-model`: 指定嵌入模型 (默认: "code")
**自动嵌入生成逻辑**
```python
# 在 init 成功后
if not no_embeddings:
from codexlens.semantic import SEMANTIC_AVAILABLE
if SEMANTIC_AVAILABLE:
# 自动调用 generate_embeddings()
# 使用指定的 embedding_model
```
**行为**
- 检测 `fastembed``numpy` 是否安装
- 如果可用,自动生成嵌入(可用 `--no-embeddings` 跳过)
- 默认使用 "code" 模型 (jinaai/jina-embeddings-v2-base-code)
- 在输出中显示嵌入生成进度和统计
#### 1.2 `search` 命令增强
**模式变更**
- 默认模式从 `"exact"` 改为 `"auto"`
- 新增 `"auto"` 模式到有效模式列表
**自动模式检测逻辑**
```python
if mode == "auto":
# 检查项目是否有嵌入
project_record = registry.find_by_source_path(str(search_path))
if project_record:
embed_status = check_embeddings_status(index_path)
if has_embeddings:
actual_mode = "hybrid" # 使用混合模式
else:
actual_mode = "exact" # 降级到精确模式
```
**行为**
- 默认使用 `auto` 模式
- 自动检测索引是否有嵌入
- 有嵌入 → 使用 `hybrid` 模式(精确 + 模糊 + 向量融合)
- 无嵌入 → 使用 `exact` 模式(仅全文搜索)
- 用户仍可手动指定模式覆盖自动检测
### 2. MCP 工具简化 (`ccw/src/tools/codex-lens.ts`)
#### 2.1 简化 action 枚举
**仅暴露核心操作**
- `init`: 初始化索引(自动生成嵌入)
- `search`: 搜索代码(自动混合模式)
- `search_files`: 搜索文件路径
**移除的高级操作**(仍可通过 CLI 使用):
- ~~`symbol`~~: 符号提取 → 使用 `codexlens symbol`
- ~~`status`~~: 状态检查 → 使用 `codexlens status`
- ~~`config_show/set/migrate`~~: 配置管理 → 使用 `codexlens config`
- ~~`clean`~~: 清理索引 → 使用 `codexlens clean`
- ~~`bootstrap/check`~~: 安装管理 → 自动处理
**简化的 ParamsSchema**
```typescript
const ParamsSchema = z.object({
action: z.enum(['init', 'search', 'search_files']),
path: z.string().optional(),
query: z.string().optional(),
mode: z.enum(['auto', 'text', 'semantic', 'exact', 'fuzzy', 'hybrid', 'vector', 'pure-vector']).default('auto'),
languages: z.array(z.string()).optional(),
limit: z.number().default(20),
});
```
#### 2.2 扩展 mode 枚举并设置默认值
**模式支持**
```typescript
mode: z.enum(['auto', 'text', 'semantic', 'exact', 'fuzzy', 'hybrid', 'vector', 'pure-vector']).default('auto')
```
**模式映射**MCP → CLI
```typescript
const modeMap: Record<string, string> = {
'text': 'exact',
'semantic': 'pure-vector',
'auto': 'auto', // 默认:自动检测
'exact': 'exact',
'fuzzy': 'fuzzy',
'hybrid': 'hybrid',
'vector': 'vector',
'pure-vector': 'pure-vector',
};
```
#### 2.3 传递 mode 参数到 CLI
```typescript
const args = ['search', query, '--limit', limit.toString(), '--mode', cliMode, '--json'];
```
### 3. 文档更新 (`.claude/rules/context-requirements.md`)
#### 3.1 更新 init 说明
强调自动嵌入生成功能:
```markdown
**NEW**: `init` automatically generates vector embeddings if semantic dependencies are installed (fastembed).
- Auto-detects if `numpy` and `fastembed` are available
- Uses "code" model by default (jinaai/jina-embeddings-v2-base-code)
- Skip with `--no-embeddings` flag if needed
```
#### 3.2 更新 search 说明
强调自动混合模式:
```markdown
**Search Code** (Auto Hybrid Mode - DEFAULT):
# Simple call - auto-detects mode (hybrid if embeddings exist, exact otherwise):
codex_lens(action="search", query="authentication", path=".", limit=20)
```
#### 3.3 详细模式说明
添加完整的模式列表和默认行为说明:
- `auto`: **DEFAULT** - Uses hybrid if embeddings exist, exact otherwise
- `hybrid`: Exact + Fuzzy + Vector fusion (best results, auto-selected if embeddings exist)
- 其他模式...
## 使用示例
### 场景 1:首次使用(已安装 fastembed)
```bash
# 初始化索引(自动生成嵌入)
codexlens init .
# 输出:
# OK Indexed 150 files in 12 directories
#
# Generating embeddings...
# Model: code
# ✓ Generated 1234 embeddings in 45.2s
# 搜索(自动使用 hybrid 模式)
codexlens search "authentication"
# Mode: hybrid | Searched 12 directories in 15.2ms
```
### 场景 2:首次使用(未安装 fastembed)
```bash
# 初始化索引(跳过嵌入)
codexlens init .
# 输出:
# OK Indexed 150 files in 12 directories
# (无嵌入生成提示)
# 搜索(降级到 exact 模式)
codexlens search "authentication"
# Mode: exact | Searched 12 directories in 8.5ms
```
### 场景 3:手动控制
```bash
# 跳过嵌入生成
codexlens init . --no-embeddings
# 强制使用特定模式
codexlens search "auth" --mode exact
codexlens search "how to authenticate" --mode hybrid
```
### 场景 4:MCP 工具使用(简化版)
```python
# 初始化(自动生成嵌入)
codex_lens(action="init", path=".")
# 搜索(默认 auto 模式:有嵌入用 hybrid无嵌入用 exact
codex_lens(action="search", query="authentication")
# 强制混合模式
codex_lens(action="search", query="authentication", mode="hybrid")
# 强制精确模式
codex_lens(action="search", query="authenticate_user", mode="exact")
# 仅返回文件路径
codex_lens(action="search_files", query="payment processing")
```
**高级操作使用 CLI**
```bash
# 检查状态
codexlens status
# 提取符号
codexlens symbol src/auth/login.js
# 配置管理
codexlens config show
codexlens config set index_dir /custom/path
# 清理索引
codexlens clean .
```
## 技术细节
### 嵌入检测逻辑
1. 查找项目在 registry 中的记录
2. 获取索引路径 `index_root/_index.db`
3. 调用 `check_embeddings_status()` 检查:
- 是否存在 `chunks`
- `chunks_count > 0`
4. 根据检测结果选择模式
### 混合搜索权重
默认 RRF 权重:
- Exact FTS: 0.4
- Fuzzy FTS: 0.3
- Vector: 0.3
可通过 `--weights` 参数自定义:
```bash
codexlens search "query" --mode hybrid --weights 0.5,0.3,0.2
```
### 模型选项
| 模型 | 模型名称 | 维度 | 大小 | 推荐场景 |
|------|---------|------|------|---------|
| fast | BAAI/bge-small-en-v1.5 | 384 | ~80MB | 快速原型 |
| code | jinaai/jina-embeddings-v2-base-code | 768 | ~150MB | **推荐** 代码搜索 |
| multilingual | intfloat/multilingual-e5-large | 1024 | ~1GB | 多语言项目 |
| balanced | mixedbread-ai/mxbai-embed-large-v1 | 1024 | ~600MB | 平衡性能 |
## 兼容性
### 向后兼容
- 所有现有命令仍然工作
- 手动指定 `--mode` 会覆盖自动检测
- 使用 `--no-embeddings` 可恢复旧行为
### 依赖要求
**核心功能**(无需额外依赖):
- FTS 索引exact, fuzzy
- 符号提取
**语义搜索功能**(需要安装):
```bash
pip install codexlens[semantic]
# 或
pip install numpy fastembed
```
## 性能影响
### 初始化时间
- FTS 索引:~2-5 秒(100 文件)
- 嵌入生成:+30-60 秒(首次下载模型)
- 后续嵌入:+10-20 秒
### 搜索性能
| 模式 | 延迟 | 召回率 | 推荐场景 |
|------|------|--------|---------|
| exact | 5ms | 中 | 精确代码标识符 |
| fuzzy | 7ms | 中 | 容错搜索 |
| hybrid | 15ms | **最高** | **通用搜索(推荐)** |
| vector | 12ms | 高 | 语义查询 |
| pure-vector | 10ms | 中 | 自然语言 |
## 最小化修改原则
所有修改都遵循最小化原则:
1. **保持向后兼容**:不破坏现有功能
2. **默认智能**:自动检测最佳模式
3. **用户可控**:可通过参数覆盖自动行为
4. **渐进增强**:未安装 fastembed 时优雅降级
## 总结
✅ **init 命令自动生成嵌入**(可用 `--no-embeddings` 跳过)
✅ **search 命令默认使用混合模式**(有嵌入时自动启用)
✅ **MCP 工具简化为核心操作**(init, search, search_files)
✅ **所有搜索模式支持**(auto, exact, fuzzy, hybrid, vector, pure-vector)
✅ **文档已更新**,反映新的默认行为
✅ **保持向后兼容性**
✅ **优雅降级**(无 fastembed 时使用 exact 模式)
### MCP vs CLI 功能对比
| 功能 | MCP 工具 | CLI |
|------|---------|-----|
| 初始化索引 | ✅ `codex_lens(action="init")` | ✅ `codexlens init` |
| 搜索代码 | ✅ `codex_lens(action="search")` | ✅ `codexlens search` |
| 搜索文件 | ✅ `codex_lens(action="search_files")` | ✅ `codexlens search --files-only` |
| 检查状态 | ❌ 使用 CLI | ✅ `codexlens status` |
| 提取符号 | ❌ 使用 CLI | ✅ `codexlens symbol` |
| 配置管理 | ❌ 使用 CLI | ✅ `codexlens config` |
| 清理索引 | ❌ 使用 CLI | ✅ `codexlens clean` |
**设计理念**MCP 工具专注于高频核心操作(索引、搜索),高级管理操作通过 CLI 执行。

View File

@@ -1,298 +0,0 @@
# CodexLens 配置说明
## 目录结构
```
~/.codexlens/ # 全局数据目录
├── .env # 全局 API 配置 (新增)
├── settings.json # 运行时设置
├── embedding_lock.json # 模型锁定文件
├── registry.db # 项目注册表
├── indexes/ # 集中式索引存储
└── venv/ # Python 虚拟环境
project/
├── .codexlens/ # 工作区本地目录
│ ├── .env # 工作区 API 配置 (覆盖全局)
│ ├── index.db # 项目索引数据库
│ ├── cache/ # 缓存目录
│ └── .gitignore # 排除敏感文件
└── .env # 项目根目录配置
```
## 配置优先级
配置加载顺序 (后者覆盖前者):
| 优先级 | 位置 | 说明 |
|--------|------|------|
| 1 (最低) | `~/.codexlens/.env` | 全局默认配置 |
| 2 | `project/.env` | 项目根目录配置 |
| 3 | `project/.codexlens/.env` | 工作区本地配置 |
| 4 (最高) | 环境变量 | Shell 环境变量 |
## 环境变量
### Embedding 配置
用于 `litellm` 后端的嵌入向量服务:
```bash
# API 密钥
EMBEDDING_API_KEY=your-api-key
# API 基础 URL
EMBEDDING_API_BASE=https://api.example.com/v1
# 嵌入模型名称
EMBEDDING_MODEL=text-embedding-3-small
```
**支持的提供商示例**:
| 提供商 | API Base | 模型示例 |
|--------|----------|----------|
| OpenAI | `https://api.openai.com/v1` | `text-embedding-3-small` |
| ModelScope | `https://api-inference.modelscope.cn/v1` | `Qwen/Qwen3-Embedding-8B` |
| Azure | `https://your-resource.openai.azure.com` | `text-embedding-ada-002` |
### LiteLLM 配置
用于 LLM 功能 (重排序、语义分析等):
```bash
# API 密钥
LITELLM_API_KEY=your-api-key
# API 基础 URL
LITELLM_API_BASE=https://api.example.com/v1
# 模型名称
LITELLM_MODEL=gpt-4o-mini
```
### Reranker 配置
用于搜索结果重排序 (可选):
```bash
# API 密钥
RERANKER_API_KEY=your-api-key
# API 基础 URL
RERANKER_API_BASE=https://api.siliconflow.cn
# 提供商: siliconflow, cohere, jina
RERANKER_PROVIDER=siliconflow
# 重排序模型
RERANKER_MODEL=BAAI/bge-reranker-v2-m3
```
### 通用配置
```bash
# 自定义数据目录 (默认: ~/.codexlens)
CODEXLENS_DATA_DIR=~/.codexlens
# 启用调试模式
CODEXLENS_DEBUG=false
```
## settings.json
运行时设置保存在 `~/.codexlens/settings.json`:
```json
{
"embedding": {
"backend": "litellm",
"model": "Qwen/Qwen3-Embedding-8B",
"use_gpu": false,
"endpoints": [
{
"model": "Qwen/Qwen3-Embedding-8B",
"api_key": "${EMBEDDING_API_KEY}",
"api_base": "${EMBEDDING_API_BASE}",
"weight": 1.0
}
],
"strategy": "latency_aware",
"cooldown": 60.0
},
"llm": {
"enabled": true,
"tool": "gemini",
"timeout_ms": 300000,
"batch_size": 5
},
"parsing": {
"use_astgrep": false
},
"indexing": {
"static_graph_enabled": false,
"static_graph_relationship_types": ["imports", "inherits"]
}
}
```
### Embedding 设置
| 字段 | 类型 | 说明 |
|------|------|------|
| `backend` | string | `fastembed` (本地) 或 `litellm` (API) |
| `model` | string | 模型名称或配置文件 |
| `use_gpu` | bool | GPU 加速 (仅 fastembed) |
| `endpoints` | array | 多端点配置 (仅 litellm) |
| `strategy` | string | 负载均衡策略 |
| `cooldown` | float | 限流冷却时间 (秒) |
**Embedding Backend 对比**:
| 特性 | fastembed | litellm |
|------|-----------|---------|
| 运行方式 | 本地 ONNX | API 调用 |
| 依赖 | 本地模型文件 | API 密钥 |
| 速度 | 快 (本地) | 取决于网络 |
| 模型选择 | 预定义配置文件 | 任意 API 模型 |
| GPU 支持 | 是 | N/A |
**负载均衡策略**:
| 策略 | 说明 |
|------|------|
| `round_robin` | 轮询分配 |
| `latency_aware` | 延迟感知 (推荐) |
| `weighted_random` | 加权随机 |
### LLM 设置
| 字段 | 类型 | 说明 |
|------|------|------|
| `enabled` | bool | 启用 LLM 功能 |
| `tool` | string | LLM 工具 (`gemini`, `codex`) |
| `timeout_ms` | int | 超时时间 (毫秒) |
| `batch_size` | int | 批处理大小 |
### Parsing 设置
| 字段 | 类型 | 说明 |
|------|------|------|
| `use_astgrep` | bool | 优先使用 ast-grep 解析关系(实验性;当前主要用于 Python relationships |
### Indexing 设置(静态图)
| 字段 | 类型 | 说明 |
|------|------|------|
| `static_graph_enabled` | bool | 索引时将 relationships 写入全局 `global_relationships`,用于搜索阶段静态图扩展 |
| `static_graph_relationship_types` | array | 允许持久化的关系类型:`imports` / `inherits` / `calls` |
**CLI 覆盖(单次运行,不写入 settings.json)**:
```bash
# 索引时启用静态图 relationships + 使用 ast-grep如果可用
codexlens index init --use-astgrep --static-graph --static-graph-types imports,inherits,calls
```
**Search staged 静态图扩展(高级)**:
```bash
codexlens search --cascade-strategy staged --staged-stage2-mode static_global_graph
```
## FastEmbed 模型配置文件
使用 `fastembed` 后端时的预定义模型:
| 配置文件 | 模型 | 维度 | 大小 |
|----------|------|------|------|
| `fast` | BAAI/bge-small-en-v1.5 | 384 | 80MB |
| `base` | BAAI/bge-base-en-v1.5 | 768 | 220MB |
| `code` | jinaai/jina-embeddings-v2-base-code | 768 | 150MB |
| `minilm` | sentence-transformers/all-MiniLM-L6-v2 | 384 | 90MB |
| `multilingual` | intfloat/multilingual-e5-large | 1024 | 1000MB |
| `balanced` | mixedbread-ai/mxbai-embed-large-v1 | 1024 | 600MB |
## 快速开始
### 1. 使用全局配置
创建 `~/.codexlens/.env`:
```bash
# 复制示例配置
cp codex-lens/.env.example ~/.codexlens/.env
# 编辑配置
nano ~/.codexlens/.env
```
### 2. 使用本地嵌入 (fastembed)
```bash
# 初始化索引 (使用 code 配置文件)
codexlens init --backend fastembed --model code
# 或使用多语言模型
codexlens init --backend fastembed --model multilingual
```
### 3. 使用 API 嵌入 (litellm)
```bash
# 设置环境变量
export EMBEDDING_API_KEY=your-key
export EMBEDDING_API_BASE=https://api.example.com/v1
export EMBEDDING_MODEL=text-embedding-3-small
# 初始化索引
codexlens init --backend litellm --model text-embedding-3-small
```
### 4. 验证配置
```bash
# 检查配置加载
codexlens config show
# 测试嵌入
codexlens test-embedding "Hello World"
```
## 故障排除
### 配置未加载
检查文件权限和路径:
```bash
ls -la ~/.codexlens/.env
cat ~/.codexlens/.env
```
### API 错误
1. 验证 API 密钥有效性
2. 检查 API Base URL 是否正确
3. 确认模型名称匹配提供商支持的模型
### 模型不兼容
如果更换嵌入模型,需要重建索引:
```bash
# 删除旧索引
rm -rf project/.codexlens/
# 重新初始化
codexlens init --backend litellm --model new-model
```
## 相关文件
| 文件 | 说明 |
|------|------|
| `src/codexlens/config.py` | 配置类定义 |
| `src/codexlens/env_config.py` | 环境变量加载 |
| `src/codexlens/cli/model_manager.py` | FastEmbed 模型管理 |
| `src/codexlens/semantic/factory.py` | Embedder 工厂 |

File diff suppressed because it is too large Load Diff

View File

@@ -1,540 +0,0 @@
# Hybrid Search Architecture for CodexLens
> Embedding + Real-time LSP + Clustering + Reranking Pipeline
## Overview
This document describes the architecture for a hybrid intelligent code search system that combines:
1. **Low-dimensional embedding model** for semantic search
2. **Real-time LSP integration** for code structure analysis
3. **Graph-based clustering** for result organization
4. **Multi-factor reranking** for intelligent sorting
**Key Constraint**: Must use real-time LSP servers, NOT pre-indexed data.
## Architecture Diagram
```
┌─────────────────────────────────────────────────────────────────────────────┐
│ HybridSearchEngine │
│ ┌─────────────────────────────────────────────────────────────────────┐ │
│ │ 5-Stage Search Pipeline │ │
│ │ │ │
│ │ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌────┐│ │
│ │ │ Stage 1 │──▶│ Stage 2 │──▶│ Stage 3 │──▶│ Stage 4 │──▶│ S5 ││ │
│ │ │ Vector │ │ LSP │ │ Graph │ │Clustering│ │Rank││ │
│ │ │ Search │ │Expansion │ │ Building │ │ +Filter │ │ ││ │
│ │ └──────────┘ └──────────┘ └──────────┘ └──────────┘ └────┘│ │
│ └─────────────────────────────────────────────────────────────────────┘ │
│ │
│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────────────────┐ │
│ │VectorSearchSvc │ │ LspBridge │ │ GraphBuilder │ │
│ │ │ │ │ │ │ │
│ │ • Embedding │ │ • get_refs() │ │ • build_from_seeds() │ │
│ │ • FAISS/HNSW │ │ • get_def() │ │ • add_relationships() │ │
│ │ • search() │ │ • get_calls() │ │ • CodeAssociationGraph │ │
│ └────────┬────────┘ └────────┬────────┘ └─────────────────────────────┘ │
│ │ │ │
└───────────┼────────────────────┼────────────────────────────────────────────┘
│ │
▼ ▼
┌───────────────┐ ┌───────────────────────────────────────┐
│ Embedding │ │ LanguageServerMultiplexer │
│ Model (local) │ │ (from REAL_LSP_SERVER_PLAN.md) │
│ │ │ │
│ sentence- │ │ ┌─────┐ ┌─────┐ ┌─────┐ ┌──────────┐│
│ transformers │ │ │pylsp│ │gopls│ │tssvr│ │rust-anlzr││
│ │ │ └─────┘ └─────┘ └─────┘ └──────────┘│
└───────────────┘ └───────────────────────────────────────┘
```
## Core Components
### 1. HybridSearchEngine (`hybrid_search/engine.py`)
**Role**: Main orchestrator coordinating all services
```python
class HybridSearchEngine:
def __init__(self):
self.vector_service: VectorSearchService
self.lsp_bridge: LspBridge
self.graph_builder: GraphBuilder
self.clustering_service: ClusteringService
self.ranking_service: RankingService
async def search(self, query: str, top_k: int = 10) -> List[SearchResultCluster]:
# Stage 1: Vector search for seeds
seeds = await self.vector_service.search(query, top_k=top_k * 2)
# Stage 2-3: LSP expansion + Graph building
graph = await self.graph_builder.build_from_seeds(seeds, self.lsp_bridge)
# Stage 4: Clustering + Filtering
clusters = self.clustering_service.cluster(graph)
clusters = self.clustering_service.filter_noise(clusters)
# Stage 5: Reranking
ranked = self.ranking_service.rerank(clusters, seeds, query)
return ranked[:top_k]
```
### 2. Data Structures (`hybrid_search/data_structures.py`)
```python
@dataclass
class CodeSymbolNode:
"""Graph node representing a code symbol"""
id: str # Unique: file_path:name:line
name: str # Symbol name
kind: str # function, class, method, variable
file_path: str # Absolute file path
range: Range # Start/end line and character
embedding: Optional[List[float]] = None
raw_code: str = ""
docstring: str = ""
@dataclass
class CodeAssociationGraph:
"""Graph of code relationships"""
nodes: Dict[str, CodeSymbolNode]
edges: List[Tuple[str, str, str]] # (from_id, to_id, relationship_type)
# relationship_type: 'calls', 'references', 'inherits', 'imports'
def to_networkx(self) -> nx.DiGraph:
"""Convert to NetworkX for algorithms"""
...
@dataclass
class SearchResultCluster:
"""Clustered search result"""
cluster_id: str
score: float
title: str # AI-generated summary (optional)
symbols: List[CodeSymbolNode]
metadata: Dict[str, Any]
```
### 3. VectorSearchService (`services/vector_search.py`)
**Role**: Semantic search using embeddings
```python
class VectorSearchService:
def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
self.model = SentenceTransformer(model_name) # 384-dim, fast
self.index: faiss.IndexFlatIP # or hnswlib for larger scale
self.id_to_symbol: Dict[str, CodeSymbolNode]
async def index_codebase(self, symbols: List[CodeSymbolNode]):
"""Build/update vector index from symbols"""
texts = [f"{s.name} {s.docstring} {s.raw_code[:500]}" for s in symbols]
embeddings = self.model.encode(texts, normalize_embeddings=True)
self.index.add(embeddings)
async def search(self, query: str, top_k: int) -> List[CodeSymbolNode]:
"""Find semantically similar symbols"""
query_vec = self.model.encode([query], normalize_embeddings=True)
scores, indices = self.index.search(query_vec, top_k)
return [self.id_to_symbol[i] for i in indices[0]]
```
**Embedding Model Selection**:
| Model | Dimensions | Speed | Quality |
|-------|-----------|-------|---------|
| all-MiniLM-L6-v2 | 384 | Fast | Good |
| all-mpnet-base-v2 | 768 | Medium | Better |
| CodeBERT | 768 | Medium | Code-optimized |
### 4. LspBridge (`services/lsp_bridge.py`)
**Role**: Interface to real-time language servers via LanguageServerMultiplexer
```python
class LspBridge:
def __init__(self, multiplexer_url: str = "http://localhost:3458"):
self.multiplexer_url = multiplexer_url
self.cache: Dict[str, CacheEntry] = {} # file_path -> (mtime, data)
self.session = aiohttp.ClientSession()
async def get_references(self, symbol: CodeSymbolNode) -> List[Location]:
"""Get all references to a symbol (real-time LSP)"""
cache_key = f"refs:{symbol.id}"
if self._is_cached(cache_key, symbol.file_path):
return self.cache[cache_key].data
response = await self._lsp_request("textDocument/references", {
"textDocument": {"uri": f"file://{symbol.file_path}"},
"position": {"line": symbol.range.start.line,
"character": symbol.range.start.character},
"context": {"includeDeclaration": True}
})
locations = self._parse_locations(response)
self._cache(cache_key, symbol.file_path, locations)
return locations
async def get_call_hierarchy(self, symbol: CodeSymbolNode) -> List[CallHierarchyItem]:
"""Get incoming/outgoing calls (if supported by language server)"""
try:
# Prepare call hierarchy
items = await self._lsp_request("textDocument/prepareCallHierarchy", {...})
if not items:
# Fallback to references if callHierarchy not supported
return await self._fallback_to_references(symbol)
# Get incoming calls
incoming = await self._lsp_request("callHierarchy/incomingCalls",
{"item": items[0]})
return incoming
except LspCapabilityNotSupported:
return await self._fallback_to_references(symbol)
async def get_definition(self, symbol: CodeSymbolNode) -> Optional[Location]:
"""Get symbol definition location"""
...
async def get_hover(self, symbol: CodeSymbolNode) -> Optional[str]:
"""Get hover documentation"""
...
```
**Caching Strategy**:
- Cache key: `{operation}:{symbol_id}`
- Invalidation: Check file modification time
- TTL: 5 minutes for frequently accessed files
**Concurrency Control**:
- Max concurrent LSP requests: 10
- Request timeout: 2 seconds
- Batch requests where possible
### 5. GraphBuilder (`graph/builder.py`)
**Role**: Build code association graph from seeds using LSP
```python
class GraphBuilder:
def __init__(self, max_depth: int = 2, max_nodes: int = 100):
self.max_depth = max_depth
self.max_nodes = max_nodes
async def build_from_seeds(
self,
seeds: List[CodeSymbolNode],
lsp_bridge: LspBridge
) -> CodeAssociationGraph:
"""Build association graph by expanding from seed nodes"""
graph = CodeAssociationGraph()
visited: Set[str] = set()
queue: List[Tuple[CodeSymbolNode, int]] = [(s, 0) for s in seeds]
# Parallel expansion with semaphore
sem = asyncio.Semaphore(10)
async def expand_node(node: CodeSymbolNode, depth: int):
if node.id in visited or depth > self.max_depth:
return
if len(graph.nodes) >= self.max_nodes:
return
visited.add(node.id)
graph.add_node(node)
async with sem:
# Get relationships in parallel
refs, calls = await asyncio.gather(
lsp_bridge.get_references(node),
lsp_bridge.get_call_hierarchy(node),
return_exceptions=True
)
# Add edges
for ref in refs:
ref_node = await self._location_to_node(ref, lsp_bridge)
graph.add_edge(node.id, ref_node.id, "references")
queue.append((ref_node, depth + 1))
for call in calls:
call_node = await self._call_to_node(call, lsp_bridge)
graph.add_edge(call_node.id, node.id, "calls")
queue.append((call_node, depth + 1))
# BFS expansion
while queue and len(graph.nodes) < self.max_nodes:
batch = queue[:10]
queue = queue[10:]
await asyncio.gather(*[expand_node(n, d) for n, d in batch])
return graph
```
### 6. ClusteringService (`clustering/algorithms.py`)
**Role**: Group related code symbols and filter noise
```python
class ClusteringService:
def __init__(self, resolution: float = 1.0):
self.resolution = resolution # Higher = smaller clusters
def cluster(self, graph: CodeAssociationGraph) -> List[SearchResultCluster]:
"""Apply Louvain community detection"""
nx_graph = graph.to_networkx()
# Louvain algorithm
communities = community_louvain.best_partition(
nx_graph,
resolution=self.resolution
)
# Group nodes by community
clusters: Dict[int, List[CodeSymbolNode]] = defaultdict(list)
for node_id, community_id in communities.items():
clusters[community_id].append(graph.nodes[node_id])
return [
SearchResultCluster(
cluster_id=f"cluster_{cid}",
symbols=nodes,
score=0.0, # Will be set by RankingService
title="",
metadata={"size": len(nodes)}
)
for cid, nodes in clusters.items()
]
def filter_noise(self, clusters: List[SearchResultCluster]) -> List[SearchResultCluster]:
"""Remove noisy clusters and symbols"""
filtered = []
for cluster in clusters:
# Filter high-degree generic nodes
cluster.symbols = [
s for s in cluster.symbols
if not self._is_generic_symbol(s)
]
# Keep clusters with minimum size
if len(cluster.symbols) >= 2:
filtered.append(cluster)
return filtered
def _is_generic_symbol(self, symbol: CodeSymbolNode) -> bool:
"""Check if symbol is too generic (log, print, etc.)"""
generic_names = {'log', 'print', 'debug', 'error', 'warn',
'get', 'set', 'init', '__init__', 'toString'}
return symbol.name.lower() in generic_names
```
### 7. RankingService (`ranking/service.py`)
**Role**: Multi-factor intelligent reranking
```python
@dataclass
class RankingWeights:
text_relevance: float = 0.4 # w1
graph_centrality: float = 0.35 # w2
structural_proximity: float = 0.25 # w3
class RankingService:
def __init__(self, weights: RankingWeights = None):
self.weights = weights or RankingWeights()
def rerank(
self,
clusters: List[SearchResultCluster],
seeds: List[CodeSymbolNode],
query: str
) -> List[SearchResultCluster]:
"""Rerank clusters using multi-factor scoring"""
seed_ids = {s.id for s in seeds}
for cluster in clusters:
# Build cluster subgraph for centrality
subgraph = self._build_subgraph(cluster)
pagerank = nx.pagerank(subgraph)
for symbol in cluster.symbols:
# Factor 1: Text relevance (from vector search)
text_score = self._compute_text_relevance(symbol, query)
# Factor 2: Graph centrality (PageRank in cluster)
centrality_score = pagerank.get(symbol.id, 0.0)
# Factor 3: Structural proximity to seeds
proximity_score = self._compute_proximity(symbol, seed_ids, subgraph)
# Combined score
symbol.score = (
self.weights.text_relevance * text_score +
self.weights.graph_centrality * centrality_score +
self.weights.structural_proximity * proximity_score
)
# Cluster score = max symbol score
cluster.score = max(s.score for s in cluster.symbols)
cluster.symbols.sort(key=lambda s: s.score, reverse=True)
# Sort clusters by score
clusters.sort(key=lambda c: c.score, reverse=True)
return clusters
def _compute_proximity(
self,
symbol: CodeSymbolNode,
seed_ids: Set[str],
graph: nx.DiGraph
) -> float:
"""Compute proximity score based on shortest path to seeds"""
if symbol.id in seed_ids:
return 1.0
min_distance = float('inf')
for seed_id in seed_ids:
try:
distance = nx.shortest_path_length(graph, seed_id, symbol.id)
min_distance = min(min_distance, distance)
except nx.NetworkXNoPath:
continue
if min_distance == float('inf'):
return 0.0
# Inverse distance scoring (closer = higher)
return 1.0 / (1.0 + min_distance)
```
## API Design
### Endpoint: `POST /api/v1/hybrid-search`
**Request**:
```json
{
"query": "user authentication flow",
"top_k": 10,
"config_overrides": {
"ranking_weights": {"w1": 0.5, "w2": 0.3, "w3": 0.2},
"max_graph_depth": 2,
"clustering_resolution": 1.0
}
}
```
**Response**:
```json
{
"query_id": "hs-20250120-001",
"execution_time_ms": 1250,
"results": [
{
"cluster_id": "cluster_0",
"score": 0.92,
"title": "User Authentication Handler",
"symbols": [
{
"id": "src/auth/handler.py:authenticate:45",
"name": "authenticate",
"kind": "function",
"file_path": "src/auth/handler.py",
"range": {"start": {"line": 45, "char": 0}, "end": {"line": 78, "char": 0}},
"score": 0.95,
"raw_code": "async def authenticate(request: Request):\n ..."
},
{
"id": "src/auth/handler.py:validate_token:80",
"name": "validate_token",
"kind": "function",
"file_path": "src/auth/handler.py",
"score": 0.88,
"raw_code": "def validate_token(token: str) -> bool:\n ..."
}
]
}
]
}
```
## Implementation Priorities
### P0 - Core Infrastructure (Week 1-2)
1. **HybridSearchEngine skeleton** - Basic orchestration without all features
2. **LspBridge with caching** - Connect to LanguageServerMultiplexer
3. **GraphBuilder basic** - Seed expansion with references only
4. **Integration test** - Verify LSP communication works
### P1 - Search Pipeline (Week 2-3)
1. **VectorSearchService** - Embedding model + FAISS index
2. **ClusteringService** - Louvain algorithm + noise filtering
3. **End-to-end pipeline** - Query to clustered results
### P2 - Ranking & API (Week 3-4)
1. **RankingService** - Multi-factor scoring
2. **API endpoint** - FastAPI integration
3. **Performance optimization** - Caching, parallelization, timeouts
4. **Configuration system** - Dynamic weight adjustment
## Performance Targets
| Metric | Target | Strategy |
|--------|--------|----------|
| End-to-end latency | < 2s | Parallel LSP calls, aggressive caching |
| Vector search | < 100ms | FAISS with GPU (optional) |
| LSP expansion | < 1s | Max 10 concurrent requests, 2s timeout |
| Clustering | < 200ms | Limit graph size to 100 nodes |
| Reranking | < 100ms | Pre-computed embeddings |
## Dependencies
### External
- LanguageServerMultiplexer (from REAL_LSP_SERVER_PLAN.md)
- Language servers: pylsp, tsserver, gopls, rust-analyzer
### Python Libraries
- `sentence-transformers` - Embedding models
- `faiss-cpu` or `hnswlib` - Vector indexing
- `networkx` - Graph algorithms
- `python-louvain` - Community detection
- `aiohttp` - Async HTTP client
## File Structure
```
src/codexlens/
├── hybrid_search/
│ ├── __init__.py
│ ├── engine.py # HybridSearchEngine
│ ├── pipeline.py # Pipeline stage definitions
│ └── data_structures.py # CodeSymbolNode, Graph, Cluster
├── services/
│ ├── vector_search.py # VectorSearchService
│ └── lsp_bridge.py # LspBridge
├── graph/
│ └── builder.py # GraphBuilder
├── clustering/
│ └── algorithms.py # ClusteringService
├── ranking/
│ └── service.py # RankingService
├── api/
│ └── endpoints.py # API routes
└── configs/
└── hybrid_search_config.py
```
## Risk Mitigation
| Risk | Impact | Mitigation |
|------|--------|------------|
| LSP timeout | High | Fallback to vector-only results |
| LSP not available | High | Graceful degradation to CodexLens index |
| Large codebases | Medium | Limit graph expansion, pagination |
| Language server crash | Medium | Auto-restart, circuit breaker |
| Clustering quality | Low | Tunable resolution parameter |
---
*Generated from Gemini analysis (Session: 1768836775699-gemini)*
*Date: 2025-01-20*

View File

@@ -1,363 +0,0 @@
# CodexLens Real LSP Implementation - Summary
> **Date**: 2026-01-19
> **Status**: Planning Complete, Implementation Ready
> **Focus**: Real LSP Server + VSCode Bridge Integration
---
## ✅ Completed Work
### 1. Planning Documents
#### a. Main Implementation Plan
**File**: `docs/REAL_LSP_SERVER_PLAN.md`
**Content**:
- Complete architecture design for real LSP server
- 5-phase implementation plan
- Multi-language support strategy (TypeScript, Python, Go, Rust, Java, C/C++)
- Language server multiplexer design
- Position tolerance feature (cclsp-like)
- MCP integration layer
**Key Decisions**:
- Use `pygls` library for LSP implementation
- Support 6+ language servers via multiplexer
- Implement position tolerance for fuzzy AI-generated positions
- Three integration paths: Standalone LSP, VSCode Bridge, Index-based fallback
#### b. VSCode Bridge Implementation (Appendix A)
**Included in**: `docs/REAL_LSP_SERVER_PLAN.md`
**Content**:
- HTTP-based VSCode extension bridge
- MCP tool integration (vscode_lsp)
- Complete architecture diagram
- API endpoint specifications
- Comparison with standalone LSP approach
### 2. VSCode Bridge Extension
#### Created Files:
1. **`ccw-vscode-bridge/package.json`**
- VSCode extension manifest
- Dependencies: @types/node, @types/vscode, typescript
2. **`ccw-vscode-bridge/tsconfig.json`**
- TypeScript compilation configuration
- Target: ES2020, CommonJS modules
3. **`ccw-vscode-bridge/src/extension.ts`**
- HTTP server on port 3457
- 4 API endpoints:
- `POST /get_definition`
- `POST /get_references`
- `POST /get_hover`
- `POST /get_document_symbols`
- VSCode API integration via `vscode.commands.executeCommand`
4. **`ccw-vscode-bridge/.vscodeignore`**
- Build artifact exclusion rules
5. **`ccw-vscode-bridge/README.md`**
- Installation & usage instructions
- API endpoint documentation
#### Features:
- ✅ Real-time VSCode LSP integration
- ✅ HTTP REST API for external tools
- ✅ CORS support
- ✅ Error handling
- ✅ Automatic VSCode feature detection
### 3. CCW MCP Tool
#### Created File:
**`ccw/src/tools/vscode-lsp.ts`**
**Features**:
- ✅ 4 LSP actions: get_definition, get_references, get_hover, get_document_symbols
- ✅ Zod schema validation
- ✅ HTTP client with timeout (10s)
- ✅ Connection retry logic
- ✅ Comprehensive error messages
**Parameters**:
- `action` (required): LSP action type
- `file_path` (required): Absolute file path
- `line` (optional): Line number (1-based)
- `character` (optional): Character position (1-based)
#### Integration:
**Modified File**: `ccw/src/tools/index.ts`
- ✅ Imported `vscodeLspMod`
- ✅ Registered tool via `registerTool(toLegacyTool(vscodeLspMod))`
- ✅ Available in MCP server tool list
---
## 📋 Implementation Architecture
### Three Integration Paths
```
Path 1: VSCode Bridge (✅ Implemented)
─────────────────────────────────────
Claude Code → vscode_lsp MCP tool → HTTP → ccw-vscode-bridge → VSCode API → Language Servers
Path 2: Standalone LSP Server (📝 Planned)
──────────────────────────────────────────
Any LSP Client → codexlens-lsp → Language Server Multiplexer → Language Servers
Path 3: Index-Based (✅ Existing)
─────────────────────────────────
Claude Code → codex_lens_lsp → Python API → SQLite Index → Cached Results
```
### Smart Routing Strategy
```javascript
// Priority: VSCode Bridge → Standalone LSP → Index-based
if (vscodeBridgeAvailable) {
return useVSCodeBridge();
} else if (standaloneLSPAvailable) {
return useStandaloneLSP();
} else {
return useIndexBased();
}
```
---
## 🎯 Next Steps
### Immediate Actions (Phase 1)
1. **Test VSCode Bridge**
```bash
cd ccw-vscode-bridge
npm install
npm run compile
# Press F5 in VSCode to launch extension
```
2. **Test vscode_lsp Tool**
```bash
# Start CCW MCP server
cd ccw
npm run mcp
# Test via MCP client
{
"tool": "vscode_lsp",
"arguments": {
"action": "get_definition",
"file_path": "/path/to/file.ts",
"line": 10,
"character": 5
}
}
```
3. **Document Testing Results**
- Create test reports
- Benchmark latency
- Validate accuracy
### Medium-Term Goals (Phase 2-3)
1. **Implement Standalone LSP Server**
- Setup `codexlens-lsp` project structure
- Implement language server multiplexer
- Add core LSP handlers
2. **Add Position Tolerance**
- Implement fuzzy position matching
- Test with AI-generated positions
3. **Create Integration Tests**
- Unit tests for each component
- E2E tests with real language servers
- Performance benchmarks
### Long-Term Goals (Phase 4-5)
1. **MCP Context Enhancement**
- Integrate LSP results into MCP context
- Hook system for Claude Code
2. **Advanced Features**
- Code actions
- Formatting
- Rename support
3. **Production Deployment**
- Package VSCode extension to .vsix
- Publish to VS Code marketplace
- Create installation scripts
---
## 📊 Project Status Matrix
| Component | Status | Files | Tests | Docs |
|-----------|--------|-------|-------|------|
| VSCode Bridge Extension | ✅ Complete | 5/5 | ⏳ Pending | ✅ Complete |
| vscode_lsp MCP Tool | ✅ Complete | 1/1 | ⏳ Pending | ✅ Complete |
| Tool Registration | ✅ Complete | 1/1 | N/A | N/A |
| Planning Documents | ✅ Complete | 2/2 | N/A | ✅ Complete |
| Standalone LSP Server | 📝 Planned | 0/8 | 0/12 | ✅ Complete |
| Integration Tests | 📝 Planned | 0/3 | 0/15 | ⏳ Pending |
---
## 🔧 Development Environment
### Prerequisites
**For VSCode Bridge**:
- Node.js ≥ 18
- VSCode ≥ 1.80
- TypeScript ≥ 5.0
**For Standalone LSP**:
- Python ≥ 3.8
- pygls ≥ 1.3.0
- Language servers:
- TypeScript: `npm i -g typescript-language-server`
- Python: `pip install python-lsp-server`
- Go: `go install golang.org/x/tools/gopls@latest`
- Rust: `rustup component add rust-analyzer`
### Installation Commands
```bash
# VSCode Bridge
cd ccw-vscode-bridge
npm install
npm run compile
# CCW MCP (already setup)
cd ccw
npm install
# Future: Standalone LSP
cd codex-lens
pip install -e ".[lsp]"
```
---
## 📖 Documentation Index
| Document | Purpose | Status |
|----------|---------|--------|
| `REAL_LSP_SERVER_PLAN.md` | Complete implementation plan | ✅ |
| `LSP_INTEGRATION_PLAN.md` | Original integration strategy | ✅ |
| `MCP_ENDPOINT_DESIGN.md` | MCP endpoint specifications | ✅ |
| `IMPLEMENTATION_SUMMARY.md` | This document | ✅ |
| `ccw-vscode-bridge/README.md` | Bridge usage guide | ✅ |
| `TESTING_GUIDE.md` | Testing procedures | ⏳ TODO |
| `DEPLOYMENT_GUIDE.md` | Production deployment | ⏳ TODO |
---
## 💡 Key Design Decisions
### 1. Why Three Integration Paths?
- **VSCode Bridge**: Easiest setup, leverages VSCode's built-in language servers
- **Standalone LSP**: IDE-agnostic, works with any LSP client
- **Index-based**: Fallback for offline or cached queries
### 2. Why HTTP for VSCode Bridge?
- ✅ Simplest cross-process communication
- ✅ No complex IPC/socket management
- ✅ Easy to debug with curl/Postman
- ✅ CORS support for web-based tools
### 3. Why Port 3457?
- Unique port unlikely to conflict
- Easy to remember (345-7)
- Dedicated local channel, analogous to cclsp's approach (cclsp itself communicates over stdio rather than a port)
### 4. Why Not Modify smart_search?
User feedback:
> "第一种跟当前的符号搜索没区别哎"
> (Method 1 has no difference from current symbol search)
**Solution**: Implement real LSP server that connects to live language servers, not pre-indexed data.
---
## 🚀 Quick Start Guide
### Test VSCode Bridge Now
1. **Install Extension**:
```bash
cd ccw-vscode-bridge
npm install && npm run compile
code --install-extension .
```
2. **Reload VSCode**:
- Press `Cmd+Shift+P` (Mac) or `Ctrl+Shift+P` (Windows)
- Type "Reload Window"
3. **Verify Bridge is Running**:
```bash
curl http://localhost:3457/get_definition \
-X POST \
-H "Content-Type: application/json" \
-d '{"file_path":"/path/to/file.ts","line":10,"character":5}'
```
4. **Test via CCW**:
```javascript
// In Claude Code or MCP client
await executeTool('vscode_lsp', {
action: 'get_definition',
file_path: '/absolute/path/to/file.ts',
line: 10,
character: 5
});
```
---
## 📞 Support & Troubleshooting
### Common Issues
**Issue**: "Could not connect to VSCode Bridge"
**Solution**:
1. Ensure VSCode is running
2. Check if extension is activated: `Cmd+Shift+P` → "CCW VSCode Bridge"
3. Verify port 3457 is not in use: `lsof -i :3457`
**Issue**: "No LSP server available"
**Solution**: Open the file in VSCode workspace first
**Issue**: "File not found"
**Solution**: Use absolute paths, not relative
---
## 📝 Change Log
### 2026-01-19 - Initial Implementation
- Created VSCode Bridge extension (5 files)
- Implemented vscode_lsp MCP tool
- Registered tool in CCW registry
- Completed planning documentation
- Added comprehensive architecture diagrams
---
**Document End**

View File

@@ -1,342 +0,0 @@
# LLM增强功能移除总结
**移除日期**: 2025-12-16
**执行者**: 用户请求
**状态**: ✅ 完成
---
## 📋 移除清单
### ✅ 已删除的源代码文件
| 文件 | 说明 |
|------|------|
| `src/codexlens/semantic/llm_enhancer.py` | LLM增强核心模块 (900+ lines) |
### ✅ 已修改的源代码文件
| 文件 | 修改内容 |
|------|---------|
| `src/codexlens/cli/commands.py` | 删除 `enhance` 命令 (lines 1050-1227) |
| `src/codexlens/semantic/__init__.py` | 删除LLM相关导出 (lines 35-69) |
### ✅ 已修改的前端文件CCW Dashboard
| 文件 | 修改内容 |
|------|---------|
| `ccw/src/templates/dashboard-js/components/cli-status.js` | 删除LLM增强设置 (8行)、Semantic Settings Modal (615行)、Metadata Viewer (326行) |
| `ccw/src/templates/dashboard-js/i18n.js` | 删除英文LLM翻译 (26行)、中文LLM翻译 (26行) |
| `ccw/src/templates/dashboard-js/views/cli-manager.js` | 移除LLM badge和设置modal调用 (3行) |
### ✅ 已删除的测试文件
| 文件 | 说明 |
|------|------|
| `tests/test_llm_enhancer.py` | LLM增强单元测试 |
| `tests/test_llm_enhanced_search.py` | LLM vs 纯向量对比测试 (550+ lines) |
### ✅ 已删除的脚本文件
| 文件 | 说明 |
|------|------|
| `scripts/compare_search_methods.py` | 纯向量 vs LLM增强对比脚本 (460+ lines) |
| `scripts/test_misleading_comments.py` | 误导性注释测试脚本 (490+ lines) |
| `scripts/show_llm_analysis.py` | LLM分析展示工具 |
| `scripts/inspect_llm_summaries.py` | LLM摘要检查工具 |
### ✅ 已删除的文档文件
| 文件 | 说明 |
|------|------|
| `docs/LLM_ENHANCED_SEARCH_GUIDE.md` | LLM增强使用指南 (460+ lines) |
| `docs/LLM_ENHANCEMENT_TEST_RESULTS.md` | LLM测试结果文档 |
| `docs/MISLEADING_COMMENTS_TEST_RESULTS.md` | 误导性注释测试结果 |
| `docs/CLI_INTEGRATION_SUMMARY.md` | CLI集成文档(包含enhance命令) |
| `docs/DOCSTRING_LLM_HYBRID_DESIGN.md` | Docstring与LLM混合策略设计 |
### ✅ 已更新的文档
| 文件 | 修改内容 |
|------|---------|
| `docs/IMPLEMENTATION_SUMMARY.md` | 添加LLM移除说明,列出已删除内容 |
### 📚 保留的设计文档(作为历史参考)
| 文件 | 说明 |
|------|------|
| `docs/DESIGN_EVALUATION_REPORT.md` | 包含LLM混合策略的技术评估报告 |
| `docs/SEMANTIC_GRAPH_DESIGN.md` | 语义图谱设计(可能提及LLM) |
| `docs/MULTILEVEL_CHUNKER_DESIGN.md` | 多层次分词器设计(可能提及LLM) |
*这些文档保留作为技术历史参考,不影响当前功能。*
---
## 🔒 移除的功能
### CLI命令
```bash
# 已移除 - 不再可用
codexlens enhance [PATH] --tool gemini --batch-size 5
# 说明:此命令用于通过CCW CLI调用Gemini/Qwen生成代码摘要
# 移除原因:减少外部依赖,简化维护
```
### Python API
```python
# 已移除 - 不再可用
from codexlens.semantic import (
LLMEnhancer,
LLMConfig,
SemanticMetadata,
FileData,
EnhancedSemanticIndexer,
create_enhancer,
create_enhanced_indexer,
)
# 移除的类和函数:
# - LLMEnhancer: LLM增强器主类
# - LLMConfig: LLM配置类
# - SemanticMetadata: 语义元数据结构
# - FileData: 文件数据结构
# - EnhancedSemanticIndexer: LLM增强索引器
# - create_enhancer(): 创建增强器的工厂函数
# - create_enhanced_indexer(): 创建增强索引器的工厂函数
```
---
## ✅ 保留的功能
### 完全保留的核心功能
| 功能 | 状态 |
|------|------|
| **纯向量搜索** | ✅ 完整保留 |
| **语义嵌入生成** | ✅ 完整保留 (`codexlens embeddings-generate`) |
| **语义嵌入状态检查** | ✅ 完整保留 (`codexlens embeddings-status`) |
| **混合搜索引擎** | ✅ 完整保留(exact + fuzzy + vector) |
| **向量存储** | ✅ 完整保留 |
| **语义分块** | ✅ 完整保留 |
| **fastembed集成** | ✅ 完整保留 |
### 可用的CLI命令
```bash
# 生成纯向量嵌入无需LLM
codexlens embeddings-generate [PATH]
# 检查嵌入状态
codexlens embeddings-status [PATH]
# 所有搜索命令
codexlens search [QUERY] --index [PATH]
# 所有索引管理命令
codexlens init [PATH]
codexlens update [PATH]
codexlens clean [PATH]
```
### 可用的Python API
```python
# 完全可用 - 纯向量搜索
from codexlens.semantic import SEMANTIC_AVAILABLE, SEMANTIC_BACKEND
from codexlens.semantic.embedder import Embedder
from codexlens.semantic.vector_store import VectorStore
from codexlens.semantic.chunker import Chunker, ChunkConfig
from codexlens.search.hybrid_search import HybridSearchEngine
# 示例:纯向量搜索
engine = HybridSearchEngine()
results = engine.search(
index_path,
query="your search query",
enable_vector=True,
pure_vector=True, # 纯向量模式
)
```
---
## 🎯 移除原因
### 1. 简化依赖
**移除的外部依赖**:
- CCW CLI (npm package)
- Gemini API (需要API密钥)
- Qwen API (可选)
**保留的依赖**:
- fastembed (ONNX-based轻量级)
- numpy
- Python标准库
### 2. 减少复杂性
- **前**: 两种搜索方式(纯向量 + LLM增强)
- **后**: 一种搜索方式(纯向量)
- 移除了900+ lines的LLM增强代码
- 移除了CLI命令和相关配置
- 移除了测试和文档
### 3. 性能考虑
| 方面 | LLM增强 | 纯向量 |
|------|---------|--------|
| **索引速度** | 慢75倍 | 基准 |
| **查询速度** | 相同 | 相同 |
| **准确率** | 相同* | 基准 |
| **成本** | API费用 | 免费 |
*在测试数据集上准确率相同(5/5),但LLM增强理论上在更复杂场景下可能更好*
### 4. 维护负担
**移除前**:
- 需要维护CCW CLI集成
- 需要处理API限流和错误
- 需要测试多个LLM后端
- 需要维护批处理逻辑
**移除后**:
- 单一嵌入引擎fastembed
- 无外部API依赖
- 更简单的错误处理
- 更容易测试
---
## 🔍 验证结果
### 导入测试
```bash
# ✅ 通过 - 语义模块正常
python -c "from codexlens.semantic import SEMANTIC_AVAILABLE; print(SEMANTIC_AVAILABLE)"
# Output: True
# ✅ 通过 - 搜索引擎正常
python -c "from codexlens.search.hybrid_search import HybridSearchEngine; print('OK')"
# Output: OK
```
### 代码清洁度验证
```bash
# ✅ 通过 - 无遗留LLM引用
grep -r "llm_enhancer\|LLMEnhancer\|LLMConfig" src/ --include="*.py"
# Output: (空)
```
### 测试结果
```bash
# ✅ 5/7通过 - 纯向量搜索基本功能正常
pytest tests/test_pure_vector_search.py -v
# 通过: 5个基本测试
# 失败: 2个嵌入测试(已知的模型维度不匹配问题,与LLM移除无关)
```
---
## 📊 统计
### 代码删除统计
| 类型 | 删除文件数 | 删除行数(估计) |
|------|-----------|-----------------|
| **源代码** | 1 | ~900 lines |
| **CLI命令** | 1 command | ~180 lines |
| **导出清理** | 1 section | ~35 lines |
| **前端代码** | 3 files | ~1000 lines |
| **测试文件** | 2 | ~600 lines |
| **脚本工具** | 4 | ~1500 lines |
| **文档** | 5 | ~2000 lines |
| **总计** | 16 files/sections | ~6200 lines |
### 依赖简化
| 方面 | 移除前 | 移除后 |
|------|--------|--------|
| **外部工具依赖** | CCW CLI, Gemini/Qwen | 无 |
| **Python包依赖** | fastembed, numpy | fastembed, numpy |
| **API依赖** | Gemini/Qwen API | 无 |
| **配置复杂度** | 高(tool, batch_size, API keys) | 低(仅 model profile) |
---
## 🚀 后续建议
### 如果需要LLM增强功能
1. **从git历史恢复**
```bash
# 查看删除前的提交
git log --all --full-history -- "*llm_enhancer*"
# 恢复特定文件
git checkout <commit-hash> -- src/codexlens/semantic/llm_enhancer.py
```
2. **或使用外部工具**
- 在索引前使用独立脚本生成摘要
- 将摘要作为注释添加到代码中
- 然后使用纯向量索引(会包含摘要)
3. **或考虑轻量级替代方案**
   - 使用本地小模型(llama.cpp、ggml)
   - 使用docstring提取(无需LLM)
- 使用静态分析生成摘要
### 代码库维护建议
1. ✅ **保持简单** - 继续使用纯向量搜索
2. ✅ **优化现有功能** - 改进向量搜索准确性
3. ✅ **增量改进** - 优化分块策略和嵌入质量
4. ⚠️ **避免重复** - 如需LLM,先评估是否真正必要
---
## 📝 文件清单
### 删除的文件完整列表
```
src/codexlens/semantic/llm_enhancer.py
tests/test_llm_enhancer.py
tests/test_llm_enhanced_search.py
scripts/compare_search_methods.py
scripts/test_misleading_comments.py
scripts/show_llm_analysis.py
scripts/inspect_llm_summaries.py
docs/LLM_ENHANCED_SEARCH_GUIDE.md
docs/LLM_ENHANCEMENT_TEST_RESULTS.md
docs/MISLEADING_COMMENTS_TEST_RESULTS.md
docs/CLI_INTEGRATION_SUMMARY.md
docs/DOCSTRING_LLM_HYBRID_DESIGN.md
```
### 修改的文件
```
src/codexlens/cli/commands.py (删除enhance命令)
src/codexlens/semantic/__init__.py (删除LLM导出)
ccw/src/templates/dashboard-js/components/cli-status.js (删除LLM配置、Settings Modal、Metadata Viewer)
ccw/src/templates/dashboard-js/i18n.js (删除LLM翻译字符串)
ccw/src/templates/dashboard-js/views/cli-manager.js (移除LLM badge和modal调用)
docs/IMPLEMENTATION_SUMMARY.md (添加移除说明)
```
---
**移除完成时间**: 2025-12-16
**文档版本**: 1.0
**验证状态**: ✅ 通过

View File

@@ -1,316 +0,0 @@
# codex-lens LSP Integration Execution Checklist
> Generated: 2026-01-15
> Based on: Gemini multi-round deep analysis
> Status: Ready for implementation
---
## Phase 1: LSP Server Foundation (Priority: HIGH)
### 1.1 Create LSP Server Entry Point
- [ ] **Install pygls dependency**
```bash
pip install pygls
```
- [ ] **Create `src/codexlens/lsp/__init__.py`**
- Export: `CodexLensServer`, `start_server`
- [ ] **Create `src/codexlens/lsp/server.py`**
- Class: `CodexLensServer(LanguageServer)`
- Initialize: `ChainSearchEngine`, `GlobalSymbolIndex`, `WatcherManager`
- Lifecycle: Start `WatcherManager` on `initialize` request
### 1.2 Implement Core LSP Handlers
- [ ] **`textDocument/definition`** handler
- Source: `GlobalSymbolIndex.search()` exact match
- Reference: `storage/global_index.py:173`
- Return: `Location(uri, Range)`
- [ ] **`textDocument/completion`** handler
- Source: `GlobalSymbolIndex.search(prefix_mode=True)`
- Reference: `storage/global_index.py:173`
- Return: `CompletionItem[]`
- [ ] **`workspace/symbol`** handler
- Source: `ChainSearchEngine.search_symbols()`
- Reference: `search/chain_search.py:618`
- Return: `SymbolInformation[]`
### 1.3 Wire File Watcher to LSP Events
- [ ] **`workspace/didChangeWatchedFiles`** handler
- Delegate to: `WatcherManager.process_changes()`
- Reference: `watcher/manager.py:53`
- [ ] **`textDocument/didSave`** handler
- Trigger: `IncrementalIndexer` for single file
- Reference: `watcher/incremental_indexer.py`
### 1.4 Deliverables
- [ ] Unit tests for LSP handlers
- [ ] Integration test: definition lookup
- [ ] Integration test: completion prefix search
- [ ] Benchmark: query latency < 50ms
---
## Phase 2: Find References Implementation (Priority: MEDIUM)
### 2.1 Create `search_references` Method
- [ ] **Add to `src/codexlens/search/chain_search.py`**
```python
def search_references(
self,
symbol_name: str,
source_path: Path,
depth: int = -1
) -> List[ReferenceResult]:
"""Find all references to a symbol across the project."""
```
### 2.2 Implement Parallel Query Orchestration
- [ ] **Collect index paths**
- Use: `_collect_index_paths()` existing method
- [ ] **Parallel query execution**
- ThreadPoolExecutor across all `_index.db`
- SQL: `SELECT * FROM code_relationships WHERE target_qualified_name = ?`
- Reference: `storage/sqlite_store.py:348`
- [ ] **Result aggregation**
- Deduplicate by file:line
- Sort by file path, then line number
### 2.3 LSP Handler
- [ ] **`textDocument/references`** handler
- Call: `ChainSearchEngine.search_references()`
- Return: `Location[]`
### 2.4 Deliverables
- [ ] Unit test: single-index reference lookup
- [ ] Integration test: cross-directory references
- [ ] Benchmark: < 200ms for 10+ index files
---
## Phase 3: Enhanced Hover Information (Priority: MEDIUM)
### 3.1 Implement Hover Data Extraction
- [ ] **Create `src/codexlens/lsp/hover_provider.py`**
```python
class HoverProvider:
def get_hover_info(self, symbol: Symbol) -> HoverInfo:
"""Extract hover information for a symbol."""
```
### 3.2 Data Sources
- [ ] **Symbol metadata**
- Source: `GlobalSymbolIndex.search()`
- Fields: `kind`, `name`, `file_path`, `range`
- [ ] **Source code extraction**
- Source: `SQLiteStore.files` table
- Reference: `storage/sqlite_store.py:284`
- Extract: Lines from `range[0]` to `range[1]`
### 3.3 LSP Handler
- [ ] **`textDocument/hover`** handler
- Return: `Hover(contents=MarkupContent)`
- Format: Markdown with code fence
### 3.4 Deliverables
- [ ] Unit test: hover for function/class/variable
- [ ] Integration test: multi-line function signature
---
## Phase 4: MCP Bridge for Claude Code (Priority: HIGH VALUE)
### 4.1 Define MCP Schema
- [ ] **Create `src/codexlens/mcp/__init__.py`**
- [ ] **Create `src/codexlens/mcp/schema.py`**
```python
@dataclass
class MCPContext:
version: str = "1.0"
context_type: str
symbol: Optional[SymbolInfo]
definition: Optional[str]
references: List[ReferenceInfo]
related_symbols: List[SymbolInfo]
```
### 4.2 Create MCP Provider
- [ ] **Create `src/codexlens/mcp/provider.py`**
```python
class MCPProvider:
def build_context(
self,
symbol_name: str,
context_type: str = "symbol_explanation"
) -> MCPContext:
"""Build structured context for LLM consumption."""
```
### 4.3 Context Building Logic
- [ ] **Symbol lookup**
- Use: `GlobalSymbolIndex.search()`
- [ ] **Definition extraction**
- Use: `SQLiteStore` file content
- [ ] **References collection**
- Use: `ChainSearchEngine.search_references()`
- [ ] **Related symbols**
- Use: `code_relationships` for imports/calls
### 4.4 Hook Integration Points
- [ ] **Document `pre-tool` hook interface**
```python
def pre_tool_hook(action: str, params: dict) -> MCPContext:
"""Called before LLM action to gather context."""
```
- [ ] **Document `post-tool` hook interface**
```python
def post_tool_hook(action: str, result: Any) -> None:
"""Called after LSP action for proactive caching."""
```
### 4.5 Deliverables
- [ ] MCP schema JSON documentation
- [ ] Unit test: context building
- [ ] Integration test: hook → MCP → JSON output
---
## Phase 5: Advanced Features (Priority: LOW)
### 5.1 Custom LSP Commands
- [ ] **`codexlens/hybridSearch`**
- Expose: `HybridSearchEngine.search()`
- Reference: `search/hybrid_search.py`
- [ ] **`codexlens/symbolGraph`**
- Return: Symbol relationship graph
- Source: `code_relationships` table
### 5.2 Proactive Context Caching
- [ ] **Implement `post-tool` hook caching**
- After `go-to-definition`: pre-fetch references
- Cache TTL: 5 minutes
- Storage: In-memory LRU
### 5.3 Performance Optimizations
- [ ] **Connection pooling**
- Reference: `storage/sqlite_store.py` thread-local
- [ ] **Result caching**
- LRU cache for frequent queries
- Invalidate on file change
---
## File Structure After Implementation
```
src/codexlens/
├── lsp/ # NEW
│ ├── __init__.py
│ ├── server.py # Main LSP server
│ ├── handlers.py # LSP request handlers
│ ├── hover_provider.py # Hover information
│ └── utils.py # LSP utilities
├── mcp/ # NEW
│ ├── __init__.py
│ ├── schema.py # MCP data models
│ ├── provider.py # Context builder
│ └── hooks.py # Hook interfaces
├── search/
│ ├── chain_search.py # MODIFY: add search_references()
│ └── ...
└── ...
```
---
## Dependencies to Add
```toml
# pyproject.toml
[project.optional-dependencies]
lsp = [
"pygls>=1.3.0",
]
```
---
## Testing Strategy
### Unit Tests
```
tests/
├── lsp/
│ ├── test_definition.py
│ ├── test_completion.py
│ ├── test_references.py
│ └── test_hover.py
└── mcp/
├── test_schema.py
└── test_provider.py
```
### Integration Tests
- [ ] Full LSP handshake test
- [ ] Multi-file project navigation
- [ ] Incremental index update via didSave
### Performance Benchmarks
| Operation | Target | Acceptable |
|-----------|--------|------------|
| Definition lookup | < 30ms | < 50ms |
| Completion (100 items) | < 50ms | < 100ms |
| Find references (10 files) | < 150ms | < 200ms |
| Initial indexing (1000 files) | < 60s | < 120s |
---
## Execution Order
```
Week 1: Phase 1.1 → 1.2 → 1.3 → 1.4
Week 2: Phase 2.1 → 2.2 → 2.3 → 2.4
Week 3: Phase 3 + Phase 4.1 → 4.2
Week 4: Phase 4.3 → 4.4 → 4.5
Week 5: Phase 5 (optional) + Polish
```
---
## Quick Start Commands
```bash
# Install LSP dependencies
pip install pygls
# Run LSP server (after implementation)
python -m codexlens.lsp --stdio
# Test LSP connection
echo '{"jsonrpc":"2.0","id":1,"method":"initialize","params":{}}' | python -m codexlens.lsp --stdio
```
---
## Reference Links
- pygls Documentation: https://pygls.readthedocs.io/
- LSP Specification: https://microsoft.github.io/language-server-protocol/
- codex-lens GlobalSymbolIndex: `storage/global_index.py:173`
- codex-lens ChainSearchEngine: `search/chain_search.py:618`
- codex-lens WatcherManager: `watcher/manager.py:53`

File diff suppressed because it is too large Load Diff

View File

@@ -1,284 +0,0 @@
# CodexLens MCP Endpoint Design
> Generated by Gemini Analysis | 2026-01-19
> Document Version: 1.0
## Overview
This document provides the complete MCP endpoint design for exposing codex-lens LSP capabilities through the Model Context Protocol.
## Related Files
- `src/codexlens/lsp/server.py` - Main LSP server initialization, component management, and capability declaration.
- `src/codexlens/lsp/handlers.py` - Implementation of handlers for core LSP requests (definition, references, completion, hover, workspace symbols).
- `src/codexlens/lsp/providers.py` - Helper classes, specifically `HoverProvider` for generating rich hover information.
- `src/codexlens/storage/global_index.py` - The backing data store (`GlobalSymbolIndex`) that powers most of the symbol lookups.
- `src/codexlens/search/__init__.py` - Exposes the `ChainSearchEngine`, used for advanced reference searching.
## Summary
The `codex-lens` LSP implementation exposes five core code navigation and search features: go to definition, find references, code completion, hover information, and workspace symbol search. These features are primarily powered by two components: `GlobalSymbolIndex` for fast, project-wide symbol lookups (used by definition, completion, hover, and workspace symbols) and `ChainSearchEngine` for advanced, relationship-aware reference finding.
The following MCP tool design externalizes these backend capabilities, allowing a client to leverage the same code intelligence features outside of an LSP context.
## MCP Tool Group: `code.symbol`
This group provides tools for searching and retrieving information about code symbols (functions, classes, etc.) within an indexed project.
---
### 1. `code.symbol.search`
**Description**: Searches for symbols across the entire indexed project, supporting prefix or contains matching. Ideal for implementing workspace symbol searches or providing code completion suggestions.
**Mapped LSP Features**: `workspace/symbol`, `textDocument/completion`
**Backend Implementation**: This tool directly maps to the `GlobalSymbolIndex.search` method.
- Reference: `src/codexlens/lsp/handlers.py:302` (in `lsp_workspace_symbol`)
- Reference: `src/codexlens/lsp/handlers.py:256` (in `lsp_completion`)
**Schema**:
```json
{
"name": "code.symbol.search",
"description": "Searches for symbols across the entire indexed project, supporting prefix or contains matching. Ideal for implementing workspace symbol searches or providing code completion suggestions.",
"inputSchema": {
"type": "object",
"properties": {
"query": {
"type": "string",
"description": "The symbol name or prefix to search for."
},
"kind": {
"type": "string",
"description": "Optional: Filter results to only include symbols of a specific kind (e.g., 'function', 'class', 'method').",
"nullable": true
},
"prefix_mode": {
"type": "boolean",
"description": "If true, treats the query as a prefix (name LIKE 'query%'). If false, performs a contains search (name LIKE '%query%'). Defaults to true.",
"default": true
},
"limit": {
"type": "integer",
"description": "The maximum number of symbols to return.",
"default": 50
}
},
"required": ["query"]
}
}
```
**Returns**:
```typescript
Array<{
name: string; // The name of the symbol
kind: string; // The kind of the symbol (e.g., 'function', 'class')
file_path: string; // The absolute path to the file containing the symbol
range: {
start_line: number; // The 1-based starting line number
end_line: number; // The 1-based ending line number
}
}>
```
---
### 2. `code.symbol.findDefinition`
**Description**: Finds the definition location(s) for a symbol with an exact name match. This corresponds to a 'Go to Definition' feature.
**Mapped LSP Feature**: `textDocument/definition`
**Backend Implementation**: This tool uses `GlobalSymbolIndex.search` with `prefix_mode=False` and then filters for an exact name match.
- Reference: `src/codexlens/lsp/handlers.py:180` (in `lsp_definition`)
**Schema**:
```json
{
"name": "code.symbol.findDefinition",
"description": "Finds the definition location(s) for a symbol with an exact name match. This corresponds to a 'Go to Definition' feature.",
"inputSchema": {
"type": "object",
"properties": {
"symbol_name": {
"type": "string",
"description": "The exact name of the symbol to find."
},
"kind": {
"type": "string",
"description": "Optional: Disambiguate by providing the symbol kind (e.g., 'function', 'class').",
"nullable": true
}
},
"required": ["symbol_name"]
}
}
```
**Returns**:
```typescript
Array<{
name: string; // The name of the symbol
kind: string; // The kind of the symbol
file_path: string; // The absolute path to the file
range: {
start_line: number; // The 1-based starting line number
end_line: number; // The 1-based ending line number
}
}>
```
---
### 3. `code.symbol.findReferences`
**Description**: Finds all references to a symbol throughout the project. Uses advanced relationship analysis for accuracy where possible, falling back to name-based search.
**Mapped LSP Feature**: `textDocument/references`
**Backend Implementation**: This primarily uses `ChainSearchEngine.search_references` for accuracy, which is more powerful than a simple name search.
- Reference: `src/codexlens/lsp/handlers.py:218` (in `lsp_references`)
**Schema**:
```json
{
"name": "code.symbol.findReferences",
"description": "Finds all references to a symbol throughout the project. Uses advanced relationship analysis for accuracy where possible.",
"inputSchema": {
"type": "object",
"properties": {
"symbol_name": {
"type": "string",
"description": "The name of the symbol to find references for."
},
"context_path": {
"type": "string",
"description": "The source path of the current project or workspace root to provide context for the search."
},
"limit": {
"type": "integer",
"description": "The maximum number of references to return.",
"default": 200
}
},
"required": ["symbol_name", "context_path"]
}
}
```
**Returns**:
```typescript
Array<{
file_path: string; // The absolute path to the file containing the reference
line: number; // The 1-based line number of the reference
column: number; // The 0-based starting column of the reference
}>
```
---
### 4. `code.symbol.getHoverInfo`
**Description**: Retrieves rich information for a symbol, including its signature and location, suitable for displaying in a hover card.
**Mapped LSP Feature**: `textDocument/hover`
**Backend Implementation**: This tool encapsulates the logic from `HoverProvider`, which finds a symbol in `GlobalSymbolIndex` and then reads the source file to extract its signature.
- Reference: `src/codexlens/lsp/handlers.py:285` (instantiates `HoverProvider`)
- Reference: `src/codexlens/lsp/providers.py:53` (in `HoverProvider.get_hover_info`)
**Schema**:
```json
{
"name": "code.symbol.getHoverInfo",
"description": "Retrieves rich information for a symbol, including its signature and location, suitable for displaying in a hover card.",
"inputSchema": {
"type": "object",
"properties": {
"symbol_name": {
"type": "string",
"description": "The exact name of the symbol to get hover information for."
}
},
"required": ["symbol_name"]
}
}
```
**Returns**:
```typescript
{
name: string; // The name of the symbol
kind: string; // The kind of the symbol
signature: string; // The full code signature as extracted from source
file_path: string; // The absolute path to the file
start_line: number; // The 1-based starting line number
} | null // null if symbol not found
```
---
## Integration with CCW MCP Manager
The `codex-lens-tools` MCP server should be added to the recommended MCP servers list in `ccw/src/templates/dashboard-js/components/mcp-manager.js`:
```javascript
{
id: 'codex-lens-tools',
nameKey: 'mcp.codexLens.name',
descKey: 'mcp.codexLens.desc',
icon: 'search-code',
category: 'code-intelligence',
fields: [
{
key: 'toolSelection',
labelKey: 'mcp.codexLens.field.tools',
type: 'multi-select',
options: [
{ value: 'symbol.search', label: 'Symbol Search' },
{ value: 'symbol.findDefinition', label: 'Find Definition' },
{ value: 'symbol.findReferences', label: 'Find References' },
{ value: 'symbol.getHoverInfo', label: 'Hover Information' }
],
default: ['symbol.search', 'symbol.findDefinition', 'symbol.findReferences'],
required: true,
descKey: 'mcp.codexLens.field.tools.desc'
}
],
buildConfig: (values) => {
const tools = values.toolSelection || [];
const env = { CODEXLENS_ENABLED_TOOLS: tools.join(',') };
return buildCrossPlatformMcpConfig('npx', ['-y', 'codex-lens-mcp'], { env });
}
}
```
## Tool Naming Convention
- **Namespace**: `code.*` for code intelligence tools
- **Category**: `symbol` for symbol-related operations
- **Operation**: Descriptive verb (search, findDefinition, findReferences, getHoverInfo)
- **Full Pattern**: `code.symbol.<operation>`
This naming scheme aligns with MCP conventions and is easily extensible for future categories (e.g., `code.types.*`, `code.imports.*`).
## Future Enhancements
1. **Document Symbol Tool** (`code.symbol.getDocumentSymbols`)
- Maps LSP `textDocument/documentSymbol`
- Returns all symbols in a specific file
2. **Type Information** (`code.type.*` group)
- Type definitions and relationships
- Generic resolution
3. **Relationship Analysis** (`code.relation.*` group)
- Call hierarchy
- Inheritance chains
- Import dependencies
---
Generated: 2026-01-19
Status: Ready for Implementation

View File

@@ -1,220 +0,0 @@
# Migration 005: Database Schema Cleanup
## Overview
Migration 005 removes four unused and redundant database fields identified through Gemini analysis. This cleanup improves database efficiency, reduces schema complexity, and eliminates potential data consistency issues.
## Schema Version
- **Previous Version**: 4
- **New Version**: 5
## Changes Summary
### 1. Removed `semantic_metadata.keywords` Column
**Reason**: Deprecated - replaced by normalized `file_keywords` table in migration 001.
**Impact**:
- Keywords are now exclusively read from the normalized `file_keywords` table
- Prevents data sync issues between JSON column and normalized tables
- No data loss - migration 001 already populated `file_keywords` table
**Modified Code**:
- `get_semantic_metadata()`: Now reads keywords from `file_keywords` JOIN
- `list_semantic_metadata()`: Updated to query `file_keywords` for each result
- `add_semantic_metadata()`: Stopped writing to `keywords` column (only writes to `file_keywords`)
### 2. Removed `symbols.token_count` Column
**Reason**: Unused - always NULL, never populated.
**Impact**:
- No data loss (column was never used)
- Reduces symbols table size
- Simplifies symbol insertion logic
**Modified Code**:
- `add_file()`: Removed `token_count` from INSERT statements
- `update_file_symbols()`: Removed `token_count` from INSERT statements
- Schema creation: No longer creates `token_count` column
### 3. Removed `symbols.symbol_type` Column
**Reason**: Redundant - duplicates `symbols.kind` field.
**Impact**:
- No data loss (information preserved in `kind` column)
- Reduces symbols table size
- Eliminates redundant data storage
**Modified Code**:
- `add_file()`: Removed `symbol_type` from INSERT statements
- `update_file_symbols()`: Removed `symbol_type` from INSERT statements
- Schema creation: No longer creates `symbol_type` column
- Removed `idx_symbols_type` index
### 4. Removed `subdirs.direct_files` Column
**Reason**: Unused - never displayed or queried in application logic.
**Impact**:
- No data loss (column was never used)
- Reduces subdirs table size
- Simplifies subdirectory registration
**Modified Code**:
- `register_subdir()`: Parameter kept for backward compatibility but ignored
- `update_subdir_stats()`: Parameter kept for backward compatibility but ignored
- `get_subdirs()`: No longer retrieves `direct_files`
- `get_subdir()`: No longer retrieves `direct_files`
- `SubdirLink` dataclass: Removed `direct_files` field
## Migration Process
### Automatic Migration (v4 → v5)
When an existing database (version 4) is opened:
1. **Transaction begins**
2. **Step 1**: Recreate `semantic_metadata` table without `keywords` column
- Data copied from old table (excluding `keywords`)
- Old table dropped, new table renamed
3. **Step 2**: Recreate `symbols` table without `token_count` and `symbol_type`
- Data copied from old table (excluding removed columns)
- Old table dropped, new table renamed
- Indexes recreated (excluding `idx_symbols_type`)
4. **Step 3**: Recreate `subdirs` table without `direct_files`
- Data copied from old table (excluding `direct_files`)
- Old table dropped, new table renamed
5. **Transaction committed**
6. **VACUUM** runs to reclaim space (non-critical, continues if fails)
### New Database Creation (v5)
New databases are created directly with the clean schema (no migration needed).
## Benefits
1. **Reduced Database Size**: Removed 4 unused columns across 3 tables
2. **Improved Data Consistency**: Single source of truth for keywords (normalized tables)
3. **Simpler Code**: Less maintenance burden for unused fields
4. **Better Performance**: Smaller table sizes, fewer indexes to maintain
5. **Cleaner Schema**: Easier to understand and maintain
## Backward Compatibility
### API Compatibility
Public APIs remain backward compatible, with one exception for direct dataclass access:
- `register_subdir()` and `update_subdir_stats()` still accept `direct_files` parameter (ignored)
- `SubdirLink` dataclass no longer has `direct_files` attribute (breaking change for direct dataclass access)
### Database Compatibility
- **v4 databases**: Automatically migrated to v5 on first access
- **v5 databases**: No migration needed
- **Older databases (v0-v3)**: Migrate through chain (v0→v2→v4→v5)
## Testing
Comprehensive test suite added: `tests/test_schema_cleanup_migration.py`
**Test Coverage**:
- ✅ Migration from v4 to v5
- ✅ New database creation with clean schema
- ✅ Semantic metadata keywords read from normalized table
- ✅ Symbols insert without deprecated fields
- ✅ Subdir operations without `direct_files`
**Test Results**: All 5 tests passing
## Verification
To verify migration success:
```python
from codexlens.storage.dir_index import DirIndexStore
store = DirIndexStore("path/to/_index.db")
store.initialize()
# Check schema version
conn = store._get_connection()
version = conn.execute("PRAGMA user_version").fetchone()[0]
assert version == 5
# Check columns removed
cursor = conn.execute("PRAGMA table_info(semantic_metadata)")
columns = {row[1] for row in cursor.fetchall()}
assert "keywords" not in columns
cursor = conn.execute("PRAGMA table_info(symbols)")
columns = {row[1] for row in cursor.fetchall()}
assert "token_count" not in columns
assert "symbol_type" not in columns
cursor = conn.execute("PRAGMA table_info(subdirs)")
columns = {row[1] for row in cursor.fetchall()}
assert "direct_files" not in columns
store.close()
```
## Performance Impact
**Expected Improvements**:
- Database size reduction: ~10-15% (varies by data)
- VACUUM reclaims space immediately after migration
- Slightly faster queries (smaller tables, fewer indexes)
## Rollback
Migration 005 is **one-way** (no downgrade function). Removed fields contain:
- `keywords`: Already migrated to normalized tables (migration 001)
- `token_count`: Always NULL (no data)
- `symbol_type`: Duplicate of `kind` (no data loss)
- `direct_files`: Never used (no data)
If rollback is needed, restore from backup before running migration.
## Files Modified
1. **Migration File**:
- `src/codexlens/storage/migrations/migration_005_cleanup_unused_fields.py` (NEW)
2. **Core Storage**:
- `src/codexlens/storage/dir_index.py`:
- Updated `SCHEMA_VERSION` to 5
- Added migration 005 to `_apply_migrations()`
- Updated `get_semantic_metadata()` to read from `file_keywords`
- Updated `list_semantic_metadata()` to read from `file_keywords`
- Updated `add_semantic_metadata()` to not write `keywords` column
- Updated `add_file()` to not write `token_count`/`symbol_type`
- Updated `update_file_symbols()` to not write `token_count`/`symbol_type`
- Updated `register_subdir()` to not write `direct_files`
- Updated `update_subdir_stats()` to not write `direct_files`
- Updated `get_subdirs()` to not read `direct_files`
- Updated `get_subdir()` to not read `direct_files`
- Updated `SubdirLink` dataclass to remove `direct_files`
- Updated `_create_schema()` to create v5 schema directly
3. **Tests**:
- `tests/test_schema_cleanup_migration.py` (NEW)
## Deployment Checklist
- [x] Migration script created and tested
- [x] Schema version updated to 5
- [x] All code updated to use new schema
- [x] Comprehensive tests added
- [x] Existing tests pass
- [x] Documentation updated
- [x] Backward compatibility verified
## References
- Original Analysis: Gemini code review identified unused/redundant fields
- Migration Pattern: Follows SQLite best practices (table recreation)
- Previous Migrations: 001 (keywords normalization), 004 (dual FTS)

View File

@@ -1,973 +0,0 @@
# 多层次分词器设计方案
## 1. 背景与目标
### 1.1 当前问题
当前 `chunker.py` 的两种分词策略存在明显缺陷:
**symbol-based 策略**
- ✅ 优点保持代码逻辑完整性每个chunk是完整的函数/类
- ❌ 缺点粒度不均超大函数可能达到数百行影响LLM处理和搜索精度
**sliding-window 策略**
- ✅ 优点chunk大小均匀覆盖全面
- ❌ 缺点:破坏逻辑结构,可能将完整的循环/条件块切断
### 1.2 设计目标
实现多层次分词器,同时满足:
1. **语义完整性**:保持代码逻辑边界的完整性
2. **粒度可控**:支持从粗粒度(函数级)到细粒度(逻辑块级)的灵活划分
3. **层级关系**保留chunk之间的父子关系支持上下文检索
4. **高效索引**:优化向量化和检索性能
## 2. 技术架构
### 2.1 两层分词架构
```
Source Code
    ↓
[Layer 1: Symbol-Level Chunking] ← 使用 tree-sitter AST
    ↓
MacroChunks (Functions/Classes)
    ↓
[Layer 2: Logic-Block Chunking] ← AST深度遍历
    ↓
MicroChunks (Loops/Conditionals/Blocks)
    ↓
Vector Embedding + Indexing
```
### 2.2 核心组件
```python
# 新增数据结构
@dataclass
class ChunkMetadata:
"""Chunk元数据"""
chunk_id: str
parent_id: Optional[str] # 父chunk ID
level: int # 层级1=macro, 2=micro
chunk_type: str # function/class/loop/conditional/try_except
file_path: str
start_line: int
end_line: int
symbol_name: Optional[str]
context_summary: Optional[str] # 继承自父chunk的上下文
@dataclass
class HierarchicalChunk:
"""层级化的代码块"""
metadata: ChunkMetadata
content: str
embedding: Optional[List[float]] = None
children: List['HierarchicalChunk'] = field(default_factory=list)
```
## 3. 详细实现步骤
### 3.1 第一层符号级分词Macro-Chunking
**实现思路**:复用现有 `code_extractor.py` 逻辑,增强元数据提取。
```python
class MacroChunker:
"""第一层分词器:提取顶层符号"""
def __init__(self):
self.parser = Parser()
# 加载语言grammar
def chunk_by_symbols(
self,
content: str,
file_path: str,
language: str
) -> List[HierarchicalChunk]:
"""提取顶层函数和类定义"""
tree = self.parser.parse(bytes(content, 'utf-8'))
root_node = tree.root_node
chunks = []
for node in root_node.children:
if node.type in ['function_definition', 'class_definition',
'method_definition']:
chunk = self._create_macro_chunk(node, content, file_path)
chunks.append(chunk)
return chunks
def _create_macro_chunk(
self,
node,
content: str,
file_path: str
) -> HierarchicalChunk:
"""从AST节点创建macro chunk"""
start_line = node.start_point[0] + 1
end_line = node.end_point[0] + 1
# 提取符号名称
name_node = node.child_by_field_name('name')
symbol_name = content[name_node.start_byte:name_node.end_byte]
# 提取完整代码包含docstring和装饰器
chunk_content = self._extract_with_context(node, content)
metadata = ChunkMetadata(
chunk_id=f"{file_path}:{start_line}",
parent_id=None,
level=1,
chunk_type=node.type,
file_path=file_path,
start_line=start_line,
end_line=end_line,
symbol_name=symbol_name,
)
return HierarchicalChunk(
metadata=metadata,
content=chunk_content,
)
def _extract_with_context(self, node, content: str) -> str:
"""提取代码包含装饰器和docstring"""
# 向上查找装饰器
start_byte = node.start_byte
prev_sibling = node.prev_sibling
while prev_sibling and prev_sibling.type == 'decorator':
start_byte = prev_sibling.start_byte
prev_sibling = prev_sibling.prev_sibling
return content[start_byte:node.end_byte]
```
### 3.2 第二层逻辑块分词Micro-Chunking
**实现思路**在每个macro chunk内部按逻辑结构进一步划分。
```python
class MicroChunker:
"""第二层分词器:提取逻辑块"""
# 需要划分的逻辑块类型
LOGIC_BLOCK_TYPES = {
'for_statement',
'while_statement',
'if_statement',
'try_statement',
'with_statement',
}
def chunk_logic_blocks(
self,
macro_chunk: HierarchicalChunk,
content: str,
max_lines: int = 50 # 大于此行数的macro chunk才进行二次划分
) -> List[HierarchicalChunk]:
"""在macro chunk内部提取逻辑块"""
# 小函数不需要二次划分
total_lines = macro_chunk.metadata.end_line - macro_chunk.metadata.start_line
if total_lines <= max_lines:
return []
tree = self.parser.parse(bytes(macro_chunk.content, 'utf-8'))
root_node = tree.root_node
micro_chunks = []
self._traverse_logic_blocks(
root_node,
macro_chunk,
content,
micro_chunks
)
return micro_chunks
def _traverse_logic_blocks(
self,
node,
parent_chunk: HierarchicalChunk,
content: str,
result: List[HierarchicalChunk]
):
"""递归遍历AST提取逻辑块"""
if node.type in self.LOGIC_BLOCK_TYPES:
micro_chunk = self._create_micro_chunk(
node,
parent_chunk,
content
)
result.append(micro_chunk)
parent_chunk.children.append(micro_chunk)
# 继续遍历子节点
for child in node.children:
self._traverse_logic_blocks(child, parent_chunk, content, result)
def _create_micro_chunk(
self,
node,
parent_chunk: HierarchicalChunk,
content: str
) -> HierarchicalChunk:
"""创建micro chunk"""
# 计算相对于文件的行号
start_line = parent_chunk.metadata.start_line + node.start_point[0]
end_line = parent_chunk.metadata.start_line + node.end_point[0]
chunk_content = content[node.start_byte:node.end_byte]
metadata = ChunkMetadata(
chunk_id=f"{parent_chunk.metadata.chunk_id}:L{start_line}",
parent_id=parent_chunk.metadata.chunk_id,
level=2,
chunk_type=node.type,
file_path=parent_chunk.metadata.file_path,
start_line=start_line,
end_line=end_line,
symbol_name=parent_chunk.metadata.symbol_name, # 继承父符号名
context_summary=None, # 后续由LLM填充
)
return HierarchicalChunk(
metadata=metadata,
content=chunk_content,
)
```
### 3.3 统一接口:多层次分词器
```python
class HierarchicalChunker:
"""多层次分词器统一接口"""
def __init__(self, config: ChunkConfig = None):
self.config = config or ChunkConfig()
self.macro_chunker = MacroChunker()
self.micro_chunker = MicroChunker()
def chunk_file(
self,
content: str,
file_path: str,
language: str
) -> List[HierarchicalChunk]:
"""对文件进行多层次分词"""
# 第一层:符号级分词
macro_chunks = self.macro_chunker.chunk_by_symbols(
content, file_path, language
)
# 第二层:逻辑块分词
all_chunks = []
for macro_chunk in macro_chunks:
all_chunks.append(macro_chunk)
# 对大函数进行二次划分
micro_chunks = self.micro_chunker.chunk_logic_blocks(
macro_chunk, content
)
all_chunks.extend(micro_chunks)
return all_chunks
def chunk_file_with_fallback(
self,
content: str,
file_path: str,
language: str
) -> List[HierarchicalChunk]:
"""带降级策略的分词"""
try:
return self.chunk_file(content, file_path, language)
except Exception as e:
logger.warning(f"Hierarchical chunking failed: {e}, falling back to sliding window")
# 降级到滑动窗口策略
return self._fallback_sliding_window(content, file_path, language)
```
## 4. 数据存储设计
### 4.1 数据库Schema
```sql
-- chunk表存储所有层级的chunk
CREATE TABLE chunks (
chunk_id TEXT PRIMARY KEY,
parent_id TEXT, -- 父chunk IDNULL表示顶层
level INTEGER NOT NULL, -- 1=macro, 2=micro
chunk_type TEXT NOT NULL, -- function/class/loop/if/try等
file_path TEXT NOT NULL,
start_line INTEGER NOT NULL,
end_line INTEGER NOT NULL,
symbol_name TEXT,
content TEXT NOT NULL,
content_hash TEXT, -- 用于检测内容变化
-- 语义元数据由LLM生成
summary TEXT,
keywords TEXT, -- JSON数组
purpose TEXT,
-- 向量嵌入
embedding BLOB, -- 存储向量
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (parent_id) REFERENCES chunks(chunk_id) ON DELETE CASCADE
);
-- 索引优化
CREATE INDEX idx_chunks_file_path ON chunks(file_path);
CREATE INDEX idx_chunks_parent_id ON chunks(parent_id);
CREATE INDEX idx_chunks_level ON chunks(level);
CREATE INDEX idx_chunks_symbol_name ON chunks(symbol_name);
```
### 4.2 向量索引
使用分层索引策略:
```python
class HierarchicalVectorStore:
"""层级化向量存储"""
def __init__(self, db_path: Path):
self.db_path = db_path
self.conn = sqlite3.connect(db_path)
def add_chunk(self, chunk: HierarchicalChunk):
"""添加chunk及其向量"""
cursor = self.conn.cursor()
cursor.execute("""
INSERT INTO chunks (
chunk_id, parent_id, level, chunk_type,
file_path, start_line, end_line, symbol_name,
content, embedding
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""", (
chunk.metadata.chunk_id,
chunk.metadata.parent_id,
chunk.metadata.level,
chunk.metadata.chunk_type,
chunk.metadata.file_path,
chunk.metadata.start_line,
chunk.metadata.end_line,
chunk.metadata.symbol_name,
chunk.content,
self._serialize_embedding(chunk.embedding),
))
self.conn.commit()
def search_hierarchical(
self,
query_embedding: List[float],
top_k: int = 10,
level_weights: Dict[int, float] = None
) -> List[Tuple[HierarchicalChunk, float]]:
"""层级化检索"""
        # 默认权重:macro chunk权重更高
if level_weights is None:
level_weights = {1: 1.0, 2: 0.8}
# 检索所有chunk
cursor = self.conn.cursor()
cursor.execute("SELECT * FROM chunks WHERE embedding IS NOT NULL")
results = []
for row in cursor.fetchall():
chunk = self._row_to_chunk(row)
similarity = self._cosine_similarity(
query_embedding,
chunk.embedding
)
# 根据层级应用权重
weighted_score = similarity * level_weights.get(chunk.metadata.level, 1.0)
results.append((chunk, weighted_score))
# 按分数排序
results.sort(key=lambda x: x[1], reverse=True)
return results[:top_k]
def get_chunk_with_context(
self,
chunk_id: str
) -> Tuple[HierarchicalChunk, Optional[HierarchicalChunk]]:
"""获取chunk及其父chunk提供上下文"""
cursor = self.conn.cursor()
# 获取chunk本身
cursor.execute("SELECT * FROM chunks WHERE chunk_id = ?", (chunk_id,))
chunk_row = cursor.fetchone()
chunk = self._row_to_chunk(chunk_row)
# 获取父chunk
parent = None
if chunk.metadata.parent_id:
cursor.execute(
"SELECT * FROM chunks WHERE chunk_id = ?",
(chunk.metadata.parent_id,)
)
parent_row = cursor.fetchone()
if parent_row:
parent = self._row_to_chunk(parent_row)
return chunk, parent
```
## 5. LLM集成策略
### 5.1 分层生成语义元数据
```python
class HierarchicalLLMEnhancer:
"""为层级chunk生成语义元数据"""
def enhance_hierarchical_chunks(
self,
chunks: List[HierarchicalChunk]
) -> Dict[str, SemanticMetadata]:
"""
分层处理策略:
1. 先处理所有level=1的macro chunks生成详细摘要
2. 再处理level=2的micro chunks使用父chunk摘要作为上下文
"""
results = {}
# 第一轮处理macro chunks
macro_chunks = [c for c in chunks if c.metadata.level == 1]
macro_metadata = self.llm_enhancer.enhance_files([
FileData(
path=c.metadata.chunk_id,
content=c.content,
language=self._detect_language(c.metadata.file_path)
)
for c in macro_chunks
])
results.update(macro_metadata)
# 第二轮处理micro chunks带父上下文
micro_chunks = [c for c in chunks if c.metadata.level == 2]
for micro_chunk in micro_chunks:
parent_id = micro_chunk.metadata.parent_id
parent_summary = macro_metadata.get(parent_id, {}).get('summary', '')
# 构建带上下文的prompt
enhanced_prompt = f"""
Parent Function: {micro_chunk.metadata.symbol_name}
Parent Summary: {parent_summary}
Code Block ({micro_chunk.metadata.chunk_type}):
```
{micro_chunk.content}
```
Generate a concise summary (1 sentence) and keywords for this specific code block.
"""
metadata = self._call_llm_with_context(enhanced_prompt)
results[micro_chunk.metadata.chunk_id] = metadata
return results
```
### 5.2 Prompt优化
针对不同层级使用不同的prompt模板
**Macro Chunk Prompt (Level 1)**:
```
PURPOSE: Generate comprehensive semantic metadata for a complete function/class
TASK:
- Provide a detailed summary (2-3 sentences) covering what the code does and why
- Extract 8-12 relevant keywords including technical terms and domain concepts
- Identify the primary purpose/category
MODE: analysis
CODE:
```{language}
{content}
```
OUTPUT: JSON with summary, keywords, purpose
```
**Micro Chunk Prompt (Level 2)**:
```
PURPOSE: Summarize a specific logic block within a larger function
CONTEXT:
- Parent Function: {symbol_name}
- Parent Purpose: {parent_summary}
TASK:
- Provide a brief summary (1 sentence) of this specific block's role in the parent function
- Extract 3-5 keywords specific to this block's logic
MODE: analysis
CODE BLOCK ({chunk_type}):
```{language}
{content}
```
OUTPUT: JSON with summary, keywords
```
## 6. 检索增强
### 6.1 上下文扩展检索
```python
class ContextualSearchEngine:
"""支持上下文扩展的检索引擎"""
def search_with_context(
self,
query: str,
top_k: int = 10,
expand_context: bool = True
) -> List[SearchResult]:
"""
检索并自动扩展上下文
如果匹配到micro chunk自动返回其父macro chunk作为上下文
"""
# 生成查询向量
query_embedding = self.embedder.embed_single(query)
# 层级化检索
raw_results = self.vector_store.search_hierarchical(
query_embedding,
top_k=top_k
)
# 扩展上下文
enriched_results = []
for chunk, score in raw_results:
result = SearchResult(
path=chunk.metadata.file_path,
score=score,
content=chunk.content,
start_line=chunk.metadata.start_line,
end_line=chunk.metadata.end_line,
symbol_name=chunk.metadata.symbol_name,
)
# 如果是micro chunk获取父chunk作为上下文
if expand_context and chunk.metadata.level == 2:
parent_chunk, _ = self.vector_store.get_chunk_with_context(
chunk.metadata.chunk_id
)
if parent_chunk:
result.metadata['parent_context'] = {
'summary': parent_chunk.metadata.context_summary,
'symbol_name': parent_chunk.metadata.symbol_name,
'content': parent_chunk.content,
}
enriched_results.append(result)
return enriched_results
```
## 7. 测试策略
### 7.1 单元测试
```python
import pytest
from codexlens.semantic.hierarchical_chunker import (
HierarchicalChunker, MacroChunker, MicroChunker
)
class TestMacroChunker:
"""测试第一层分词"""
def test_extract_functions(self):
"""测试提取函数定义"""
code = '''
def calculate_total(items):
"""Calculate total price."""
total = 0
for item in items:
total += item.price
return total
def apply_discount(total, discount):
"""Apply discount to total."""
return total * (1 - discount)
'''
chunker = MacroChunker()
chunks = chunker.chunk_by_symbols(code, 'test.py', 'python')
assert len(chunks) == 2
assert chunks[0].metadata.symbol_name == 'calculate_total'
assert chunks[1].metadata.symbol_name == 'apply_discount'
assert chunks[0].metadata.level == 1
def test_extract_with_decorators(self):
"""测试提取带装饰器的函数"""
code = '''
@app.route('/api/users')
@auth_required
def get_users():
return User.query.all()
'''
chunker = MacroChunker()
chunks = chunker.chunk_by_symbols(code, 'test.py', 'python')
assert len(chunks) == 1
assert '@app.route' in chunks[0].content
assert '@auth_required' in chunks[0].content
class TestMicroChunker:
"""测试第二层分词"""
def test_extract_loop_blocks(self):
"""测试提取循环块"""
code = '''
def process_items(items):
results = []
for item in items:
if item.active:
results.append(process(item))
return results
'''
macro_chunker = MacroChunker()
macro_chunks = macro_chunker.chunk_by_symbols(code, 'test.py', 'python')
micro_chunker = MicroChunker()
micro_chunks = micro_chunker.chunk_logic_blocks(
macro_chunks[0], code
)
# 应该提取出for循环和if条件块
assert len(micro_chunks) >= 1
assert any(c.metadata.chunk_type == 'for_statement' for c in micro_chunks)
def test_skip_small_functions(self):
"""测试小函数跳过二次划分"""
code = '''
def small_func(x):
return x * 2
'''
macro_chunker = MacroChunker()
macro_chunks = macro_chunker.chunk_by_symbols(code, 'test.py', 'python')
micro_chunker = MicroChunker()
micro_chunks = micro_chunker.chunk_logic_blocks(
macro_chunks[0], code, max_lines=10
)
# 小函数不应该被二次划分
assert len(micro_chunks) == 0
class TestHierarchicalChunker:
"""测试完整的多层次分词"""
def test_full_hierarchical_chunking(self):
"""测试完整的层级分词流程"""
code = '''
def complex_function(data):
"""A complex function with multiple logic blocks."""
# Validation
if not data:
raise ValueError("Data is empty")
# Processing
results = []
for item in data:
try:
processed = process_item(item)
results.append(processed)
except Exception as e:
logger.error(f"Failed to process: {e}")
continue
# Aggregation
total = sum(r.value for r in results)
return total
'''
chunker = HierarchicalChunker()
chunks = chunker.chunk_file(code, 'test.py', 'python')
# 应该有1个macro chunk和多个micro chunks
macro_chunks = [c for c in chunks if c.metadata.level == 1]
micro_chunks = [c for c in chunks if c.metadata.level == 2]
assert len(macro_chunks) == 1
assert len(micro_chunks) > 0
# 验证父子关系
for micro in micro_chunks:
assert micro.metadata.parent_id == macro_chunks[0].metadata.chunk_id
```
### 7.2 集成测试
```python
class TestHierarchicalIndexing:
"""测试完整的索引流程"""
def test_index_and_search(self):
"""测试分层索引和检索"""
# 1. 分词
chunker = HierarchicalChunker()
chunks = chunker.chunk_file(sample_code, 'sample.py', 'python')
# 2. LLM增强
enhancer = HierarchicalLLMEnhancer()
metadata = enhancer.enhance_hierarchical_chunks(chunks)
# 3. 向量化
embedder = Embedder()
for chunk in chunks:
text = metadata[chunk.metadata.chunk_id].summary
chunk.embedding = embedder.embed_single(text)
# 4. 存储
vector_store = HierarchicalVectorStore(Path('/tmp/test.db'))
for chunk in chunks:
vector_store.add_chunk(chunk)
# 5. 检索
search_engine = ContextualSearchEngine(vector_store, embedder)
results = search_engine.search_with_context(
"find loop that processes items",
top_k=5
)
# 验证结果
assert len(results) > 0
assert any(r.metadata.get('parent_context') for r in results)
```
## 8. 性能优化
### 8.1 批量处理
```python
class BatchHierarchicalProcessor:
"""批量处理多个文件的层级分词"""
def process_files_batch(
self,
file_paths: List[Path],
batch_size: int = 10
):
"""批量处理优化LLM调用"""
all_chunks = []
# 1. 批量分词
for file_path in file_paths:
content = file_path.read_text()
chunks = self.chunker.chunk_file(
content, str(file_path), self._detect_language(file_path)
)
all_chunks.extend(chunks)
# 2. 批量LLM增强减少API调用
macro_chunks = [c for c in all_chunks if c.metadata.level == 1]
for i in range(0, len(macro_chunks), batch_size):
batch = macro_chunks[i:i+batch_size]
self.enhancer.enhance_batch(batch)
# 3. 批量向量化
all_texts = [c.content for c in all_chunks]
embeddings = self.embedder.embed_batch(all_texts)
for chunk, embedding in zip(all_chunks, embeddings):
chunk.embedding = embedding
# 4. 批量存储
self.vector_store.add_chunks_batch(all_chunks)
```
### 8.2 增量更新
```python
class IncrementalIndexer:
"""增量索引器:只处理变化的文件"""
def update_file(self, file_path: Path):
"""增量更新单个文件"""
content = file_path.read_text()
content_hash = hashlib.sha256(content.encode()).hexdigest()
# 检查文件是否变化
cursor = self.conn.cursor()
cursor.execute("""
SELECT content_hash FROM chunks
WHERE file_path = ? AND level = 1
LIMIT 1
""", (str(file_path),))
row = cursor.fetchone()
if row and row[0] == content_hash:
logger.info(f"File {file_path} unchanged, skipping")
return
# 删除旧chunk
cursor.execute("DELETE FROM chunks WHERE file_path = ?", (str(file_path),))
# 重新索引
chunks = self.chunker.chunk_file(content, str(file_path), 'python')
# ... 继续处理
```
## 9. 潜在问题与解决方案
### 9.1 问题:超大函数的micro chunk过多
**现象**:某些遗留代码函数超过1000行,可能产生几十个micro chunks。
**解决方案**
```python
class AdaptiveMicroChunker:
"""自适应micro分词根据函数大小调整策略"""
def chunk_logic_blocks(self, macro_chunk, content):
total_lines = macro_chunk.metadata.end_line - macro_chunk.metadata.start_line
if total_lines > 500:
# 超大函数:只提取顶层逻辑块,不递归
return self._extract_top_level_blocks(macro_chunk, content)
elif total_lines > 100:
# 大函数递归深度限制为2层
return self._extract_blocks_with_depth_limit(macro_chunk, content, max_depth=2)
else:
# 正常函数完全跳过micro chunking
return []
```
### 9.2 问题:tree-sitter解析失败
**现象**:对于语法错误的代码,tree-sitter解析可能失败。
**解决方案**
```python
def chunk_file_with_fallback(self, content, file_path, language):
"""带降级策略的分词"""
try:
# 尝试层级分词
return self.chunk_file(content, file_path, language)
except TreeSitterError as e:
logger.warning(f"Tree-sitter parsing failed: {e}")
# 降级到基于正则的简单symbol提取
return self._fallback_regex_chunking(content, file_path)
except Exception as e:
logger.error(f"Chunking failed completely: {e}")
# 最终降级到滑动窗口
return self._fallback_sliding_window(content, file_path, language)
```
### 9.3 问题:向量存储空间占用
**现象**:每个chunk都存储向量,空间占用可能很大。
**解决方案**
- **选择性向量化**只对macro chunks和重要的micro chunks生成向量
- **向量压缩**使用PCA或量化技术减少向量维度
- **分离存储**向量存储在专门的向量数据库如FaissSQLite只存元数据
```python
class SelectiveVectorization:
"""选择性向量化:减少存储开销"""
VECTORIZE_CHUNK_TYPES = {
'function_definition', # 总是向量化
'class_definition', # 总是向量化
'for_statement', # 循环块
'try_statement', # 异常处理
# 'if_statement' 通常不单独向量化依赖父chunk
}
def should_vectorize(self, chunk: HierarchicalChunk) -> bool:
"""判断是否需要为chunk生成向量"""
# Level 1总是向量化
if chunk.metadata.level == 1:
return True
# Level 2根据类型和大小决定
if chunk.metadata.chunk_type not in self.VECTORIZE_CHUNK_TYPES:
return False
        # 太小的块(<5行)不向量化
lines = chunk.metadata.end_line - chunk.metadata.start_line
if lines < 5:
return False
return True
```
## 10. 实施路线图
### Phase 1: 基础架构2-3周
- [x] 设计数据结构HierarchicalChunk, ChunkMetadata
- [ ] 实现MacroChunker复用现有code_extractor
- [ ] 实现基础的MicroChunker
- [ ] 数据库schema设计和migration
- [ ] 单元测试
### Phase 2: LLM集成1-2周
- [ ] 实现HierarchicalLLMEnhancer
- [ ] 设计分层prompt模板
- [ ] 批量处理优化
- [ ] 集成测试
### Phase 3: 向量化与检索1-2周
- [ ] 实现HierarchicalVectorStore
- [ ] 实现ContextualSearchEngine
- [ ] 上下文扩展逻辑
- [ ] 检索性能测试
### Phase 4: 优化与完善2周
- [ ] 性能优化(批量处理、增量更新)
- [ ] 降级策略完善
- [ ] 选择性向量化
- [ ] 全面测试和文档
### Phase 5: 生产部署1周
- [ ] CLI集成
- [ ] 配置选项暴露
- [ ] 生产环境测试
- [ ] 发布
**总计预估时间**7-10周
## 11. 成功指标
1. **覆盖率**95%以上的代码能被正确分词
2. **准确率**:层级关系准确率>98%
3. **检索质量**相比单层分词检索相关性提升30%+
4. **性能**:单文件分词<100ms批量处理>100文件/分钟
5. **存储效率**相比全向量化空间占用减少40%+
## 12. 参考资料
- [Tree-sitter Documentation](https://tree-sitter.github.io/)
- [AST-based Code Analysis](https://en.wikipedia.org/wiki/Abstract_syntax_tree)
- [Hierarchical Text Segmentation](https://arxiv.org/abs/2104.08836)
- 现有代码:`src/codexlens/semantic/chunker.py`

View File

@@ -1,417 +0,0 @@
# Pure Vector Search 使用指南
## 概述
CodexLens 现在支持纯向量语义搜索!这是一个重要的新功能,允许您使用自然语言查询代码。
### 新增搜索模式
| 模式 | 描述 | 最佳用途 | 需要嵌入 |
|------|------|----------|---------|
| `exact` | 精确FTS匹配 | 代码标识符搜索 | ✗ |
| `fuzzy` | 模糊FTS匹配 | 容错搜索 | ✗ |
| `vector` | 向量 + FTS后备 | 语义 + 关键词混合 | ✓ |
| **`pure-vector`** | **纯向量搜索** | **纯自然语言查询** | **✓** |
| `hybrid` | 全部融合(RRF) | 最佳召回率 | ✓ |
### 关键变化
**之前**
```bash
# "vector"模式实际上总是包含exact FTS搜索
codexlens search "authentication" --mode vector
# 即使没有嵌入也会返回FTS结果
```
**现在**
```bash
# "vector"模式仍保持向量+FTS混合向后兼容
codexlens search "authentication" --mode vector
# 新的"pure-vector"模式:仅使用向量搜索
codexlens search "how to authenticate users" --mode pure-vector
# 没有嵌入时返回空列表(明确行为)
```
## 快速开始
### 步骤1:安装语义搜索依赖
```bash
# 方式1使用可选依赖
pip install codexlens[semantic]
# 方式2手动安装
pip install fastembed numpy
```
### 步骤2:创建索引(如果还没有)
```bash
# 为项目创建索引
codexlens init ~/projects/your-project
```
### 步骤3:生成向量嵌入
```bash
# 为项目生成嵌入(自动查找索引)
codexlens embeddings-generate ~/projects/your-project
# 为特定索引生成嵌入
codexlens embeddings-generate ~/.codexlens/indexes/your-project/_index.db
# 使用特定模型
codexlens embeddings-generate ~/projects/your-project --model fast
# 强制重新生成
codexlens embeddings-generate ~/projects/your-project --force
# 检查嵌入状态
codexlens embeddings-status # 检查所有索引
codexlens embeddings-status ~/projects/your-project # 检查特定项目
```
**可用模型**
- `fast`: BAAI/bge-small-en-v1.5 (384维, ~80MB) - 快速,轻量级
- `code`: jinaai/jina-embeddings-v2-base-code (768维, ~150MB) - **代码优化**(推荐,默认)
- `multilingual`: intfloat/multilingual-e5-large (1024维, ~1GB) - 多语言
- `balanced`: mixedbread-ai/mxbai-embed-large-v1 (1024维, ~600MB) - 高精度
### 步骤4:使用纯向量搜索
```bash
# 纯向量搜索(自然语言)
codexlens search "how to verify user credentials" --mode pure-vector
# 向量搜索带FTS后备
codexlens search "authentication logic" --mode vector
# 混合搜索(最佳效果)
codexlens search "user login" --mode hybrid
# 精确代码搜索
codexlens search "authenticate_user" --mode exact
```
## 使用场景
### 场景1:查找实现特定功能的代码
**问题**:"我如何在这个项目中处理用户身份验证?"
```bash
codexlens search "verify user credentials and authenticate" --mode pure-vector
```
**优势**:理解查询意图,找到语义相关的代码,而不仅仅是关键词匹配。
### 场景2:查找类似的代码模式
**问题**:"项目中哪些地方使用了密码哈希?"
```bash
codexlens search "password hashing with salt" --mode pure-vector
```
**优势**:找到即使没有包含"hash"或"password"关键词的相关代码。
### 场景3:探索性搜索
**问题**:"如何在这个项目中连接数据库?"
```bash
codexlens search "database connection and initialization" --mode pure-vector
```
**优势**:发现相关代码,即使使用了不同的术语(如"DB"、"connection pool"、"session")。
### 场景4:混合搜索获得最佳效果
**问题**:既要关键词匹配,又要语义理解
```bash
# 最佳实践使用hybrid模式
codexlens search "authentication" --mode hybrid
```
**优势**结合FTS的精确性和向量搜索的语义理解。
## 故障排除
### 问题1:纯向量搜索返回空结果
**原因**:未生成向量嵌入
**解决方案**
```bash
# 检查嵌入状态
codexlens embeddings-status ~/projects/your-project
# 生成嵌入
codexlens embeddings-generate ~/projects/your-project
# 或者对特定索引
codexlens embeddings-generate ~/.codexlens/indexes/your-project/_index.db
```
### 问题2:ImportError: fastembed not found
**原因**:未安装语义搜索依赖
**解决方案**
```bash
pip install codexlens[semantic]
```
### 问题3:嵌入生成失败
**原因**:模型下载失败或磁盘空间不足
**解决方案**
```bash
# 使用更小的模型
codexlens embeddings-generate ~/projects/your-project --model fast
# 检查磁盘空间(模型需要~100MB
df -h ~/.cache/fastembed
```
### 问题4:搜索速度慢
**原因**向量搜索比FTS慢需要计算余弦相似度
**优化**
- 使用`--limit`限制结果数量
- 考虑使用`vector`模式带FTS后备而不是`pure-vector`
- 对于精确标识符搜索,使用`exact`模式
## 性能对比
基于测试数据100个文件~500个代码块
| 模式 | 平均延迟 | 召回率 | 精确率 |
|------|---------|--------|--------|
| exact | 5.6ms | 中 | 高 |
| fuzzy | 7.7ms | 高 | 中 |
| vector | 7.4ms | 高 | 中 |
| **pure-vector** | **7.0ms** | **最高** | **中** |
| hybrid | 9.0ms | 最高 | 高 |
**结论**
- `exact`: 最快,适合代码标识符
- `pure-vector`: 与vector类似速度更明确的语义搜索
- `hybrid`: 轻微开销,但召回率和精确率最佳
## 最佳实践
### 1. 选择合适的搜索模式
```bash
# 查找函数名/类名/变量名 → exact
codexlens search "UserAuthentication" --mode exact
# 自然语言问题 → pure-vector
codexlens search "how to hash passwords securely" --mode pure-vector
# 不确定用哪个 → hybrid
codexlens search "password security" --mode hybrid
```
### 2. 优化查询
**不好的查询**(对向量搜索):
```bash
codexlens search "auth" --mode pure-vector # 太模糊
```
**好的查询**
```bash
codexlens search "authenticate user with username and password" --mode pure-vector
```
**原则**
- 使用完整句子描述意图
- 包含关键动词和名词
- 避免过于简短或模糊的查询
### 3. 定期更新嵌入
```bash
# 当代码更新后,重新生成嵌入
codexlens embeddings-generate ~/projects/your-project --force
```
### 4. 监控嵌入存储空间
```bash
# 检查嵌入数据大小
du -sh ~/.codexlens/indexes/*/
# 嵌入通常占用索引大小的2-3倍
# 100个文件 → ~500个chunks → ~1.5MB (768维向量)
```
## API 使用示例
### Python API
```python
from pathlib import Path
from codexlens.search.hybrid_search import HybridSearchEngine
# 初始化引擎
engine = HybridSearchEngine()
# 纯向量搜索
results = engine.search(
index_path=Path("~/.codexlens/indexes/project/_index.db"),
query="how to authenticate users",
limit=10,
enable_vector=True,
pure_vector=True, # 纯向量模式
)
for result in results:
print(f"{result.path}: {result.score:.3f}")
print(f" {result.excerpt}")
# 向量搜索带FTS后备
results = engine.search(
index_path=Path("~/.codexlens/indexes/project/_index.db"),
query="authentication",
limit=10,
enable_vector=True,
pure_vector=False, # 允许FTS后备
)
```
### 链式搜索API
```python
from codexlens.search.chain_search import ChainSearchEngine, SearchOptions
from codexlens.storage.registry import RegistryStore
from codexlens.storage.path_mapper import PathMapper
# 初始化
registry = RegistryStore()
registry.initialize()
mapper = PathMapper()
engine = ChainSearchEngine(registry, mapper)
# 配置搜索选项
options = SearchOptions(
depth=-1, # 无限深度
total_limit=20,
hybrid_mode=True,
enable_vector=True,
pure_vector=True, # 纯向量搜索
)
# 执行搜索
result = engine.search(
query="verify user credentials",
source_path=Path("~/projects/my-app"),
options=options
)
print(f"Found {len(result.results)} results in {result.stats.time_ms:.1f}ms")
```
## 技术细节
### 向量存储架构
```
_index.db (SQLite)
├── files # 文件索引表
├── files_fts # FTS5全文索引
├── files_fts_fuzzy # 模糊搜索索引
└── semantic_chunks # 向量嵌入表 ✓ 新增
├── id
├── file_path
├── content # 代码块内容
├── embedding # 向量嵌入(BLOB, float32)
├── metadata # JSON元数据
└── created_at
```
### 向量搜索流程
```
1. 查询嵌入化
└─ query → Embedder → query_embedding (768维向量)
2. 相似度计算
└─ VectorStore.search_similar()
├─ 加载embedding matrix到内存
├─ NumPy向量化余弦相似度计算
└─ Top-K选择
3. 结果返回
└─ SearchResult对象列表
├─ path: 文件路径
├─ score: 相似度分数
├─ excerpt: 代码片段
└─ metadata: 元数据
```
### RRF融合算法
混合模式使用Reciprocal Rank Fusion (RRF)
```python
# 默认权重
weights = {
"exact": 0.4, # 40% 精确FTS
"fuzzy": 0.3, # 30% 模糊FTS
"vector": 0.3, # 30% 向量搜索
}
# RRF公式
score(doc) = Σ weight[source] / (k + rank[source])
k = 60 # RRF常数
```
## 未来改进
- [ ] 增量嵌入更新(当前需要完全重新生成)
- [ ] 混合分块策略symbol-based + sliding window
- [ ] FAISS加速100x+速度提升)
- [ ] 向量压缩减少50%存储空间)
- [ ] 查询扩展(同义词、相关术语)
- [ ] 多模态搜索(代码 + 文档 + 注释)
## 相关资源
- **实现文件**
- `codexlens/search/hybrid_search.py` - 混合搜索引擎
- `codexlens/semantic/embedder.py` - 嵌入生成
- `codexlens/semantic/vector_store.py` - 向量存储
- `codexlens/semantic/chunker.py` - 代码分块
- **测试文件**
- `tests/test_pure_vector_search.py` - 纯向量搜索测试
- `tests/test_search_comparison.py` - 搜索模式对比
- **文档**
- `SEARCH_COMPARISON_ANALYSIS.md` - 详细技术分析
- `SEARCH_ANALYSIS_SUMMARY.md` - 快速总结
## 反馈和贡献
如果您发现问题或有改进建议请提交issue或PR
- GitHub: https://github.com/your-org/codexlens
## 更新日志
### v0.5.0 (2025-12-16)
- ✨ 新增 `pure-vector` 搜索模式
- ✨ 添加向量嵌入生成脚本
- 🔧 修复"vector"模式总是包含exact FTS的问题
- 📚 更新文档和使用指南
- ✅ 添加纯向量搜索测试套件
---
**问题?** 查看 [故障排除](#故障排除) 章节或提交issue。

View File

@@ -1,825 +0,0 @@
# CodexLens Real LSP Server Implementation Plan
> **Version**: 2.0
> **Status**: Ready for Implementation
> **Based on**: Existing LSP_INTEGRATION_PLAN.md + Real Language Server Integration
> **Goal**: Implement true LSP server functionality (like cclsp), not pre-indexed search
---
## Executive Summary
### Current State vs Target State
| Aspect | Current (Pre-indexed) | Target (Real LSP) |
|--------|----------------------|-------------------|
| **Data Source** | Cached database index | Live language servers |
| **Freshness** | Stale (depends on re-index) | Real-time (LSP protocol) |
| **Accuracy** | Good for indexed content | Perfect (from language server) |
| **Latency** | <50ms (database) | ~50-200ms (LSP) |
| **Language Support** | Limited to parsed symbols | Full LSP support (all languages) |
| **Complexity** | Simple (DB queries) | High (LSP protocol + server mgmt) |
### Why Real LSP vs Index-Based
**Problem with current approach**:
- 符号搜索与smart_search没有本质区别
- 依赖预索引数据,不能实时反映代码变化
- 不支持advanced LSP功能(rename, code actions等)
**Advantages of real LSP**:
- ✅ Real-time code intelligence
- ✅ Supported by all major IDEs (VSCode, Neovim, Sublime, etc.)
- ✅ Standard protocol (Language Server Protocol)
- ✅ Advanced features: rename, code actions, formatting
- ✅ Language-agnostic (TypeScript, Python, Go, Rust, Java, etc.)
---
## Architecture Design
### System Architecture
```
┌─────────────────────────────────────────────────────────┐
│ Client Layer │
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
│ │ VS Code │ │ Neovim │ │ Sublime │ │
│ │ (LSP Client) │ │ (LSP Client) │ │ (LSP Client) │ │
│ └──────┬───────┘ └──────┬───────┘ └──────┬───────┘ │
│ │ │ │ │
└─────────┼─────────────────┼─────────────────┼───────────┘
│ LSP Protocol │ │
│ (JSON-RPC/stdio)│ │
┌─────────▼─────────────────▼─────────────────▼───────────┐
│ CodexLens LSP Server Bridge │
│ ┌─────────────────────────────────────────────────────┐ │
│ │ LSP Protocol Handler (pygls) │ │
│ │ • initialize / shutdown │ │
│ │ • textDocument/definition │ │
│ │ • textDocument/references │ │
│ │ • textDocument/hover │ │
│ │ • textDocument/completion │ │
│ │ • textDocument/formatting │ │
│ │ • workspace/symbol │ │
│ └────────────────────┬────────────────────────────────┘ │
│ │ │
│ ┌────────────────────▼────────────────────────────────┐ │
│ │ Language Server Multiplexer │ │
│ │ • File type routing (ts→tsserver, py→pylsp, etc.) │ │
│ │ • Multi-server management │ │
│ │ • Request forwarding & response formatting │ │
│ └────────────────────┬────────────────────────────────┘ │
│ │ │
│ ┌────────────────────▼────────────────────────────────┐ │
│ │ Language Servers (Spawned) │ │
│ │ ┌────────┐ ┌────────┐ ┌────────┐ ┌────────┐ │ │
│ │ │tsserver│ │ pylsp │ │ gopls │ │rust- │ │ │
│ │ │ │ │ │ │ │ │analyzer│ │ │
│ │ └────────┘ └────────┘ └────────┘ └────────┘ │ │
│ └─────────────────────────────────────────────────────┘ │
│ │
│ ┌─────────────────────────────────────────────────────┐ │
│ │ Codex-Lens Core (Optional - MCP Layer) │ │
│ │ • Semantic search │ │
│ │ • Custom MCP tools (enrich_prompt, etc.) │ │
│ │ • Hook system (pre-tool, post-tool) │ │
│ └─────────────────────────────────────────────────────┘ │
└─────────────────────────────────────────────────────────┘
```
### Key Differences from Index-Based Approach
1. **Request Flow**
- Index: Query → Database → Results
- LSP: Request → Route to LS → LS processes live code → Results
2. **Configuration**
- Index: Depends on indexing state
- LSP: Depends on installed language servers
3. **Latency Profile**
- Index: Consistent (~50ms)
- LSP: Variable (50-500ms depending on LS performance)
---
## Implementation Phases
### Phase 1: LSP Server Bridge (Foundation)
**Duration**: ~3-5 days
**Complexity**: Medium
**Dependencies**: pygls library
#### 1.1 Setup & Dependencies
**File**: `pyproject.toml`
```toml
[project.optional-dependencies]
lsp = [
"pygls>=1.3.0",
"lsprotocol>=2023.0.0",
]
[project.scripts]
codexlens-lsp = "codexlens.lsp.server:main"
```
**Installation**:
```bash
pip install -e ".[lsp]"
```
#### 1.2 LSP Server Core
**Files to create**:
1. `src/codexlens/lsp/__init__.py` - Package init
2. `src/codexlens/lsp/server.py` - Server entry point
3. `src/codexlens/lsp/multiplexer.py` - LS routing & management
4. `src/codexlens/lsp/handlers.py` - LSP request handlers
**Key responsibilities**:
- Initialize LSP server via pygls
- Handle client capabilities negotiation
- Route requests to appropriate language servers
- Format language server responses to LSP format
#### 1.3 Acceptance Criteria
- [ ] Server starts with `codexlens-lsp --stdio`
- [ ] Responds to `initialize` request
- [ ] Spawns language servers on demand
- [ ] Handles `shutdown` cleanly
- [ ] No crashes on malformed requests
---
### Phase 2: Language Server Multiplexer
**Duration**: ~5-7 days
**Complexity**: High
**Dependencies**: Phase 1 complete
#### 2.1 Multi-Server Management
**File**: `src/codexlens/lsp/multiplexer.py`
**Responsibilities**:
- Spawn language servers based on file extension
- Maintain server process lifecycle
- Route requests by document type
- Handle server crashes & restarts
**Supported Language Servers**:
| Language | Server | Installation |
|----------|--------|--------------|
| TypeScript | `typescript-language-server` | `npm i -g typescript-language-server` |
| Python | `pylsp` | `pip install python-lsp-server` |
| Go | `gopls` | `go install golang.org/x/tools/gopls@latest` |
| Rust | `rust-analyzer` | `rustup component add rust-analyzer` |
| Java | `jdtls` | Download JDTLS |
| C/C++ | `clangd` | `apt install clangd` |
#### 2.2 Configuration
**File**: `codexlens-lsp.json` (user config)
```json
{
"languageServers": {
"typescript": {
"command": ["typescript-language-server", "--stdio"],
"extensions": ["ts", "tsx", "js", "jsx"],
"rootDir": "."
},
"python": {
"command": ["pylsp"],
"extensions": ["py", "pyi"],
"rootDir": ".",
"settings": {
"pylsp": {
"plugins": {
"pycodestyle": { "enabled": true },
"pylint": { "enabled": false }
}
}
}
},
"go": {
"command": ["gopls"],
"extensions": ["go"],
"rootDir": "."
},
"rust": {
"command": ["rust-analyzer"],
"extensions": ["rs"],
"rootDir": "."
}
},
"debug": false,
"logLevel": "info"
}
```
#### 2.3 Acceptance Criteria
- [ ] Routes requests to correct LS based on file type
- [ ] Spawns servers on first request
- [ ] Reuses existing server instances
- [ ] Handles server restarts on crash
- [ ] Respects initialization options from config
---
### Phase 3: Core LSP Handlers
**Duration**: ~5-7 days
**Complexity**: Medium
**Dependencies**: Phase 1-2 complete
#### 3.1 Essential Handlers
Implement LSP request handlers for core functionality:
**Handler Mapping**:
```python
Handlers = {
# Navigation
"textDocument/definition": handle_definition,
"textDocument/references": handle_references,
"textDocument/declaration": handle_declaration,
# Hover & Info
"textDocument/hover": handle_hover,
"textDocument/signatureHelp": handle_signature_help,
# Completion
"textDocument/completion": handle_completion,
"completionItem/resolve": handle_completion_resolve,
# Symbols
"textDocument/documentSymbol": handle_document_symbols,
"workspace/symbol": handle_workspace_symbols,
# Editing
"textDocument/formatting": handle_formatting,
"textDocument/rangeFormatting": handle_range_formatting,
"textDocument/rename": handle_rename,
# Diagnostics
"textDocument/publishDiagnostics": handle_publish_diagnostics,
# Misc
"textDocument/codeAction": handle_code_action,
"textDocument/codeLens": handle_code_lens,
}
```
#### 3.2 Request Forwarding Logic
```python
def forward_request_to_lsp(handler_name, params):
"""Forward request to appropriate language server."""
# Extract document info
document_uri = params.get("textDocument", {}).get("uri")
file_ext = extract_extension(document_uri)
# Get language server
ls = multiplexer.get_server(file_ext)
if not ls:
return {"error": f"No LS for {file_ext}"}
# Convert position (1-based → 0-based)
normalized_params = normalize_positions(params)
# Forward to LS
response = ls.send_request(handler_name, normalized_params)
# Convert response format
return normalize_response(response)
```
#### 3.3 Acceptance Criteria
- [ ] All handlers implemented and tested
- [ ] Proper position coordinate conversion (LSP is 0-based, user-facing is 1-based)
- [ ] Error handling for missing language servers
- [ ] Response formatting matches LSP spec
- [ ] Latency < 500ms for 95th percentile
---
### Phase 4: Advanced Features
**Duration**: ~3-5 days
**Complexity**: Medium
**Dependencies**: Phase 1-3 complete
#### 4.1 Position Tolerance (cclsp-like feature)
Some LSP clients (like Claude Code with fuzzy positions) may send imprecise positions. Implement retry logic:
```python
def find_symbol_with_tolerance(ls, uri, position, max_attempts=5):
"""Try multiple position offsets if exact position fails."""
positions_to_try = [
position, # Original
(position.line - 1, position.char), # One line up
(position.line + 1, position.char), # One line down
(position.line, max(0, position.char - 1)), # One char left
(position.line, position.char + 1), # One char right
]
for pos in positions_to_try:
try:
result = ls.send_request("textDocument/definition", {
"textDocument": {"uri": uri},
"position": pos
})
if result:
return result
except:
continue
return None
```
#### 4.2 MCP Integration (Optional)
Extend with MCP provider for Claude Code hooks:
```python
class MCPBridgeHandler:
"""Bridge LSP results into MCP context."""
def build_mcp_context_from_lsp(self, symbol_name, lsp_results):
"""Convert LSP responses to MCP context."""
# Implementation
pass
```
#### 4.3 Acceptance Criteria
- [ ] Position tolerance working (≥3 positions tried)
- [ ] MCP context generation functional
- [ ] Hook system integration complete
- [ ] All test coverage > 80%
---
### Phase 5: Deployment & Documentation
**Duration**: ~2-3 days
**Complexity**: Low
**Dependencies**: Phase 1-4 complete
#### 5.1 Installation & Setup Guide
Create comprehensive documentation:
- Installation instructions for each supported language
- Configuration guide
- Troubleshooting
- Performance tuning
#### 5.2 CLI Tools
```bash
# Start LSP server
codexlens-lsp --stdio
# Check configured language servers
codexlens-lsp --list-servers
# Validate configuration
codexlens-lsp --validate-config
# Show logs
codexlens-lsp --log-level debug
```
#### 5.3 Acceptance Criteria
- [ ] Documentation complete with examples
- [ ] All CLI commands working
- [ ] Integration tested with VS Code, Neovim
- [ ] Performance benchmarks documented
---
## Module Structure
```
src/codexlens/lsp/
├── __init__.py # Package exports
├── server.py # LSP server entry point
├── multiplexer.py # Language server manager
├── handlers.py # LSP request handlers
├── position_utils.py # Coordinate conversion utilities
├── process_manager.py # Language server process lifecycle
├── response_formatter.py # LSP response formatting
└── config.py # Configuration loading
tests/lsp/
├── test_multiplexer.py # LS routing tests
├── test_handlers.py # Handler tests
├── test_position_conversion.py # Coordinate tests
├── test_integration.py # Full LSP handshake
└── fixtures/
├── sample_python.py # Test files
└── sample_typescript.ts
```
---
## Dependency Graph
```
Phase 5 (Deployment)
Phase 4 (Advanced Features)
Phase 3 (Core Handlers)
├─ Depends on: Phase 2
├─ Depends on: Phase 1
└─ Deliverable: Full LSP functionality
Phase 2 (Multiplexer)
├─ Depends on: Phase 1
└─ Deliverable: Multi-server routing
Phase 1 (Server Bridge)
└─ Deliverable: Basic LSP server
```
---
## Technology Stack
| Component | Technology | Rationale |
|-----------|-----------|-----------|
| LSP Implementation | `pygls` | Mature, well-maintained |
| Protocol | LSP 3.17+ | Latest stable version |
| Process Management | `subprocess` + `psutil` | Stdlib `subprocess` plus the lightweight `psutil` package |
| Configuration | JSON | Simple, widely understood |
| Logging | `logging` module | Built-in, standard |
| Testing | `pytest` + `pytest-asyncio` | Industry standard |
---
## Risk Assessment
| Risk | Probability | Impact | Mitigation |
|------|-------------|--------|------------|
| Language server crashes | Medium | High | Auto-restart with exponential backoff |
| Configuration errors | Medium | Medium | Validation on startup |
| Performance degradation | Low | High | Implement caching + benchmarks |
| Position mismatch issues | Medium | Low | Tolerance layer (try multiple positions) |
| Memory leaks (long sessions) | Low | Medium | Connection pooling + cleanup timers |
---
## Success Metrics
1. **Functionality**: All 7 core LSP handlers working
2. **Performance**: p95 latency < 500ms for typical requests
3. **Reliability**: 99.9% uptime in production
4. **Coverage**: >80% code coverage
5. **Documentation**: Complete with examples
6. **Multi-language**: Support for 5+ languages
---
## Comparison: This Approach vs Alternatives
### Option A: Real LSP Server (This Plan) ✅ RECOMMENDED
**Pros**:
- ✅ True real-time code intelligence
- ✅ Supports all LSP clients (VSCode, Neovim, Sublime, Emacs, etc.)
- ✅ Advanced features (rename, code actions, formatting)
- ✅ Language-agnostic
- ✅ Follows industry standard protocol
**Cons**:
- ❌ More complex implementation
- ❌ Depends on external language servers
- ❌ Higher latency than index-based
**Effort**: ~20-25 days
---
### Option B: Enhanced Index-Based (Current Approach)
**Pros**:
- ✅ Simple implementation
- ✅ Fast (<50ms)
- ✅ No external dependencies
**Cons**:
- ❌ Same as smart_search (user's concern)
- ❌ Stale data between re-indexes
- ❌ Limited to indexed symbols
- ❌ No advanced LSP features
**Effort**: ~5-10 days
---
### Option C: Hybrid (LSP + Index)
**Pros**:
- ✅ Real-time from LSP
- ✅ Fallback to index
- ✅ Best of both worlds
**Cons**:
- ❌ Highest complexity
- ❌ Difficult to debug conflicts
- ❌ Higher maintenance burden
**Effort**: ~30-35 days
---
## Next Steps
1. **Approve Plan**: Confirm this approach matches requirements
2. **Setup Dev Environment**: Install language servers
3. **Phase 1 Implementation**: Start with server bridge
4. **Iterative Testing**: Test each phase with real IDE integration
5. **Documentation**: Maintain docs as implementation progresses
---
---
## Appendix A: VSCode Bridge Implementation
### A.1 Overview
VSCode Bridge 是另一种集成方式通过VSCode扩展暴露其内置LSP功能给外部工具如CCW MCP Server
**Architecture**:
```
┌─────────────────────────────────────────────────────────────────┐
│ Claude Code / CCW │
│ (MCP Client / CLI) │
└───────────────────────────┬─────────────────────────────────────┘
│ MCP Tool Call (vscode_lsp)
┌───────────────────────────▼─────────────────────────────────────┐
│ CCW MCP Server │
│ ┌─────────────────────────────────────────────────────────────┐ │
│ │ vscode_lsp Tool │ │
│ │ • HTTP client to VSCode Bridge │ │
│ │ • Parameter validation (Zod) │ │
│ │ • Response formatting │ │
│ └────────────────────────┬────────────────────────────────────┘ │
└───────────────────────────┼─────────────────────────────────────┘
│ HTTP POST (localhost:3457)
┌───────────────────────────▼─────────────────────────────────────┐
│ ccw-vscode-bridge Extension │
│ ┌─────────────────────────────────────────────────────────────┐ │
│ │ HTTP Server (port 3457) │ │
│ │ Endpoints: │ │
│ │ • POST /get_definition │ │
│ │ • POST /get_references │ │
│ │ • POST /get_hover │ │
│ │ • POST /get_document_symbols │ │
│ └────────────────────────┬────────────────────────────────────┘ │
│ │ │
│ ┌────────────────────────▼────────────────────────────────────┐ │
│ │ VSCode API Calls │ │
│ │ vscode.commands.executeCommand(): │ │
│ │ • vscode.executeDefinitionProvider │ │
│ │ • vscode.executeReferenceProvider │ │
│ │ • vscode.executeHoverProvider │ │
│ │ • vscode.executeDocumentSymbolProvider │ │
│ └─────────────────────────────────────────────────────────────┘ │
└─────────────────────────────────────────────────────────────────┘
│ VSCode LSP Integration
┌───────────────────────────▼─────────────────────────────────────┐
│ VSCode Language Services │
│ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ │
│ │TypeScript│ │ Python │ │ Go │ │ Rust │ │
│ │ Server │ │ Server │ │ (gopls) │ │Analyzer │ │
│ └─────────┘ └─────────┘ └─────────┘ └─────────┘ │
└─────────────────────────────────────────────────────────────────┘
```
### A.2 Component Files
**已创建的文件**:
1. `ccw-vscode-bridge/package.json` - VSCode扩展配置
2. `ccw-vscode-bridge/tsconfig.json` - TypeScript配置
3. `ccw-vscode-bridge/src/extension.ts` - 扩展主代码
4. `ccw-vscode-bridge/.vscodeignore` - 打包排除文件
5. `ccw-vscode-bridge/README.md` - 使用文档
**待创建的文件**:
1. `ccw/src/tools/vscode-lsp.ts` - MCP工具实现
2. `ccw/src/tools/index.ts` - 注册新工具
### A.3 VSCode Bridge Extension Implementation
**File**: `ccw-vscode-bridge/src/extension.ts`
```typescript
// 核心功能:
// 1. 启动HTTP服务器监听3457端口
// 2. 接收POST请求解析JSON body
// 3. 调用VSCode内置LSP命令
// 4. 返回JSON结果
// HTTP Endpoints:
// POST /get_definition → vscode.executeDefinitionProvider
// POST /get_references → vscode.executeReferenceProvider
// POST /get_hover → vscode.executeHoverProvider
// POST /get_document_symbols → vscode.executeDocumentSymbolProvider
```
### A.4 MCP Tool Implementation
**File**: `ccw/src/tools/vscode-lsp.ts`
```typescript
/**
* MCP tool that communicates with VSCode Bridge extension.
*
* Actions:
* - get_definition: Find symbol definition
* - get_references: Find all references
* - get_hover: Get hover information
* - get_document_symbols: List symbols in file
*
* Required:
* - ccw-vscode-bridge extension running in VSCode
* - File must be open in VSCode for accurate results
*/
const schema: ToolSchema = {
name: 'vscode_lsp',
description: `Access live VSCode LSP features...`,
inputSchema: {
type: 'object',
properties: {
action: { type: 'string', enum: [...] },
file_path: { type: 'string' },
line: { type: 'number' },
character: { type: 'number' }
},
required: ['action', 'file_path']
}
};
```
### A.5 Advantages vs Standalone LSP Server
| Feature | VSCode Bridge | Standalone LSP Server |
|---------|--------------|----------------------|
| **Setup Complexity** | Low (VSCode ext) | Medium (multiple LS) |
| **Language Support** | Automatic (VSCode) | Manual config |
| **Maintenance** | Low | Medium |
| **IDE Independence** | VSCode only | Any LSP client |
| **Performance** | Good | Good |
| **Advanced Features** | Full VSCode support | LSP standard |
---
## Appendix B: Complete Integration Architecture
### B.1 Three Integration Paths
```
┌─────────────────────────────────────────────────────────────────────────────┐
│ CodexLens Integration Paths │
├─────────────────────────────────────────────────────────────────────────────┤
│ │
│ Path 1: VSCode Bridge (HTTP) Path 2: Standalone LSP Server │
│ ──────────────────────── ───────────────────────────── │
│ │
│ ┌─────────────┐ ┌─────────────┐ │
│ │ CCW MCP │ │ Any LSP │ │
│ │ vscode_lsp │ │ Client │ │
│ └──────┬──────┘ └──────┬──────┘ │
│ │ HTTP │ LSP/stdio │
│ ▼ ▼ │
│ ┌─────────────┐ ┌─────────────┐ │
│ │ ccw-vscode │ │ codexlens- │ │
│ │ -bridge │ │ lsp │ │
│ └──────┬──────┘ └──────┬──────┘ │
│ │ VSCode API │ Child Process │
│ ▼ ▼ │
│ ┌─────────────┐ ┌─────────────┐ │
│ │ VSCode │ │ pylsp │ │
│ │ LS │ │ tsserver │ │
│ └─────────────┘ │ gopls │ │
│ └─────────────┘ │
│ │
│ Path 3: Index-Based (Current) │
│ ───────────────────────────── │
│ │
│ ┌─────────────┐ │
│ │ CCW MCP │ │
│ │codex_lens_lsp│ │
│ └──────┬──────┘ │
│ │ Python subprocess │
│ ▼ │
│ ┌─────────────┐ │
│ │ CodexLens │ │
│ │ Index DB │ │
│ └─────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────────────────┘
```
### B.2 Recommendation Matrix
| Use Case | Recommended Path | Reason |
|----------|-----------------|--------|
| Claude Code + VSCode | Path 1: VSCode Bridge | Simplest, full VSCode features |
| CLI-only workflows | Path 2: Standalone LSP | No VSCode dependency |
| Quick search across indexed code | Path 3: Index-based | Fastest response |
| Multi-IDE support | Path 2: Standalone LSP | Standard protocol |
| Advanced refactoring | Path 1: VSCode Bridge | Full VSCode capabilities |
### B.3 Hybrid Mode (Recommended)
For maximum flexibility, implement all three paths:
```javascript
// Smart routing in CCW
async function selectLSPPath(request) {
// 1. Try VSCode Bridge first (if available)
if (await checkVSCodeBridge()) {
return "vscode_bridge";
}
// 2. Fall back to Standalone LSP
if (await checkStandaloneLSP(request.fileType)) {
return "standalone_lsp";
}
// 3. Last resort: Index-based
return "index_based";
}
```
---
## Appendix C: Implementation Tasks Summary
### C.1 VSCode Bridge Tasks
| Task ID | Description | Priority | Status |
|---------|-------------|----------|--------|
| VB-1 | Create ccw-vscode-bridge extension structure | High | ✅ Done |
| VB-2 | Implement HTTP server in extension.ts | High | ✅ Done |
| VB-3 | Create vscode_lsp MCP tool | High | 🔄 Pending |
| VB-4 | Register tool in CCW | High | 🔄 Pending |
| VB-5 | Test with VSCode | Medium | 🔄 Pending |
| VB-6 | Add connection retry logic | Low | 🔄 Pending |
### C.2 Standalone LSP Server Tasks
| Task ID | Description | Priority | Status |
|---------|-------------|----------|--------|
| LSP-1 | Setup pygls project structure | High | 🔄 Pending |
| LSP-2 | Implement multiplexer | High | 🔄 Pending |
| LSP-3 | Core handlers (definition, references) | High | 🔄 Pending |
| LSP-4 | Position tolerance | Medium | 🔄 Pending |
| LSP-5 | Tests and documentation | Medium | 🔄 Pending |
### C.3 Integration Tasks
| Task ID | Description | Priority | Status |
|---------|-------------|----------|--------|
| INT-1 | Smart path routing | Medium | 🔄 Pending |
| INT-2 | Unified error handling | Medium | 🔄 Pending |
| INT-3 | Performance benchmarks | Low | 🔄 Pending |
---
## Questions for Clarification
Before implementation, confirm:
1. **Implementation Priority**: Start with VSCode Bridge (simpler) or Standalone LSP (more general)?
2. **Language Priority**: Which languages are most important? (TypeScript, Python, Go, Rust, etc.)
3. **IDE Focus**: Target VS Code first, then others?
4. **Fallback Strategy**: Should we keep index-based search as fallback if LSP fails?
5. **Caching**: How much should we cache LS responses?
6. **Configuration**: Simple JSON config or more sophisticated format?

View File

@@ -1,192 +0,0 @@
# CodexLens 搜索分析 - 执行摘要
## 🎯 核心发现
### 问题1:向量搜索为什么返回空结果?
**根本原因**:向量嵌入数据不存在
-`semantic_chunks` 表未创建
- ✗ 从未执行向量嵌入生成流程
- ✗ 向量索引数据库实际是 SQLite 中的一个表,不是独立文件
**位置**:向量数据存储在 `~/.codexlens/indexes/项目名/_index.db``semantic_chunks` 表中
### 问题2:向量索引数据库在哪里?
**存储架构**
```
~/.codexlens/indexes/
└── project-name/
└── _index.db ← SQLite数据库
├── files ← 文件索引表
├── files_fts ← FTS5全文索引
├── files_fts_fuzzy ← 模糊搜索索引
└── semantic_chunks ← 向量嵌入表(当前不存在!)
```
**不是独立数据库**:向量数据集成在 SQLite 索引文件中,而不是单独的向量数据库。
### 问题3:当前架构是否发挥了并行效果?
**✓ 是的!架构非常优秀**
- **双层并行**
- 第1层:单索引内 exact/fuzzy/vector 三种搜索方法并行
- 第2层:跨多个目录索引并行搜索
- **性能表现**:混合模式仅增加 1.6x 开销(9ms vs 5.6ms)
- **资源利用**ThreadPoolExecutor 充分利用 I/O 并发
## ⚡ 快速修复
### 立即解决向量搜索问题
**步骤1安装依赖**
```bash
pip install codexlens[semantic]
# 或
pip install fastembed numpy
```
**步骤2生成向量嵌入**
创建脚本 `generate_embeddings.py`:
```python
from pathlib import Path
from codexlens.semantic.embedder import Embedder
from codexlens.semantic.vector_store import VectorStore
from codexlens.semantic.chunker import Chunker, ChunkConfig
import sqlite3
def generate_embeddings(index_db_path: Path):
embedder = Embedder(profile="code")
vector_store = VectorStore(index_db_path)
chunker = Chunker(config=ChunkConfig(max_chunk_size=2000))
with sqlite3.connect(index_db_path) as conn:
conn.row_factory = sqlite3.Row
files = conn.execute("SELECT full_path, content FROM files").fetchall()
for file_row in files:
chunks = chunker.chunk_sliding_window(
file_row["content"],
file_path=file_row["full_path"],
language="python"
)
for chunk in chunks:
chunk.embedding = embedder.embed_single(chunk.content)
if chunks:
vector_store.add_chunks(chunks, file_row["full_path"])
```
**步骤3执行生成**
```bash
python generate_embeddings.py ~/.codexlens/indexes/codex-lens/_index.db
```
**步骤4验证**
```bash
# 检查数据
sqlite3 ~/.codexlens/indexes/codex-lens/_index.db \
"SELECT COUNT(*) FROM semantic_chunks"
# 测试搜索
codexlens search "authentication credentials" --mode vector
```
## 🔍 关键洞察
### 发现Vector模式不是纯向量搜索
**当前行为**
```python
# hybrid_search.py:73
backends = {"exact": True} # ⚠️ exact搜索总是启用
if enable_vector:
backends["vector"] = True
```
**影响**
- "vector模式"实际是 **vector + exact 混合模式**
- 即使向量搜索返回空仍有exact FTS结果
- 这就是为什么"向量搜索"在无嵌入时也有结果
**建议修复**:添加 `pure_vector` 参数以支持真正的纯向量搜索
## 📊 搜索模式对比
| 模式 | 延迟 | 召回率 | 适用场景 | 需要嵌入 |
|------|------|--------|----------|---------|
| **exact** | 5.6ms | 中 | 代码标识符 | ✗ |
| **fuzzy** | 7.7ms | 高 | 容错搜索 | ✗ |
| **vector** | 7.4ms | 最高 | 语义搜索 | ✓ |
| **hybrid** | 9.0ms | 最高 | 通用搜索 | ✓ |
**推荐**
- 代码搜索 → `--mode exact`
- 自然语言 → `--mode hybrid`(需先生成嵌入)
- 容错搜索 → `--mode fuzzy`
## 📈 优化路线图
### P0 - 立即 (本周)
- [x] 生成向量嵌入
- [ ] 验证向量搜索可用
- [ ] 更新使用文档
### P1 - 短期 (2周)
- [ ] 添加 `pure_vector` 模式
- [ ] 增量嵌入更新
- [ ] 改进错误提示
### P2 - 中期 (1-2月)
- [ ] 混合分块策略
- [ ] 查询扩展
- [ ] 自适应权重
### P3 - 长期 (3-6月)
- [ ] FAISS加速
- [ ] 向量压缩
- [ ] 多模态搜索
## 📚 详细文档
完整分析报告:`SEARCH_COMPARISON_ANALYSIS.md`
包含内容:
- 详细问题诊断
- 架构深度分析
- 完整解决方案
- 代码示例
- 实施检查清单
## 🎓 学习要点
1. **向量搜索需要主动生成嵌入**:不会自动创建
2. **双层并行架构很优秀**:无需额外优化
3. **RRF融合算法工作良好**:多源结果合理融合
4. **Vector模式非纯向量**包含FTS作为后备
## 💡 下一步行动
```bash
# 1. 安装依赖
pip install codexlens[semantic]
# 2. 创建索引(如果还没有)
codexlens init ~/projects/your-project
# 3. 生成嵌入
python generate_embeddings.py ~/.codexlens/indexes/your-project/_index.db
# 4. 测试搜索
codexlens search "your natural language query" --mode hybrid
```
---
**问题解决**: ✓ 已识别并提供解决方案
**架构评估**: ✓ 并行架构优秀,充分发挥效能
**优化建议**: ✓ 提供短期、中期、长期优化路线
**联系**: 详见 `SEARCH_COMPARISON_ANALYSIS.md` 获取完整技术细节

View File

@@ -1,711 +0,0 @@
# CodexLens 搜索模式对比分析报告
**生成时间**: 2025-12-16
**分析目标**: 对比向量搜索和混合搜索效果,诊断向量搜索返回空结果的原因,评估并行架构效能
---
## 执行摘要
通过深入的代码分析和实验测试,我们发现了向量搜索在当前实现中的几个关键问题,并提供了针对性的优化方案。
### 核心发现
1. **向量搜索返回空结果的根本原因**:缺少向量嵌入数据(semantic_chunks表为空)
2. **混合搜索架构设计优秀**:使用了双层并行架构,性能表现良好
3. **向量搜索模式的语义问题**:"vector模式"实际上总是包含exact搜索,不是纯向量搜索
---
## 1. 问题诊断
### 1.1 向量索引数据库位置
**存储架构**
- **位置**: 向量数据集成存储在SQLite索引文件中`_index.db`
- **表名**: `semantic_chunks`
- **字段结构**:
- `id`: 主键
- `file_path`: 文件路径
- `content`: 代码块内容
- `embedding`: 向量嵌入(BLOB格式,numpy float32数组)
- `metadata`: JSON格式元数据
- `created_at`: 创建时间
**默认存储路径**
- 全局索引: `~/.codexlens/indexes/`
- 项目索引: `项目目录/.codexlens/`
- 每个目录一个 `_index.db` 文件
**为什么没有看到向量数据库**
向量数据不是独立数据库而是与FTS索引共存于同一个SQLite文件中的`semantic_chunks`表。如果该表不存在或为空,说明从未生成过向量嵌入。
### 1.2 向量搜索返回空结果的原因
**代码分析** (`hybrid_search.py:195-253`):
```python
def _search_vector(self, index_path: Path, query: str, limit: int) -> List[SearchResult]:
try:
# 检查1: semantic_chunks表是否存在
conn = sqlite3.connect(index_path)
cursor = conn.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_chunks'"
)
has_semantic_table = cursor.fetchone() is not None
conn.close()
if not has_semantic_table:
self.logger.debug("No semantic_chunks table found")
return [] # ❌ 返回空列表
# 检查2: 向量存储是否有数据
vector_store = VectorStore(index_path)
if vector_store.count_chunks() == 0:
self.logger.debug("Vector store is empty")
return [] # ❌ 返回空列表
# 正常向量搜索流程...
except Exception as exc:
return [] # ❌ 异常也返回空列表
```
**失败路径**
1. `semantic_chunks`表不存在 → 返回空
2. 表存在但无数据 → 返回空
3. 语义搜索依赖未安装 → 返回空
4. 任何异常 → 返回空
**当前状态诊断**
通过测试验证,当前项目中:
-`semantic_chunks`表不存在
- ✗ 未执行向量嵌入生成流程
- ✗ 向量索引从未创建
**解决方案**需要执行向量嵌入生成流程见第3节
### 1.3 混合搜索 vs 向量搜索的实际行为
**重要发现**:当前实现中,"vector模式"并非纯向量搜索。
**代码证据** (`hybrid_search.py:72-77`):
```python
def search(self, ...):
# Determine which backends to use
backends = {"exact": True} # ⚠️ exact搜索总是启用
if enable_fuzzy:
backends["fuzzy"] = True
if enable_vector:
backends["vector"] = True
```
**影响**
- 即使设置为"vector模式"(`enable_fuzzy=False, enable_vector=True`),exact搜索仍然运行
- 当向量搜索返回空时,RRF融合仍会包含exact搜索的结果
- 这导致"向量搜索"在没有嵌入数据时仍返回结果(来自exact FTS)
**测试验证**
```
测试场景有FTS索引但无向量嵌入
查询:"authentication"
预期行为(纯向量模式):
- 向量搜索: 0 结果(无嵌入数据)
- 最终结果: 0
实际行为:
- 向量搜索: 0 结果
- Exact搜索: 3 结果 ✓ (总是运行)
- 最终结果: 3来自exact经过RRF
```
**设计建议**
1. **选项A推荐**: 添加纯向量模式标志
```python
backends = {}
if enable_vector and not pure_vector_mode:
backends["exact"] = True # 向量搜索的后备方案
elif not enable_vector:
backends["exact"] = True # 非向量模式总是启用exact
```
2. **选项B**: 文档明确说明当前行为
- "vector模式"实际是"vector+exact混合模式"
- 提供警告信息当向量搜索返回空时
---
## 2. 并行架构分析
### 2.1 双层并行设计
CodexLens采用了优秀的双层并行架构
**第一层:搜索方法级并行** (`HybridSearchEngine`)
```python
def _search_parallel(self, index_path, query, backends, limit):
with ThreadPoolExecutor(max_workers=len(backends)) as executor:
# 并行提交搜索任务
if backends.get("exact"):
future = executor.submit(self._search_exact, ...)
if backends.get("fuzzy"):
future = executor.submit(self._search_fuzzy, ...)
if backends.get("vector"):
future = executor.submit(self._search_vector, ...)
# 收集结果
for future in as_completed(future_to_source):
results = future.result()
```
**特点**
- 在**单个索引**内exact/fuzzy/vector三种搜索方法并行执行
- 使用`ThreadPoolExecutor`实现I/O密集型任务并行
- 使用`as_completed`实现结果流式收集
- 动态worker数量与启用的backend数量相同
**性能测试结果**
```
搜索模式 | 平均延迟 | 相对overhead
-----------|----------|-------------
Exact only | 5.6ms | 1.0x (基线)
Fuzzy only | 7.7ms | 1.4x
Vector only| 7.4ms | 1.3x
Hybrid (all)| 9.0ms | 1.6x
```
**分析**
- ✓ Hybrid模式开销合理(<2x),证明并行有效
- ✓ 单次搜索延迟仍保持在10ms以下,优秀
**第二层:索引级并行** (`ChainSearchEngine`)
```python
def _search_parallel(self, index_paths, query, options):
executor = self._get_executor(options.max_workers)
# 为每个索引提交搜索任务
future_to_path = {
executor.submit(
self._search_single_index,
idx_path, query, ...
): idx_path
for idx_path in index_paths
}
# 收集所有索引的结果
for future in as_completed(future_to_path):
results = future.result()
all_results.extend(results)
```
**特点**
- 跨**多个目录索引**并行搜索
- 共享线程池(避免线程创建开销)
- 可配置worker数量默认8
- 结果去重和RRF融合
### 2.2 并行效能评估
**优势**
1. ✓ **架构清晰**:双层并行职责明确,互不干扰
2. ✓ **资源利用**I/O密集型任务充分利用线程池
3. ✓ **扩展性**:易于添加新的搜索后端
4. ✓ **容错性**:单个后端失败不影响其他后端
**当前利用率**
- 单索引搜索:并行度 = min(3, 启用的backend数量)
- 多索引搜索:并行度 = min(8, 索引数量)
- **充分发挥**:只要有多个索引或多个backend即可受益
**潜在优化点**
1. **CPU密集型任务**向量相似度计算已使用numpy向量化无需额外并行
2. **缓存优化**`VectorStore`已实现embedding matrix缓存性能良好
3. **动态worker调度**当前固定worker数可根据任务负载动态调整
---
## 3. 解决方案与优化建议
### 3.1 立即修复:生成向量嵌入
**步骤1安装语义搜索依赖**
```bash
# 方式A完整安装
pip install codexlens[semantic]
# 方式B手动安装依赖
pip install fastembed numpy
```
**步骤2创建向量索引脚本**
保存为 `scripts/generate_embeddings.py`:
```python
"""Generate vector embeddings for existing indexes."""
import logging
import sqlite3
from pathlib import Path
from codexlens.semantic.embedder import Embedder
from codexlens.semantic.vector_store import VectorStore
from codexlens.semantic.chunker import Chunker, ChunkConfig
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def generate_embeddings_for_index(index_db_path: Path):
"""Generate embeddings for all files in an index."""
logger.info(f"Processing index: {index_db_path}")
# Initialize components
embedder = Embedder(profile="code") # Use code-optimized model
vector_store = VectorStore(index_db_path)
chunker = Chunker(config=ChunkConfig(max_chunk_size=2000))
# Read files from index
with sqlite3.connect(index_db_path) as conn:
conn.row_factory = sqlite3.Row
cursor = conn.execute("SELECT full_path, content, language FROM files")
files = cursor.fetchall()
logger.info(f"Found {len(files)} files to process")
# Process each file
total_chunks = 0
for file_row in files:
file_path = file_row["full_path"]
content = file_row["content"]
language = file_row["language"] or "python"
try:
# Create chunks
chunks = chunker.chunk_sliding_window(
content,
file_path=file_path,
language=language
)
if not chunks:
logger.debug(f"No chunks created for {file_path}")
continue
# Generate embeddings
for chunk in chunks:
embedding = embedder.embed_single(chunk.content)
chunk.embedding = embedding
# Store chunks
vector_store.add_chunks(chunks, file_path)
total_chunks += len(chunks)
logger.info(f"✓ {file_path}: {len(chunks)} chunks")
except Exception as exc:
logger.error(f"✗ {file_path}: {exc}")
logger.info(f"Completed: {total_chunks} total chunks indexed")
return total_chunks
def main():
import sys
if len(sys.argv) < 2:
print("Usage: python generate_embeddings.py <index_db_path>")
print("Example: python generate_embeddings.py ~/.codexlens/indexes/project/_index.db")
sys.exit(1)
index_path = Path(sys.argv[1])
if not index_path.exists():
print(f"Error: Index not found at {index_path}")
sys.exit(1)
generate_embeddings_for_index(index_path)
if __name__ == "__main__":
main()
```
**步骤3执行生成**
```bash
# 为特定项目生成嵌入
python scripts/generate_embeddings.py ~/.codexlens/indexes/codex-lens/_index.db
# 或使用find批量处理
find ~/.codexlens/indexes -name "_index.db" -type f | while read db; do
python scripts/generate_embeddings.py "$db"
done
```
**步骤4验证生成结果**
```bash
# 检查semantic_chunks表
sqlite3 ~/.codexlens/indexes/codex-lens/_index.db \
"SELECT COUNT(*) as chunk_count FROM semantic_chunks"
# 测试向量搜索
codexlens search "authentication user credentials" \
--path ~/projects/codex-lens \
--mode vector
```
### 3.2 短期优化:改进向量搜索语义
**问题**:当前"vector模式"实际包含exact搜索语义不清晰
**解决方案**:添加`pure_vector`参数
**实现** (修改 `hybrid_search.py`):
```python
class HybridSearchEngine:
def search(
self,
index_path: Path,
query: str,
limit: int = 20,
enable_fuzzy: bool = True,
enable_vector: bool = False,
pure_vector: bool = False, # 新增参数
) -> List[SearchResult]:
"""Execute hybrid search with parallel retrieval and RRF fusion.
Args:
...
pure_vector: If True, only use vector search (no FTS fallback)
"""
# Determine which backends to use
backends = {}
if pure_vector:
# 纯向量模式:只使用向量搜索
if enable_vector:
backends["vector"] = True
else:
# 混合模式总是包含exact搜索作为基线
backends["exact"] = True
if enable_fuzzy:
backends["fuzzy"] = True
if enable_vector:
backends["vector"] = True
# ... rest of the method
```
**CLI更新** (修改 `commands.py`):
```python
@app.command()
def search(
...
mode: str = typer.Option("exact", "--mode", "-m",
help="Search mode: exact, fuzzy, hybrid, vector, pure-vector."),
...
):
"""...
Search Modes:
- exact: Exact FTS
- fuzzy: Fuzzy FTS
- hybrid: RRF fusion of exact + fuzzy + vector (recommended)
- vector: Vector search with exact FTS fallback
- pure-vector: Pure semantic vector search (no FTS fallback)
"""
...
# Map mode to options
if mode == "exact":
hybrid_mode, enable_fuzzy, enable_vector, pure_vector = False, False, False, False
elif mode == "fuzzy":
hybrid_mode, enable_fuzzy, enable_vector, pure_vector = False, True, False, False
elif mode == "vector":
hybrid_mode, enable_fuzzy, enable_vector, pure_vector = True, False, True, False
elif mode == "pure-vector":
hybrid_mode, enable_fuzzy, enable_vector, pure_vector = True, False, True, True
elif mode == "hybrid":
hybrid_mode, enable_fuzzy, enable_vector, pure_vector = True, True, True, False
```
### 3.3 中期优化:增强向量搜索效果
**优化1改进分块策略**
当前使用简单的滑动窗口,可优化为:
```python
class HybridChunker(Chunker):
"""Hybrid chunking strategy combining symbol-based and sliding window."""
def chunk_hybrid(
self,
content: str,
symbols: List[Symbol],
file_path: str,
language: str,
) -> List[SemanticChunk]:
"""
1. 优先按symbol分块函数、类级别
2. 对过大symbol进一步使用滑动窗口
3. 对symbol间隙使用滑动窗口补充
"""
chunks = []
# Step 1: Symbol-based chunks
symbol_chunks = self.chunk_by_symbol(content, symbols, file_path, language)
# Step 2: Split oversized symbols
for chunk in symbol_chunks:
if chunk.token_count > self.config.max_chunk_size:
# 使用滑动窗口进一步分割
sub_chunks = self._split_large_chunk(chunk)
chunks.extend(sub_chunks)
else:
chunks.append(chunk)
# Step 3: Fill gaps with sliding window
gap_chunks = self._chunk_gaps(content, symbols, file_path, language)
chunks.extend(gap_chunks)
return chunks
```
**优化2添加查询扩展**
```python
class QueryExpander:
"""Expand queries for better vector search recall."""
def expand(self, query: str) -> str:
"""Expand query with synonyms and related terms."""
# 示例:代码领域同义词
expansions = {
"auth": ["authentication", "authorization", "login"],
"db": ["database", "storage", "repository"],
"api": ["endpoint", "route", "interface"],
}
terms = query.lower().split()
expanded = set(terms)
for term in terms:
if term in expansions:
expanded.update(expansions[term])
return " ".join(expanded)
```
**优化3混合检索策略**
```python
class AdaptiveHybridSearch:
"""Adaptive search strategy based on query type."""
def search(self, query: str, ...):
# 分析查询类型
query_type = self._classify_query(query)
if query_type == "keyword":
# 代码标识符查询 → 偏重FTS
weights = {"exact": 0.5, "fuzzy": 0.3, "vector": 0.2}
elif query_type == "semantic":
# 自然语言查询 → 偏重向量
weights = {"exact": 0.2, "fuzzy": 0.2, "vector": 0.6}
elif query_type == "hybrid":
# 混合查询 → 平衡权重
weights = {"exact": 0.4, "fuzzy": 0.3, "vector": 0.3}
return self.engine.search(query, weights=weights, ...)
```
### 3.4 长期优化:性能与质量提升
**优化1增量嵌入更新**
```python
class IncrementalEmbeddingUpdater:
"""Update embeddings incrementally for changed files."""
def update_for_file(self, file_path: str, new_content: str):
"""Only regenerate embeddings for changed file."""
# 1. 删除旧嵌入
self.vector_store.delete_file_chunks(file_path)
# 2. 生成新嵌入
chunks = self.chunker.chunk(new_content, ...)
for chunk in chunks:
chunk.embedding = self.embedder.embed_single(chunk.content)
# 3. 存储新嵌入
self.vector_store.add_chunks(chunks, file_path)
```
**优化2向量索引压缩**
```python
# 使用量化技术减少存储空间768维 → 192维
from qdrant_client import models
# 产品量化PQ压缩
compressed_vector = pq_quantize(embedding, target_dim=192)
```
**优化3向量搜索加速**
```python
# 使用FAISS或Hnswlib替代numpy暴力搜索
import faiss
class FAISSVectorStore(VectorStore):
def __init__(self, db_path, dim=768):
super().__init__(db_path)
# 使用HNSW索引
self.index = faiss.IndexHNSWFlat(dim, 32)
self._load_vectors_to_index()
def search_similar(self, query_embedding, top_k=10):
# FAISS加速搜索100x+
scores, indices = self.index.search(
np.array([query_embedding]), top_k
)
return self._fetch_by_indices(indices[0], scores[0])
```
---
## 4. 对比总结
### 4.1 搜索模式对比
| 维度 | Exact FTS | Fuzzy FTS | Vector Search | Hybrid (推荐) |
|------|-----------|-----------|---------------|--------------|
| **匹配类型** | 精确词匹配 | 容错匹配 | 语义相似 | 多模式融合 |
| **查询类型** | 标识符、关键词 | 拼写错误容忍 | 自然语言 | 所有类型 |
| **召回率** | 中 | 高 | 最高 | 最高 |
| **精确率** | 高 | 中 | 中 | 高 |
| **延迟** | 5-7ms | 7-9ms | 7-10ms | 9-11ms |
| **依赖** | 仅SQLite | 仅SQLite | fastembed+numpy | 全部 |
| **存储开销** | 小(FTS索引) | 小(FTS索引) | 大(向量) | 大(FTS+向量) |
| **适用场景** | 代码搜索 | 容错搜索 | 概念搜索 | 通用搜索 |
### 4.2 推荐使用策略
**场景1代码标识符搜索**(函数名、类名、变量名)
```bash
codexlens search "authenticate_user" --mode exact
```
→ 使用exact模式最快且最精确
**场景2概念性搜索**"如何验证用户身份"
```bash
codexlens search "how to verify user credentials" --mode hybrid
```
→ 使用hybrid模式结合语义和关键词
**场景3容错搜索**(允许拼写错误)
```bash
codexlens search "autheticate" --mode fuzzy
```
→ 使用fuzzy模式trigram容错
**场景4纯语义搜索**(需先生成嵌入)
```bash
codexlens search "password encryption with salt" --mode pure-vector
```
→ 使用pure-vector模式理解语义意图
---
## 5. 实施检查清单
### 立即行动项 (P0)
- [ ] 安装语义搜索依赖:`pip install codexlens[semantic]`
- [ ] 运行嵌入生成脚本见3.1节)
- [ ] 验证semantic_chunks表已创建且有数据
- [ ] 测试vector模式搜索是否返回结果
### 短期改进 (P1)
- [ ] 添加pure_vector参数见3.2节)
- [ ] 更新CLI支持pure-vector模式
- [ ] 添加嵌入生成进度提示
- [ ] 文档更新:搜索模式使用指南
### 中期优化 (P2)
- [ ] 实现混合分块策略见3.3节)
- [ ] 添加查询扩展功能
- [ ] 实现自适应权重调整
- [ ] 性能基准测试
### 长期规划 (P3)
- [ ] 增量嵌入更新机制
- [ ] 向量索引压缩
- [ ] 集成FAISS加速
- [ ] 多模态搜索(代码+文档)
---
## 6. 参考资源
### 代码文件
- 混合搜索引擎: `codex-lens/src/codexlens/search/hybrid_search.py`
- 向量存储: `codex-lens/src/codexlens/semantic/vector_store.py`
- 向量嵌入: `codex-lens/src/codexlens/semantic/embedder.py`
- 代码分块: `codex-lens/src/codexlens/semantic/chunker.py`
- 链式搜索: `codex-lens/src/codexlens/search/chain_search.py`
### 测试文件
- 对比测试: `codex-lens/tests/test_search_comparison.py`
- 混合搜索E2E: `codex-lens/tests/test_hybrid_search_e2e.py`
- CLI测试: `codex-lens/tests/test_cli_hybrid_search.py`
### 相关文档
- RRF算法: `codex-lens/src/codexlens/search/ranking.py`
- 查询解析: `codex-lens/src/codexlens/search/query_parser.py`
- 配置管理: `codex-lens/src/codexlens/config.py`
---
## 7. 结论
通过本次深入分析我们明确了CodexLens搜索系统的优势和待优化点
**优势**
1. ✓ 优秀的并行架构设计(双层并行)
2. ✓ RRF融合算法实现合理
3. ✓ 向量存储实现高效numpy向量化+缓存)
4. ✓ 模块化设计,易于扩展
**待优化**
1. 向量嵌入生成流程需要手动触发
2. "vector模式"语义不清晰实际包含exact搜索
3. 分块策略可以优化(混合策略)
4. 缺少增量更新机制
**核心建议**
1. **立即**: 生成向量嵌入,解决返回空结果问题
2. **短期**: 添加纯向量模式,澄清语义
3. **中期**: 优化分块和查询策略,提升搜索质量
4. **长期**: 性能优化和高级特性
通过实施这些改进CodexLens的搜索功能将达到生产级别的质量和性能标准。
---
**报告完成时间**: 2025-12-16
**分析工具**: 代码静态分析 + 实验测试 + 性能测评
**下一步**: 实施P0优先级改进项

File diff suppressed because it is too large Load Diff

View File

@@ -1,248 +0,0 @@
# T6: CLI Integration for Hybrid Search - Implementation Summary
## Overview
Successfully integrated hybrid search capabilities into the CodexLens CLI with user-configurable options, migration support, and enhanced status reporting.
## Changes Made
### 1. Search Command Enhancement (`commands.py`)
**New `--mode` Parameter:**
- Replaced `--hybrid` and `--exact-only` flags with unified `--mode` parameter
- Supported modes: `exact`, `fuzzy`, `hybrid`, `vector`
- Default: `exact` (backward compatible)
**Mode Validation:**
```python
valid_modes = ["exact", "fuzzy", "hybrid", "vector"]
if mode not in valid_modes:
    # Error with helpful message
```
**Weights Configuration:**
- Accepts custom RRF weights via `--weights exact,fuzzy,vector`
- Example: `--weights 0.5,0.3,0.2`
- Automatic normalization if weights don't sum to 1.0
- Validation for 3-value format
**Mode Mapping to SearchOptions:**
```python
hybrid_mode = mode == "hybrid"
enable_fuzzy = mode in ["fuzzy", "hybrid"]
options = SearchOptions(
hybrid_mode=hybrid_mode,
enable_fuzzy=enable_fuzzy,
hybrid_weights=hybrid_weights,
)
```
**Enhanced Output:**
- Shows search mode in status line
- Includes search source tags in verbose mode
- JSON output includes mode and source information
### 2. Migrate Command (`commands.py`)
**New Command for Dual-FTS Upgrade:**
```bash
codex-lens migrate [path]
```
**Features:**
- Upgrades all `_index.db` files to schema version 4
- Shows progress bar with percentage complete
- Tracks: migrated, already up-to-date, errors
- Safe operation preserving all data
- Verbose mode shows per-database migration details
**Progress Tracking:**
- Uses Rich progress bar with spinner
- Shows percentage and count (N/Total)
- Time elapsed indicator
### 3. Status Command Enhancement (`commands.py`)
**New Backend Status Display:**
```
Search Backends:
Exact FTS: ✓ (unicode61)
Fuzzy FTS: ✓ (trigram)
Hybrid Search: ✓ (RRF fusion)
Vector Search: ✗ (future)
```
**Schema Version Detection:**
- Checks first available `_index.db`
- Reports schema version
- Detects dual FTS table presence
**Feature Flags in JSON:**
```json
{
"features": {
"exact_fts": true,
"fuzzy_fts": true,
"hybrid_search": true,
"vector_search": false
}
}
```
### 4. Output Rendering (`output.py`)
**Verbose Mode Support:**
```python
render_search_results(results, verbose=True)
```
**Search Source Tags:**
- `[E]` - Exact FTS result
- `[F]` - Fuzzy FTS result
- `[V]` - Vector search result
- `[RRF]` - Fusion result
**Enhanced Table:**
- New "Source" column in verbose mode
- Shows result origin for debugging
- Fusion scores visible
## Usage Examples
### 1. Search with Different Modes
```bash
# Exact search (default)
codex-lens search "authentication"
# Fuzzy search only
codex-lens search "authentication" --mode fuzzy
# Hybrid search with RRF fusion
codex-lens search "authentication" --mode hybrid
# Hybrid with custom weights
codex-lens search "authentication" --mode hybrid --weights 0.5,0.3,0.2
# Verbose mode shows source tags
codex-lens search "authentication" --mode hybrid -v
```
### 2. Migration
```bash
# Migrate current project
codex-lens migrate
# Migrate specific project with verbose output
codex-lens migrate /path/to/project -v
# JSON output for automation
codex-lens migrate --json
```
### 3. Status Checking
```bash
# Check backend availability
codex-lens status
# JSON output with feature flags
codex-lens status --json
```
## Testing
**Test Coverage:**
- ✅ Mode parameter validation (exact, fuzzy, hybrid, vector)
- ✅ Weights parsing and normalization
- ✅ Help text shows all modes
- ✅ Migrate command exists and accessible
- ✅ Status command shows backends
- ✅ Mode mapping to SearchOptions
**Test Results:**
```
11 passed in 2.27s
```
## Integration Points
### With Phase 1 (Dual-FTS):
- Uses `search_fts_exact()` for exact mode
- Uses `search_fts_fuzzy()` for fuzzy mode
- Schema migration via `_apply_migrations()`
### With Phase 2 (Hybrid Search):
- Calls `HybridSearchEngine` for hybrid mode
- Passes custom weights to RRF algorithm
- Displays fusion scores and source tags
### With Existing CLI:
- Backward compatible (default mode=exact)
- Follows existing error handling patterns
- Uses Rich for progress and formatting
- Supports JSON output mode
## Done Criteria Verification
**CLI search --mode exact uses only exact FTS table**
- Mode validation ensures correct backend selection
- `hybrid_mode=False, enable_fuzzy=False` for exact mode
**--mode fuzzy uses only fuzzy table**
- `hybrid_mode=False, enable_fuzzy=True` for fuzzy mode
- Single backend execution
**--mode hybrid fuses both**
- `hybrid_mode=True, enable_fuzzy=True` activates RRF fusion
- HybridSearchEngine coordinates parallel search
**Custom weights via --weights 0.5,0.3,0.2**
- Parses 3-value comma-separated format
- Validates and normalizes to sum=1.0
- Passes to RRF algorithm
**Migration command completes Dual-FTS upgrade**
- Shows progress bar with percentage
- Tracks migration status per database
- Safe operation with error handling
**Search output shows [E], [F], [V] tags and fusion scores**
- Verbose mode displays Source column
- Tags extracted from `search_source` attribute
- Fusion scores shown in Score column
## Files Modified
1. `codex-lens/src/codexlens/cli/commands.py`
- Updated `search()` command with `--mode` parameter
- Added `migrate()` command
- Enhanced `status()` command
- Added DirIndexStore import
2. `codex-lens/src/codexlens/cli/output.py`
- Updated `render_search_results()` with verbose mode
- Added source tag display logic
3. `codex-lens/tests/test_cli_hybrid_search.py` (new)
- Comprehensive CLI integration tests
- Mode validation tests
- Weights parsing tests
- Command availability tests
## Performance Impact
- **Exact mode**: Same as before (no overhead)
- **Fuzzy mode**: Single FTS query (minimal overhead)
- **Hybrid mode**: Parallel execution (2x I/O, no sequential penalty)
- **Migration**: One-time operation, safe for large projects
## Next Steps
Users can now:
1. Run `codex-lens migrate` to upgrade existing indexes
2. Use `codex-lens search "query" --mode hybrid` for best results
3. Check `codex-lens status` to verify enabled features
4. Tune fusion weights for their use case via `--weights`

View File

@@ -1,459 +0,0 @@
MCP integration
mcp_servers
You can configure Codex to use MCP servers to give Codex access to external applications, resources, or services.
Server configuration
STDIO
STDIO servers are MCP servers that you can launch directly via commands on your computer.
# The top-level table name must be `mcp_servers`
# The sub-table name (`server-name` in this example) can be anything you would like.
[mcp_servers.server_name]
command = "npx"
# Optional
args = ["-y", "mcp-server"]
# Optional: propagate additional env vars to the MCP server.
# A default whitelist of env vars will be propagated to the MCP server.
# https://github.com/openai/codex/blob/main/codex-rs/rmcp-client/src/utils.rs#L82
env = { "API_KEY" = "value" }
# or
[mcp_servers.server_name.env]
API_KEY = "value"
# Optional: Additional list of environment variables that will be whitelisted in the MCP server's environment.
env_vars = ["API_KEY2"]
# Optional: cwd that the command will be run from
cwd = "/Users/<user>/code/my-server"
Streamable HTTP
Streamable HTTP servers enable Codex to talk to resources that are accessed via an HTTP URL (either on localhost or another domain).
[mcp_servers.figma]
url = "https://mcp.figma.com/mcp"
# Optional environment variable containing a bearer token to use for auth
bearer_token_env_var = "ENV_VAR"
# Optional map of headers with hard-coded values.
http_headers = { "HEADER_NAME" = "HEADER_VALUE" }
# Optional map of headers whose values will be replaced with the environment variable.
env_http_headers = { "HEADER_NAME" = "ENV_VAR" }
Streamable HTTP connections always use the experimental Rust MCP client under the hood, so expect occasional rough edges. OAuth login flows are gated on the rmcp_client = true flag:
[features]
rmcp_client = true
After enabling it, run codex mcp login <server-name> when the server supports OAuth.
Other configuration options
# Optional: override the default 10s startup timeout
startup_timeout_sec = 20
# Optional: override the default 60s per-tool timeout
tool_timeout_sec = 30
# Optional: disable a server without removing it
enabled = false
# Optional: only expose a subset of tools from this server
enabled_tools = ["search", "summarize"]
# Optional: hide specific tools (applied after `enabled_tools`, if set)
disabled_tools = ["search"]
When both enabled_tools and disabled_tools are specified, Codex first restricts the server to the allow-list and then removes any tools that appear in the deny-list.
MCP CLI commands
# List all available commands
codex mcp --help
# Add a server (env can be repeated; `--` separates the launcher command)
codex mcp add docs -- docs-server --port 4000
# List configured servers (pretty table or JSON)
codex mcp list
codex mcp list --json
# Show one server (table or JSON)
codex mcp get docs
codex mcp get docs --json
# Remove a server
codex mcp remove docs
# Log in to a streamable HTTP server that supports oauth
codex mcp login SERVER_NAME
# Log out from a streamable HTTP server that supports oauth
codex mcp logout SERVER_NAME
Examples of useful MCPs
There is an ever growing list of useful MCP servers that can be helpful while you are working with Codex.
Some of the most common MCPs we've seen are:
Context7 — connect to a wide range of up-to-date developer documentation
Figma Local and Remote - access to your Figma designs
Playwright - control and inspect a browser using Playwright
Chrome Developer Tools — control and inspect a Chrome browser
Sentry — access to your Sentry logs
GitHub — Control over your GitHub account beyond what git allows (like controlling PRs, issues, etc.)
# Example config.toml
Use this example configuration as a starting point. For an explanation of each field and additional context, see [Configuration](./config.md). Copy the snippet below to `~/.codex/config.toml` and adjust values as needed.
```toml
# Codex example configuration (config.toml)
#
# This file lists all keys Codex reads from config.toml, their default values,
# and concise explanations. Values here mirror the effective defaults compiled
# into the CLI. Adjust as needed.
#
# Notes
# - Root keys must appear before tables in TOML.
# - Optional keys that default to "unset" are shown commented out with notes.
# - MCP servers, profiles, and model providers are examples; remove or edit.
################################################################################
# Core Model Selection
################################################################################
# Primary model used by Codex. Default: "gpt-5.1-codex-max" on all platforms.
model = "gpt-5.1-codex-max"
# Model used by the /review feature (code reviews). Default: "gpt-5.1-codex-max".
review_model = "gpt-5.1-codex-max"
# Provider id selected from [model_providers]. Default: "openai".
model_provider = "openai"
# Optional manual model metadata. When unset, Codex auto-detects from model.
# Uncomment to force values.
# model_context_window = 128000 # tokens; default: auto for model
# model_auto_compact_token_limit = 0 # disable/override auto; default: model family specific
# tool_output_token_limit = 10000 # tokens stored per tool output; default: 10000 for gpt-5.1-codex-max
################################################################################
# Reasoning & Verbosity (Responses API capable models)
################################################################################
# Reasoning effort: minimal | low | medium | high | xhigh (default: medium; xhigh on gpt-5.1-codex-max and gpt-5.2)
model_reasoning_effort = "medium"
# Reasoning summary: auto | concise | detailed | none (default: auto)
model_reasoning_summary = "auto"
# Text verbosity for GPT-5 family (Responses API): low | medium | high (default: medium)
model_verbosity = "medium"
# Force-enable reasoning summaries for current model (default: false)
model_supports_reasoning_summaries = false
# Force reasoning summary format: none | experimental (default: none)
model_reasoning_summary_format = "none"
################################################################################
# Instruction Overrides
################################################################################
# Additional user instructions appended after AGENTS.md. Default: unset.
# developer_instructions = ""
# Optional legacy base instructions override (prefer AGENTS.md). Default: unset.
# instructions = ""
# Inline override for the history compaction prompt. Default: unset.
# compact_prompt = ""
# Override built-in base instructions with a file path. Default: unset.
# experimental_instructions_file = "/absolute/or/relative/path/to/instructions.txt"
# Load the compact prompt override from a file. Default: unset.
# experimental_compact_prompt_file = "/absolute/or/relative/path/to/compact_prompt.txt"
################################################################################
# Approval & Sandbox
################################################################################
# When to ask for command approval:
# - untrusted: only known-safe read-only commands auto-run; others prompt
# - on-failure: auto-run in sandbox; prompt only on failure for escalation
# - on-request: model decides when to ask (default)
# - never: never prompt (risky)
approval_policy = "on-request"
# Filesystem/network sandbox policy for tool calls:
# - read-only (default)
# - workspace-write
# - danger-full-access (no sandbox; extremely risky)
sandbox_mode = "read-only"
# Extra settings used only when sandbox_mode = "workspace-write".
[sandbox_workspace_write]
# Additional writable roots beyond the workspace (cwd). Default: []
writable_roots = []
# Allow outbound network access inside the sandbox. Default: false
network_access = false
# Exclude $TMPDIR from writable roots. Default: false
exclude_tmpdir_env_var = false
# Exclude /tmp from writable roots. Default: false
exclude_slash_tmp = false
################################################################################
# Shell Environment Policy for spawned processes
################################################################################
[shell_environment_policy]
# inherit: all (default) | core | none
inherit = "all"
# Skip default excludes for names containing KEY/TOKEN (case-insensitive). Default: false
ignore_default_excludes = false
# Case-insensitive glob patterns to remove (e.g., "AWS_*", "AZURE_*"). Default: []
exclude = []
# Explicit key/value overrides (always win). Default: {}
set = {}
# Whitelist; if non-empty, keep only matching vars. Default: []
include_only = []
# Experimental: run via user shell profile. Default: false
experimental_use_profile = false
################################################################################
# History & File Opener
################################################################################
[history]
# save-all (default) | none
persistence = "save-all"
# Maximum bytes for history file; oldest entries are trimmed when exceeded. Example: 5242880
# max_bytes = 0
# URI scheme for clickable citations: vscode (default) | vscode-insiders | windsurf | cursor | none
file_opener = "vscode"
################################################################################
# UI, Notifications, and Misc
################################################################################
[tui]
# Desktop notifications from the TUI: boolean or filtered list. Default: true
# Examples: false | ["agent-turn-complete", "approval-requested"]
notifications = false
# Enables welcome/status/spinner animations. Default: true
animations = true
# Suppress internal reasoning events from output. Default: false
hide_agent_reasoning = false
# Show raw reasoning content when available. Default: false
show_raw_agent_reasoning = false
# Disable burst-paste detection in the TUI. Default: false
disable_paste_burst = false
# Track Windows onboarding acknowledgement (Windows only). Default: false
windows_wsl_setup_acknowledged = false
# External notifier program (argv array). When unset: disabled.
# Example: notify = ["notify-send", "Codex"]
# notify = [ ]
# In-product notices (mostly set automatically by Codex).
[notice]
# hide_full_access_warning = true
# hide_rate_limit_model_nudge = true
################################################################################
# Authentication & Login
################################################################################
# Where to persist CLI login credentials: file (default) | keyring | auto
cli_auth_credentials_store = "file"
# Base URL for ChatGPT auth flow (not OpenAI API). Default:
chatgpt_base_url = "https://chatgpt.com/backend-api/"
# Restrict ChatGPT login to a specific workspace id. Default: unset.
# forced_chatgpt_workspace_id = ""
# Force login mechanism when Codex would normally auto-select. Default: unset.
# Allowed values: chatgpt | api
# forced_login_method = "chatgpt"
# Preferred store for MCP OAuth credentials: auto (default) | file | keyring
mcp_oauth_credentials_store = "auto"
################################################################################
# Project Documentation Controls
################################################################################
# Max bytes from AGENTS.md to embed into first-turn instructions. Default: 32768
project_doc_max_bytes = 32768
# Ordered fallbacks when AGENTS.md is missing at a directory level. Default: []
project_doc_fallback_filenames = []
################################################################################
# Tools (legacy toggles kept for compatibility)
################################################################################
[tools]
# Enable web search tool (alias: web_search_request). Default: false
web_search = false
# Enable the view_image tool so the agent can attach local images. Default: true
view_image = true
# (Alias accepted) You can also write:
# web_search_request = false
################################################################################
# Centralized Feature Flags (preferred)
################################################################################
[features]
# Leave this table empty to accept defaults. Set explicit booleans to opt in/out.
unified_exec = false
rmcp_client = false
apply_patch_freeform = false
view_image_tool = true
web_search_request = false
ghost_commit = false
enable_experimental_windows_sandbox = false
skills = false
################################################################################
# Experimental toggles (legacy; prefer [features])
################################################################################
# Include apply_patch via freeform editing path (affects default tool set). Default: false
experimental_use_freeform_apply_patch = false
# Define MCP servers under this table. Leave empty to disable.
[mcp_servers]
# --- Example: STDIO transport ---
# [mcp_servers.docs]
# command = "docs-server" # required
# args = ["--port", "4000"] # optional
# env = { "API_KEY" = "value" } # optional key/value pairs copied as-is
# env_vars = ["ANOTHER_SECRET"] # optional: forward these from the parent env
# cwd = "/path/to/server" # optional working directory override
# startup_timeout_sec = 10.0 # optional; default 10.0 seconds
# # startup_timeout_ms = 10000 # optional alias for startup timeout (milliseconds)
# tool_timeout_sec = 60.0 # optional; default 60.0 seconds
# enabled_tools = ["search", "summarize"] # optional allow-list
# disabled_tools = ["slow-tool"] # optional deny-list (applied after allow-list)
# --- Example: Streamable HTTP transport ---
# [mcp_servers.github]
# url = "https://github-mcp.example.com/mcp" # required
# bearer_token_env_var = "GITHUB_TOKEN" # optional; Authorization: Bearer <token>
# http_headers = { "X-Example" = "value" } # optional static headers
# env_http_headers = { "X-Auth" = "AUTH_ENV" } # optional headers populated from env vars
# startup_timeout_sec = 10.0 # optional
# tool_timeout_sec = 60.0 # optional
# enabled_tools = ["list_issues"] # optional allow-list
################################################################################
# Model Providers (extend/override built-ins)
################################################################################
# Built-ins include:
# - openai (Responses API; requires login or OPENAI_API_KEY via auth flow)
# - oss (Chat Completions API; defaults to http://localhost:11434/v1)
[model_providers]
# --- Example: override OpenAI with explicit base URL or headers ---
# [model_providers.openai]
# name = "OpenAI"
# base_url = "https://api.openai.com/v1" # default if unset
# wire_api = "responses" # "responses" | "chat" (default varies)
# # requires_openai_auth = true # built-in OpenAI defaults to true
# # request_max_retries = 4 # default 4; max 100
# # stream_max_retries = 5 # default 5; max 100
# # stream_idle_timeout_ms = 300000 # default 300_000 (5m)
# # experimental_bearer_token = "sk-example" # optional dev-only direct bearer token
# # http_headers = { "X-Example" = "value" }
# # env_http_headers = { "OpenAI-Organization" = "OPENAI_ORGANIZATION", "OpenAI-Project" = "OPENAI_PROJECT" }
# --- Example: Azure (Chat/Responses depending on endpoint) ---
# [model_providers.azure]
# name = "Azure"
# base_url = "https://YOUR_PROJECT_NAME.openai.azure.com/openai"
# wire_api = "responses" # or "chat" per endpoint
# query_params = { api-version = "2025-04-01-preview" }
# env_key = "AZURE_OPENAI_API_KEY"
# # env_key_instructions = "Set AZURE_OPENAI_API_KEY in your environment"
# --- Example: Local OSS (e.g., Ollama-compatible) ---
# [model_providers.ollama]
# name = "Ollama"
# base_url = "http://localhost:11434/v1"
# wire_api = "chat"
################################################################################
# Profiles (named presets)
################################################################################
# Active profile name. When unset, no profile is applied.
# profile = "default"
[profiles]
# [profiles.default]
# model = "gpt-5.1-codex-max"
# model_provider = "openai"
# approval_policy = "on-request"
# sandbox_mode = "read-only"
# model_reasoning_effort = "medium"
# model_reasoning_summary = "auto"
# model_verbosity = "medium"
# chatgpt_base_url = "https://chatgpt.com/backend-api/"
# experimental_compact_prompt_file = "compact_prompt.txt"
# include_apply_patch_tool = false
# experimental_use_freeform_apply_patch = false
# tools_web_search = false
# tools_view_image = true
# features = { unified_exec = false }
################################################################################
# Projects (trust levels)
################################################################################
# Mark specific worktrees as trusted. Only "trusted" is recognized.
[projects]
# [projects."/absolute/path/to/project"]
# trust_level = "trusted"
################################################################################
# OpenTelemetry (OTEL) disabled by default
################################################################################
[otel]
# Include user prompt text in logs. Default: false
log_user_prompt = false
# Environment label applied to telemetry. Default: "dev"
environment = "dev"
# Exporter: none (default) | otlp-http | otlp-grpc
exporter = "none"
# Example OTLP/HTTP exporter configuration
# [otel.exporter."otlp-http"]
# endpoint = "https://otel.example.com/v1/logs"
# protocol = "binary" # "binary" | "json"
# [otel.exporter."otlp-http".headers]
# "x-otlp-api-key" = "${OTLP_TOKEN}"
# Example OTLP/gRPC exporter configuration
# [otel.exporter."otlp-grpc"]
# endpoint = "https://otel.example.com:4317"
# headers = { "x-otlp-meta" = "abc123" }
# Example OTLP exporter with mutual TLS
# [otel.exporter."otlp-http"]
# endpoint = "https://otel.example.com/v1/logs"
# protocol = "binary"
# [otel.exporter."otlp-http".headers]
# "x-otlp-api-key" = "${OTLP_TOKEN}"
# [otel.exporter."otlp-http".tls]
# ca-certificate = "certs/otel-ca.pem"
# client-certificate = "/etc/codex/certs/client.pem"
# client-private-key = "/etc/codex/certs/client-key.pem"
```

View File

@@ -1,187 +0,0 @@
# Test Quality Enhancements - Implementation Summary
**Date**: 2025-12-16
**Status**: ✅ Complete - All 4 recommendations implemented and passing
## Overview
Implemented all 4 test quality recommendations from Gemini's comprehensive analysis to enhance test coverage and robustness across the codex-lens test suite.
## Recommendation 1: Verify True Fuzzy Matching ✅
**File**: `tests/test_dual_fts.py`
**Test Class**: `TestDualFTSPerformance`
**New Test**: `test_fuzzy_substring_matching`
### Implementation
- Verifies trigram tokenizer enables partial token matching
- Tests that searching for "func" matches "function0", "function1", etc.
- Gracefully skips if trigram tokenizer unavailable
- Validates BM25 scoring for fuzzy results
### Key Features
- Runtime detection of trigram support
- Validates substring matching capability
- Ensures proper score ordering (negative BM25)
### Test Result
```bash
PASSED tests/test_dual_fts.py::TestDualFTSPerformance::test_fuzzy_substring_matching
```
---
## Recommendation 2: Enable Mocked Vector Search ✅
**File**: `tests/test_hybrid_search_e2e.py`
**Test Class**: `TestHybridSearchWithVectorMock`
**New Test**: `test_hybrid_with_vector_enabled`
### Implementation
- Mocks vector search to return predefined results
- Tests RRF fusion with exact + fuzzy + vector sources
- Validates hybrid search handles vector integration correctly
- Uses `unittest.mock.patch` for clean mocking
### Key Features
- Mock SearchResult objects with scores
- Tests enable_vector=True parameter
- Validates RRF fusion score calculation (positive scores)
- Gracefully handles missing vector search module
### Test Result
```bash
PASSED tests/test_hybrid_search_e2e.py::TestHybridSearchWithVectorMock::test_hybrid_with_vector_enabled
```
---
## Recommendation 3: Complex Query Parser Stress Tests ✅
**File**: `tests/test_query_parser.py`
**Test Class**: `TestComplexBooleanQueries`
**New Tests**: 5 comprehensive tests
### Implementation
#### 1. `test_nested_boolean_and_or`
- Tests: `(login OR logout) AND user`
- Validates nested parentheses preservation
- Ensures boolean operators remain intact
#### 2. `test_mixed_operators_with_expansion`
- Tests: `UserAuth AND (login OR logout)`
- Verifies CamelCase expansion doesn't break operators
- Ensures expansion + boolean logic coexist
#### 3. `test_quoted_phrases_with_boolean`
- Tests: `"user authentication" AND login`
- Validates quoted phrase preservation
- Ensures AND operator survives
#### 4. `test_not_operator_preservation`
- Tests: `login NOT logout`
- Confirms NOT operator handling
- Validates negation logic
#### 5. `test_complex_nested_three_levels`
- Tests: `((UserAuth OR login) AND session) OR token`
- Stress tests deep nesting (3 levels)
- Validates multiple parentheses pairs
### Test Results
```bash
PASSED tests/test_query_parser.py::TestComplexBooleanQueries::test_nested_boolean_and_or
PASSED tests/test_query_parser.py::TestComplexBooleanQueries::test_mixed_operators_with_expansion
PASSED tests/test_query_parser.py::TestComplexBooleanQueries::test_quoted_phrases_with_boolean
PASSED tests/test_query_parser.py::TestComplexBooleanQueries::test_not_operator_preservation
PASSED tests/test_query_parser.py::TestComplexBooleanQueries::test_complex_nested_three_levels
```
---
## Recommendation 4: Migration Reversibility Tests ✅
**File**: `tests/test_dual_fts.py`
**Test Class**: `TestMigrationRecovery`
**New Tests**: 2 migration robustness tests
### Implementation
#### 1. `test_migration_preserves_data_on_failure`
- Creates v2 database with test data
- Attempts migration (may succeed or fail)
- Validates data preservation in both scenarios
- Smart column detection (path vs full_path)
**Key Features**:
- Checks schema version to determine column names
- Handles both migration success and failure
- Ensures no data loss
#### 2. `test_migration_idempotent_after_partial_failure`
- Tests retry capability after partial migration
- Validates graceful handling of repeated initialization
- Ensures database remains in usable state
**Key Features**:
- Double initialization without errors
- Table existence verification
- Safe retry mechanism
### Test Results
```bash
PASSED tests/test_dual_fts.py::TestMigrationRecovery::test_migration_preserves_data_on_failure
PASSED tests/test_dual_fts.py::TestMigrationRecovery::test_migration_idempotent_after_partial_failure
```
---
## Test Suite Statistics
### Overall Results
```
91 passed, 2 skipped, 2 warnings in 3.31s
```
### New Tests Added
- **Recommendation 1**: 1 test (fuzzy substring matching)
- **Recommendation 2**: 1 test (vector mock integration)
- **Recommendation 3**: 5 tests (complex boolean queries)
- **Recommendation 4**: 2 tests (migration recovery)
**Total New Tests**: 9
### Coverage Improvements
- **Fuzzy Search**: Now validates actual trigram substring matching
- **Hybrid Search**: Tests vector integration with mocks
- **Query Parser**: Handles complex nested boolean logic
- **Migration**: Validates data preservation and retry capability
---
## Code Quality
### Best Practices Applied
1. **Graceful Degradation**: Tests skip when features unavailable (trigram)
2. **Clean Mocking**: Uses `unittest.mock` for vector search
3. **Smart Assertions**: Adapts to migration outcomes dynamically
4. **Edge Case Handling**: Tests multiple nesting levels and operators
### Integration
- All tests integrate seamlessly with existing pytest fixtures
- Maintains 100% pass rate across test suite
- No breaking changes to existing tests
---
## Validation
All 4 recommendations successfully implemented and verified:
**Recommendation 1**: Fuzzy substring matching with trigram validation
**Recommendation 2**: Vector search mocking for hybrid fusion testing
**Recommendation 3**: Complex boolean query stress tests (5 tests)
**Recommendation 4**: Migration recovery and idempotency tests (2 tests)
**Final Status**: Production-ready, all tests passing

View File

@@ -1,156 +0,0 @@
"""Demo script for association tree building.
This script demonstrates how to use the AssociationTreeBuilder and
ResultDeduplicator to explore code relationships via LSP call hierarchy.
"""
import asyncio
import sys
from pathlib import Path
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from codexlens.lsp.standalone_manager import StandaloneLspManager
from codexlens.search.association_tree import (
AssociationTreeBuilder,
ResultDeduplicator,
)
async def demo_simple_tree(
    seed_line: int = 50,
    seed_character: int = 1,
    max_depth: int = 3,
    max_results: int = 20,
) -> None:
    """Build a simple call tree from a Python file and print a summary.

    Uses this demo script itself as the analysis target, seeding the LSP
    call-hierarchy exploration at ``seed_line``/``seed_character``, then
    deduplicates and scores the discovered nodes.

    Args:
        seed_line: 1-based line in this file to seed the tree from.
            Defaults to 50, an arbitrary example location — adjust to point
            at a real symbol in this file for best results.
        seed_character: Column of the seed position.
        max_depth: Maximum call-hierarchy depth to expand in each direction.
        max_results: Maximum number of deduplicated nodes to keep.
    """
    print("=" * 70)
    print("Association Tree Demo")
    print("=" * 70)
    print()
    # Use this file as the test subject
    test_file = Path(__file__).resolve()
    workspace_root = test_file.parent.parent
    print(f"Workspace: {workspace_root}")
    print(f"Test file: {test_file.name}")
    print()
    # Initialize LSP manager (async context manager handles startup/shutdown)
    async with StandaloneLspManager(
        workspace_root=str(workspace_root),
        timeout=10.0,
    ) as lsp:
        print("LSP manager initialized")
        print()
        # Create tree builder
        builder = AssociationTreeBuilder(lsp, timeout=5.0)
        # Build tree from the seed position in this file; the message now
        # reflects the actual seed_line instead of a hard-coded ":50".
        print(f"Building call tree from {test_file.name}:{seed_line}...")
        tree = await builder.build_tree(
            seed_file_path=str(test_file),
            seed_line=seed_line,
            seed_character=seed_character,
            max_depth=max_depth,
            expand_callers=True,
            expand_callees=True,
        )
        print(f"Tree built: {tree}")
        print(f" Roots: {len(tree.roots)}")
        print(f" Total unique nodes: {len(tree.all_nodes)}")
        print(f" Total node instances: {len(tree.node_list)}")
        print(f" Edges: {len(tree.edges)}")
        print()
        if tree.roots:
            print("Root nodes:")
            for root in tree.roots:
                print(f" - {root.item.name} ({root.item.kind})")
                print(f" {root.item.file_path}:{root.item.range.start_line}")
            print()
        # Deduplicate and score — weights sum to 1.0 across the three factors
        print("Deduplicating and scoring nodes...")
        deduplicator = ResultDeduplicator(
            depth_weight=0.4,
            frequency_weight=0.3,
            kind_weight=0.3,
        )
        unique_nodes = deduplicator.deduplicate(tree, max_results=max_results)
        print(f"Found {len(unique_nodes)} unique nodes")
        print()
        if unique_nodes:
            print("Top 10 nodes by score:")
            print("-" * 70)
            for i, node in enumerate(unique_nodes[:10], 1):
                print(f"{i:2}. {node.name} ({node.kind})")
                print(f" Location: {Path(node.file_path).name}:{node.range.start_line}")
                print(
                    f" Depth: {node.min_depth}, "
                    f"Occurrences: {node.occurrences}, "
                    f"Score: {node.score:.3f}"
                )
                if node.paths:
                    print(f" Paths: {len(node.paths)}")
                print()
            # Show filtering capabilities
            functions = deduplicator.filter_by_kind(
                unique_nodes, ["function", "method"]
            )
            print(f"Functions/methods only: {len(functions)} nodes")
            if functions:
                print("Top 5 functions:")
                for i, node in enumerate(functions[:5], 1):
                    print(f" {i}. {node.name} (score: {node.score:.3f})")
        else:
            print("No nodes found. Try a different seed location.")
    print()
    print("Demo complete!")
async def demo_cycle_detection():
    """Demonstrate how the tree builder handles circular call chains.

    No LSP session is started here; the demo just prints an example of
    mutually recursive functions and explains that cycle nodes are
    marked so expansion terminates.
    """
    banner = "=" * 70
    print()
    print(banner)
    print("Cycle Detection Demo")
    print(banner)
    print()
    # Example snippet with a deliberate A -> B -> A call cycle.
    sample = '''
def func_a():
    """Function A calls B."""
    func_b()
def func_b():
    """Function B calls A (creates a cycle)."""
    func_a()
'''
    print("This demo would detect cycles in:")
    print(sample)
    print("The tree builder automatically marks cycle nodes to prevent infinite expansion.")
def main():
    """Run both demos, reporting interrupts and unexpected errors.

    Bug fix: ``demo_cycle_detection()`` is an async function; the
    original called it without awaiting, so it never executed — it only
    produced an un-awaited coroutine (and a RuntimeWarning). Both demos
    now run sequentially inside a single event loop.
    """
    async def _run_all():
        # Execute both demos in order on one event loop.
        await demo_simple_tree()
        await demo_cycle_detection()

    try:
        asyncio.run(_run_all())
    except KeyboardInterrupt:
        print("\nDemo interrupted by user")
    except Exception as e:
        # Surface the full traceback so demo failures are debuggable.
        print(f"\nError running demo: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()

View File

@@ -1,40 +0,0 @@
"""Debug URI format issues."""
import asyncio
from pathlib import Path
from urllib.parse import quote
def test_uri_formats():
    """Print the same file path rendered in several ``file://`` URI styles.

    Returns the ``Path.as_uri()`` form and the lowercase-drive form so a
    caller can compare them programmatically.
    """
    target = Path("D:/Claude_dms3/codex-lens/test_simple_function.py")
    print("URI Format Comparison")
    print("=" * 80)

    resolved = target.resolve()
    backslash = chr(92)
    forward = str(resolved).replace(backslash, '/')

    # Method 1: stdlib Path.as_uri()
    uri1 = resolved.as_uri()
    print(f"1. Path.as_uri(): {uri1}")

    # Method 2: manual construction with forward slashes
    uri2 = f"file:///{forward}"
    print(f"2. Manual (forward /): {uri2}")

    # Method 3: percent-encoded path (':' and '/' kept literal)
    uri3 = f"file:///{quote(forward, safe='/:')}"
    print(f"3. With quote: {uri3}")

    # Method 4: lowercase the drive letter, as some servers emit
    drive_lower = forward
    if len(drive_lower) > 1 and drive_lower[1] == ':':
        drive_lower = drive_lower[0].lower() + drive_lower[1:]
    uri4 = f"file:///{drive_lower}"
    print(f"4. Lowercase drive: {uri4}")

    # What Pyright shows in its own logs (percent-encoded drive colon)
    print(f"\n5. Pyright log format: file:///d%3A/Claude_dms3/codex-lens/...")
    return uri1, uri4


if __name__ == "__main__":
    test_uri_formats()

View File

@@ -1,326 +0,0 @@
"""Search method comparison benchmark.
Compares different search strategies:
1. Pure FTS (exact + fuzzy matching)
2. Pure Vector (semantic search only)
3. Hybrid Fusion (FTS + Vector with RRF)
4. Vector + LSP Association Tree (new strategy)
Usage:
python examples/search_comparison_benchmark.py
"""
from __future__ import annotations
import asyncio
import time
from pathlib import Path
from typing import List, Dict, Any
from codexlens.config import Config
from codexlens.entities import SearchResult
from codexlens.search.hybrid_search import HybridSearchEngine
from codexlens.lsp.standalone_manager import StandaloneLspManager
from codexlens.search.association_tree import AssociationTreeBuilder, ResultDeduplicator
class SearchBenchmark:
    """Benchmark different search strategies.

    Compares pure FTS, pure vector, hybrid RRF fusion, and a
    vector-seeded LSP association-tree strategy over the same index.
    Methods 1-3 are synchronous calls into HybridSearchEngine; method 4
    additionally requires setup_lsp()/cleanup_lsp() around its use.
    """
    def __init__(self, index_path: Path, config: Config):
        """Initialize benchmark.

        Args:
            index_path: Path to _index.db file
            config: CodexLens config
        """
        self.index_path = index_path
        self.config = config
        self.engine = HybridSearchEngine(config=config)
        # Populated lazily by setup_lsp(); remain None until then.
        self.lsp_manager: StandaloneLspManager | None = None
        self.tree_builder: AssociationTreeBuilder | None = None
        # Scoring weights used when merging tree nodes (depth/frequency/kind).
        self.deduplicator = ResultDeduplicator(
            depth_weight=0.4,
            frequency_weight=0.3,
            kind_weight=0.3,
            max_depth_penalty=10,
        )
    async def setup_lsp(self):
        """Setup LSP manager for association tree search.

        Must be awaited before method4_vector_lsp_tree() is usable.
        """
        # Workspace root is the directory that contains the index file.
        self.lsp_manager = StandaloneLspManager(
            workspace_root=str(self.index_path.parent),
            timeout=5.0,
        )
        await self.lsp_manager.start()
        self.tree_builder = AssociationTreeBuilder(
            lsp_manager=self.lsp_manager,
            timeout=5.0,
        )
    async def cleanup_lsp(self):
        """Cleanup LSP manager (safe to call even if setup never ran)."""
        if self.lsp_manager:
            await self.lsp_manager.stop()
    def method1_pure_fts(self, query: str, limit: int = 20) -> tuple[List[SearchResult], float]:
        """Method 1: Pure FTS (exact + fuzzy).

        Returns:
            Tuple of (results, elapsed_seconds).
        """
        start = time.perf_counter()
        results = self.engine.search(
            index_path=self.index_path,
            query=query,
            limit=limit,
            enable_fuzzy=True,
            enable_vector=False,
            pure_vector=False,
        )
        elapsed = time.perf_counter() - start
        return results, elapsed
    def method2_pure_vector(self, query: str, limit: int = 20) -> tuple[List[SearchResult], float]:
        """Method 2: Pure Vector (semantic search only).

        Returns:
            Tuple of (results, elapsed_seconds).
        """
        start = time.perf_counter()
        results = self.engine.search(
            index_path=self.index_path,
            query=query,
            limit=limit,
            enable_fuzzy=False,
            enable_vector=True,
            pure_vector=True,
        )
        elapsed = time.perf_counter() - start
        return results, elapsed
    def method3_hybrid_fusion(self, query: str, limit: int = 20) -> tuple[List[SearchResult], float]:
        """Method 3: Hybrid Fusion (FTS + Vector with RRF).

        Returns:
            Tuple of (results, elapsed_seconds).
        """
        start = time.perf_counter()
        results = self.engine.search(
            index_path=self.index_path,
            query=query,
            limit=limit,
            enable_fuzzy=True,
            enable_vector=True,
            pure_vector=False,
        )
        elapsed = time.perf_counter() - start
        return results, elapsed
    async def method4_vector_lsp_tree(
        self,
        query: str,
        limit: int = 20,
        max_depth: int = 3,
        expand_callers: bool = True,
        expand_callees: bool = True,
    ) -> tuple[List[SearchResult], float, Dict[str, Any]]:
        """Method 4: Vector + LSP Association Tree (new strategy).

        Steps:
        1. Vector search to find seed results (top 5-10)
        2. For each seed, build LSP association tree
        3. Deduplicate and score all discovered nodes
        4. Return top N results

        Args:
            query: Search query
            limit: Final result limit
            max_depth: Maximum depth for LSP tree expansion
            expand_callers: Whether to expand incoming calls
            expand_callees: Whether to expand outgoing calls

        Returns:
            Tuple of (results, elapsed_time, stats)

        Raises:
            RuntimeError: If setup_lsp() was not awaited first.
        """
        if not self.tree_builder:
            raise RuntimeError("LSP not initialized. Call setup_lsp() first.")
        start = time.perf_counter()
        # Diagnostics reported alongside the results (see print_results).
        stats = {
            "seed_count": 0,
            "trees_built": 0,
            "total_tree_nodes": 0,
            "unique_nodes": 0,
            "dedup_time_ms": 0,
        }
        # Step 1: Get seed results from vector search (top 10)
        seed_results = self.engine.search(
            index_path=self.index_path,
            query=query,
            limit=10,
            enable_fuzzy=False,
            enable_vector=True,
            pure_vector=True,
        )
        stats["seed_count"] = len(seed_results)
        if not seed_results:
            return [], time.perf_counter() - start, stats
        # Step 2: Build association trees for each seed
        all_trees = []
        for seed in seed_results:
            try:
                tree = await self.tree_builder.build_tree(
                    seed_file_path=seed.path,
                    seed_line=seed.start_line or 1,
                    seed_character=1,
                    max_depth=max_depth,
                    expand_callers=expand_callers,
                    expand_callees=expand_callees,
                )
                if tree.node_list:
                    all_trees.append(tree)
                    stats["trees_built"] += 1
                    stats["total_tree_nodes"] += len(tree.node_list)
            except Exception as e:
                # Best-effort: one failing seed must not abort the benchmark.
                print(f"Error building tree for {seed.path}:{seed.start_line}: {e}")
                continue
        if not all_trees:
            # Fallback to seed results if no trees built
            return seed_results[:limit], time.perf_counter() - start, stats
        # Step 3: Merge and deduplicate all trees
        dedup_start = time.perf_counter()
        # Merge all node_lists into a single CallTree
        from codexlens.search.association_tree.data_structures import CallTree
        merged_tree = CallTree()
        for tree in all_trees:
            merged_tree.node_list.extend(tree.node_list)
        # Deduplicate
        unique_nodes = self.deduplicator.deduplicate(
            tree=merged_tree,
            max_results=limit,
        )
        stats["unique_nodes"] = len(unique_nodes)
        stats["dedup_time_ms"] = (time.perf_counter() - dedup_start) * 1000
        # Step 4: Convert UniqueNode to SearchResult
        results = []
        for node in unique_nodes:
            # Use node.score as the search score
            result = SearchResult(
                path=node.file_path,
                score=node.score,
                start_line=node.range.start_line,
                end_line=node.range.end_line,
                symbol_name=node.name,
                symbol_kind=node.kind,
                content="",  # LSP doesn't provide content
                metadata={"search_source": "lsp_tree"},
            )
            results.append(result)
        elapsed = time.perf_counter() - start
        return results, elapsed, stats
    def print_results(self, method_name: str, results: List[SearchResult], elapsed: float, stats: Dict[str, Any] | None = None):
        """Print benchmark results.

        Args:
            method_name: Human-readable label for the strategy.
            results: Ranked search results to summarize (top 5 shown).
            elapsed: Wall-clock seconds the strategy took.
            stats: Optional extra diagnostics (used by method 4).
        """
        print(f"\n{'='*80}")
        print(f"Method: {method_name}")
        print(f"{'='*80}")
        print(f"Time: {elapsed*1000:.2f}ms")
        print(f"Results: {len(results)}")
        if stats:
            print(f"\nStats:")
            for key, value in stats.items():
                print(f" {key}: {value}")
        print(f"\nTop 5 Results:")
        for i, result in enumerate(results[:5], 1):
            print(f"{i}. [{result.score:.4f}] {result.path}:{result.start_line}")
            if result.symbol_name:
                print(f" Name: {result.symbol_name}")
            if result.metadata.get("search_source"):
                print(f" Source: {result.metadata.get('search_source')}")
    async def run_comparison(self, query: str, limit: int = 20):
        """Run all four strategies for a single query and print a summary."""
        print(f"\n{'#'*80}")
        print(f"Query: {query}")
        print(f"{'#'*80}")
        # Method 1: Pure FTS
        results1, time1 = self.method1_pure_fts(query, limit)
        self.print_results("Method 1: Pure FTS", results1, time1)
        # Method 2: Pure Vector
        results2, time2 = self.method2_pure_vector(query, limit)
        self.print_results("Method 2: Pure Vector", results2, time2)
        # Method 3: Hybrid Fusion
        results3, time3 = self.method3_hybrid_fusion(query, limit)
        self.print_results("Method 3: Hybrid Fusion (FTS+Vector)", results3, time3)
        # Method 4: Vector + LSP Tree (requires LSP setup)
        results4 = None
        time4 = 0.0
        try:
            results4, time4, stats4 = await self.method4_vector_lsp_tree(query, limit, max_depth=3)
            self.print_results("Method 4: Vector + LSP Association Tree", results4, time4, stats4)
        except Exception as e:
            print(f"\nMethod 4: Vector + LSP Association Tree")
            print(f"Error: {e}")
        # Comparison summary
        print(f"\n{'='*80}")
        print(f"Summary")
        print(f"{'='*80}")
        print(f"Method 1 (FTS): {time1*1000:8.2f}ms {len(results1):3d} results")
        print(f"Method 2 (Vector): {time2*1000:8.2f}ms {len(results2):3d} results")
        print(f"Method 3 (Hybrid): {time3*1000:8.2f}ms {len(results3):3d} results")
        if results4 is not None:
            print(f"Method 4 (Vector+LSP): {time4*1000:8.2f}ms {len(results4):3d} results")
async def main():
    """Main benchmark entry point.

    Locates the on-disk index under ~/.codexlens, sets up the LSP
    manager (Method 4 is simply skipped if that fails), runs every
    benchmark query, and always tears the LSP manager down.

    Fix: removed the unused local ``project_root`` that was assigned
    but never read.
    """
    # Index lives in the central CodexLens home, keyed by source path.
    import os
    codexlens_home = Path(os.path.expanduser("~/.codexlens"))
    index_path = codexlens_home / "indexes/D/Claude_dms3/codex-lens/src/codexlens/_index.db"
    if not index_path.exists():
        print(f"Error: Index not found at {index_path}")
        print("Please run: python -m codexlens index init src")
        return
    config = Config()
    benchmark = SearchBenchmark(index_path, config)
    # Test queries
    queries = [
        "vector search implementation",
        "LSP call hierarchy",
        "search result ranking",
        "index building",
    ]
    # Setup LSP for Method 4
    print("Setting up LSP manager...")
    try:
        await benchmark.setup_lsp()
        print("LSP manager ready")
    except Exception as e:
        print(f"Warning: Could not setup LSP: {e}")
        print("Method 4 will be skipped")
    try:
        # Run benchmarks
        for query in queries:
            await benchmark.run_comparison(query, limit=20)
    finally:
        # Cleanup always runs, even if a query raises.
        await benchmark.cleanup_lsp()
    print("\nBenchmark complete")


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -1,123 +0,0 @@
"""Simple search method comparison using CLI commands.
Compares:
1. FTS (Full-Text Search)
2. Semantic (Dense + Rerank)
3. Hybrid (Future: FTS + Semantic fusion)
Usage:
python examples/simple_search_comparison.py
"""
import subprocess
import time
import json
import re
import os
from pathlib import Path
def strip_ansi(text: str) -> str:
    """Remove ANSI color codes from text.

    Strips SGR sequences of the form ESC [ <params> m; other escape
    sequences are left untouched.
    """
    return re.sub(r'\x1b\[[0-9;]*m', '', text)
def run_search(query: str, method: str, limit: int = 20) -> tuple[list, float]:
    """Invoke the codexlens CLI search and time the whole subprocess.

    Returns (results, elapsed_seconds); results is an empty list when
    the CLI exits non-zero or its output cannot be parsed as JSON.
    """
    cmd = [
        "python", "-m", "codexlens", "search",
        query,
        "--method", method,
        "--limit", str(limit),
        "--json",
        "-p", "."
    ]
    started = time.perf_counter()
    proc = subprocess.run(
        cmd,
        cwd=str(Path("D:/Claude_dms3/codex-lens/src")),
        capture_output=True,
        text=True,
        env={**os.environ, "NO_COLOR": "1"},  # Try to disable colors
    )
    elapsed = time.perf_counter() - started

    if proc.returncode != 0:
        print(f"Error running {method} search:")
        print(proc.stderr[:200])
        return [], elapsed

    try:
        # The CLI may still emit ANSI codes; strip them before parsing.
        payload = json.loads(strip_ansi(proc.stdout))
        # Newer CLI versions nest hits under a top-level "result" object.
        if "result" in payload and "results" in payload["result"]:
            return payload["result"]["results"], elapsed
        return payload.get("results", []), elapsed
    except json.JSONDecodeError as e:
        print(f"Failed to parse JSON output for {method}: {e}")
        return [], elapsed
def print_comparison(query: str):
    """Benchmark FTS vs semantic search for one query and print both sections plus a summary."""
    bar = "=" * 80
    print(f"\n{bar}")
    print(f"Query: {query}")
    print(f"{bar}\n")

    src_prefix = "D:\\Claude_dms3\\codex-lens\\src\\"

    def _section(label: str, method: str):
        # Shared renderer: label first (so CLI errors print under it),
        # then timing, count, and the top three hits.
        print(label)
        hits, elapsed = run_search(query, method, 20)
        print(f" Time: {elapsed*1000:.2f}ms")
        print(f" Results: {len(hits)}")
        if hits:
            print(f" Top 3:")
            for rank, hit in enumerate(hits[:3], 1):
                path = hit.get("path", "").replace(src_prefix, "")
                score = hit.get("score", 0)
                print(f" {rank}. [{score:.4f}] {path}")
        print()
        return hits, elapsed

    results_fts, time_fts = _section("Method 1: FTS (Full-Text Search)", "fts")
    results_semantic, time_semantic = _section("Method 2: Semantic (Dense + Rerank)", "dense_rerank")

    print(f"Summary:")
    print(f" FTS: {time_fts*1000:8.2f}ms {len(results_fts):3d} results")
    print(f" Semantic: {time_semantic*1000:8.2f}ms {len(results_semantic):3d} results")
    print(f" Speedup: {time_semantic/time_fts:6.2f}x (FTS faster)")
def main():
    """Run the comparison across a fixed set of sample queries."""
    sample_queries = (
        "vector search",
        "LSP call hierarchy",
        "search ranking",
        "index building",
    )
    banner = "=" * 80
    print("Search Method Comparison")
    print(banner)
    for q in sample_queries:
        print_comparison(q)
    print(f"\n{banner}")
    print("Comparison complete")
    print(banner)


if __name__ == "__main__":
    main()

View File

@@ -1,79 +0,0 @@
"""Test LSP server capabilities."""
import asyncio
import json
from pathlib import Path
from codexlens.lsp.standalone_manager import StandaloneLspManager
async def test_capabilities():
    """Probe a running Pyright server and report which LSP capabilities it advertises."""
    workspace_root = Path("D:/Claude_dms3/codex-lens/src")
    print("Testing LSP Capabilities")
    print("=" * 80)

    # The standalone manager owns the Pyright subprocess lifecycle.
    manager = StandaloneLspManager(
        workspace_root=str(workspace_root),
        timeout=10.0,
    )
    try:
        print("\n1. Starting LSP manager...")
        await manager.start()
        print(" [OK] LSP manager started")

        print("\n2. Getting Python server state...")
        probe_file = str(workspace_root / "codexlens/search/hybrid_search.py")
        server_state = await manager._get_server(probe_file)
        if not server_state:
            print(" [ERROR] Could not get server state!")
            return
        print(f" [OK] Server state obtained")
        print(f" Initialized: {server_state.initialized}")

        print("\n3. Server Capabilities:")
        print("-" * 80)
        advertised = server_state.capabilities

        # Capabilities that matter most for call-hierarchy based search.
        interesting = (
            "callHierarchyProvider",
            "definitionProvider",
            "referencesProvider",
            "documentSymbolProvider",
            "workspaceSymbolProvider",
            "hoverProvider",
            "completionProvider",
            "signatureHelpProvider",
        )
        for capability in interesting:
            value = advertised.get(capability)
            if value:
                status = "[YES]"
            else:
                status = "[NO]"
            print(f" {status} {capability}: {value}")

        print("\n4. Full capabilities (formatted):")
        print("-" * 80)
        print(json.dumps(advertised, indent=2))
    except Exception as e:
        print(f"\n[ERROR] Error: {e}")
        import traceback
        traceback.print_exc()
    finally:
        # Always shut the subprocess down, even after errors or early return.
        print("\n5. Cleaning up...")
        await manager.stop()
        print(" [OK] LSP manager stopped")


if __name__ == "__main__":
    asyncio.run(test_capabilities())

Some files were not shown because too many files have changed in this diff Show More